Converted assembly to use gcc instead of nasm, updated html man

author lindahl <lindahl>

Wed, 20 Jun 2001 10:33:59 +0000 (10:33 +0000)

committer lindahl <lindahl>

Wed, 20 Jun 2001 10:33:59 +0000 (10:33 +0000)
author lindahl <lindahl>
Wed, 20 Jun 2001 10:33:59 +0000 (10:33 +0000)
committer lindahl <lindahl>
Wed, 20 Jun 2001 10:33:59 +0000 (10:33 +0000)
diff --git a/.cvsignore b/.cvsignore

index 0c4286672f52dbc6ca7430f9231136989ee6ce46..4ba32f88dc776c38d3da57a65b7a4456dd3a17b3 100644 (file)
--- a/.cvsignore
+++ b/.cvsignore
@@ -3,3 +3,6 @@ lib
  obj
  Makefile.in
  include
+config.log
+config.cache
+libtool
diff --git a/Makefile.am b/Makefile.am

index 4fe038cd3f54cfe03fe064ce1e13f6282bd7f16e..388cfb34f6f475dcd664cfcf520fb5fa77310b04 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -11,10 +11,13 @@ SUBDIRS = src top html tutor man
  
  #
  # Most files in the config subdir is included automatically when
-# we issue "make dist", but not depcomp. This just includes that file...
-#
-EXTRA_DIST = config/depcomp
+# we issue "make dist", but some versions of automake seem to have
+# problems with it, so we include them all...
  
+EXTRA_DIST = config/depcomp    config/ltconfig         config/ltcf-c.sh   \
+             config/ltcf-f77.sh config/config.guess    config/config.sub  \
+            config/install-sh  config/missing          config/ltmain.sh   \
+             config/mkinstalldirs
  
  #
  # This is a shortcut to construct the mdrun executable by first
@@ -29,10 +32,15 @@ mdrun:
  install-mdrun:
         (cd ${top_builddir}/src/kernel && $(MAKE) install-mdrun)
  
+links:
+       (cd /usr/local/bin && $(LN_S) ${bindir}/* .)
+
  #
  # Apart from normal things like .o, things matching this are removed
  # (The second one removes files beginning with a #)
-CLEANFILES = *~ \\\#*
+CLEANFILES = *~ \\\#* 
+DISTCLEANFILES = libtool config.cache config.log
+
  
  
  
diff --git a/acinclude.m4 b/acinclude.m4

index 486ef35574b1038ab903aefb4557c3ee295571ec..5d299cdd1a3fca90a32ad6a32f6a7a87c7df4fe7 100644 (file)
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -31,7 +31,7 @@ AC_MSG_RESULT(no))
  
  
  if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
  AC_TRY_COMPILE([#include <$fftwname.h>],
  [int _array_ [1 - 2 * !((sizeof(fftw_real)) == $2)]; ],
  ok="yes",ok="no")
@@ -46,29 +46,40 @@ if test "$ok" != "yes"; then
  [
  AC_MSG_RESULT(no)
  AC_MSG_ERROR([Cannot find any $prec precision $fftwname.h or $xfftwname.h]
-[Do you have $prec precision FFTW installed? You can find it at www.fftw.org] 
-[Note that the default FFTW setup is double precision. You change the]
-[FFTW configuration to single with --enable-float and turn on MPI support]
-[with --enable-mpi. It is a good idea to install both single & double.] 
+[Do you have $prec precision FFTW installed? If you are using packages,]
+[note that you also need fftw-devel to compile GROMACS. You can find the ]
+[software at www.fftw.org, and detailed instructions at www.gromacs.org.]
+[If you compiled FFTW yourself:                                        ]
+[Note that the default FFTW setup is double precision. Change the FFTW]
+[configuration to single with --enable-float. If you want MPI support,]
+[use --enable-mpi. It is a good idea to install both single & double.] 
  [If your sysadm doesn't want to install it you can do it to a location]
-[in your home directory and provide Gromacs configure with the correct]
-[paths by setting the CPPFLAGS and LDFLAGS environment variables.]
-[Check the Gromacs INSTALL file for additional information.])
+[in your home directory and provide the correct paths in the CPPFLAGS]
+[and LDFLAGS environment variables before running configure.]
+[That is also necessary to do if your compiler doesn't search]
+[/usr/local/include and /usr/local/lib by default.]
+[You can find information at www.gromacs.org, or in the INSTALL file.])
  ])
  AC_TRY_COMPILE([#include <$xfftwname.h>],[int _array_ [1 - 2 * !((sizeof(fftw_real)) == $2)];],
  [
  fftwname=$xfftwname 
  usedprefix=$fftwcheckprefix
  ],
+[
  AC_MSG_ERROR([Cannot find any $prec precision $fftwname.h or $xfftwname.h]
-[Do you have $prec precision FFTW installed? You can find it at www.fftw.org] 
-[Note that the default FFTW setup is double precision. You change the]
-[FFTW configuration to single with --enable-float and turn on MPI support]
-[with --enable-mpi. It is a good idea to install both single & double.] 
+[Do you have $prec precision FFTW installed? If you are using packages,]
+[note that you also need fftw-devel to compile GROMACS. You can find the ]
+[software at www.fftw.org, and detailed instructions at www.gromacs.org.]
+[If you compiled FFTW yourself:                                       ]
+[Note that the default FFTW setup is double precision. Change the FFTW]
+[configuration to single with --enable-float. If you want MPI support,]
+[use --enable-mpi. It is a good idea to install both single & double.] 
  [If your sysadm doesn't want to install it you can do it to a location]
-[in your home directory and provide Gromacs configure with the correct]
-[paths by setting the CPPFLAGS and LDFLAGS environment variables.]
-[Check the Gromacs INSTALL file for additional information.]))
+[in your home directory and provide the correct paths in the CPPFLAGS]
+[and LDFLAGS environment variables before running configure.]
+[That is also necessary to do if your compiler doesn't search]
+[/usr/local/include and /usr/local/lib by default.]
+[You can find information at www.gromacs.org, or in the INSTALL file.])])
  fi
  
  AC_CHECK_LIB($fftwname,main,,AC_MSG_ERROR([Can't find a library to match the $fftwname header]))
@@ -227,11 +238,11 @@ then
  fi
  
  AC_ARG_WITH(motif-includes,
-[  --with-motif-includes=DIR    Motif include files are in DIR],
+[  --with-motif-includes=DIR     Motif include files are in DIR],
  motif_includes="$withval")
  
  AC_ARG_WITH(motif-libraries,
-[  --with-motif-libraries=DIR   Motif libraries are in DIR],
+[  --with-motif-libraries=DIR    Motif libraries are in DIR],
  motif_libraries="$withval")
  
  AC_MSG_CHECKING(for Motif)
@@ -537,6 +548,8 @@ AC_REQUIRE([AC_CANONICAL_HOST])
  # determine our suggested choices for both C and fortran, and then possibly
  # override them with user choices.
  
+cc_vendor="unknown"
+
  case "${host_cpu}-${host_os}" in
  
    *-solaris2*) 
@@ -707,6 +720,7 @@ case "${host_cpu}-${host_os}" in
      esac
      if $CC -V 2>  /dev/null | grep Compaq > /dev/null 2>&1; then
        xCFLAGS="$tmpCFLAGS"
+      cc_vendor="Compaq"
      fi
      if test "$enable_fortran" = "yes"; then
        if $F77 -V 2>  /dev/null | grep Compaq > /dev/null 2>&1; then
@@ -730,7 +744,7 @@ case "${host_cpu}-${host_os}" in
        xCFLAGS="$pgiopt -fast -Minfo=loop -pc 32"
      fi
      if test "$enable_fortran" = "yes"; then
-      if $F77 -V 2>  /dev/null | grep Portland /dev/null 2>&1; then
+      if $F77 -V 2>  /dev/null | grep Portland > /dev/null 2>&1; then
         xFFLAGS="$xCFLAGS -Mneginfo=loop"
        fi       
      fi
@@ -754,7 +768,9 @@ if test $enable_fortran = yes; then
  fi
  
  CPU_FLAGS=""
+
  if test "$GCC" = "yes"; then
+  AM_CONDITIONAL(GNU_CC,true)
    # try to guess correct CPU flags, at least for linux
    case "${host_cpu}" in
      # i586/i686 cpu flags don't improve speed, thus no need to use them.
@@ -775,6 +791,8 @@ if test "$GCC" = "yes"; then
         ACX_CHECK_CC_FLAGS(-mpowerpc,m_powerpc,CPU_FLAGS=-mpowerpc)
        fi
     esac
+else
+  AM_CONDITIONAL(GNU_CC,false)
  fi
  
  if test -n "$CPU_FLAGS"; then
@@ -795,7 +813,7 @@ if test "$ac_test_CFLAGS" != "set"; then
      echo "*******************************************************************"
      echo "* WARNING: No special optimization settings found for the C       *"
      echo "* compiler. Use  make CFLAGS=..., or edit the top level Makefile. *"
-    echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it)  *"
+    echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it!)*"
      echo "*******************************************************************"
      CFLAGS="-O3"
    fi
@@ -815,12 +833,11 @@ fi
  if test "$enable_fortran" = "yes"; then        
    if test "$ac_test_FFLAGS" != "set"; then
      FFLAGS="$xFFLAGS"
-    
      if test -z "$FFLAGS"; then
        echo "*******************************************************************"
        echo "* WARNING: No special optimization settings found for the fortran *"
        echo "* compiler. Use  make FFLAGS=..., or edit the top level Makefile. *"
-      echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it) *"
+      echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it!)*"
        echo "*******************************************************************"
        FFLAGS="-O3"
      fi
@@ -837,10 +854,898 @@ if test "$enable_fortran" = "yes"; then
      echo "******************************************"
    fi
  fi
+  
+])
+
+
+
+
+
+
+## libtool.m4 - Configure libtool for the host system. -*-Shell-script-*-
+## Copyright 1996, 1997, 1998, 1999, 2000, 2001
+## Free Software Foundation, Inc.
+## Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+## General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+##
+## As a special exception to the GNU General Public License, if you
+## distribute this file as part of a program that contains a
+## configuration script generated by Autoconf, you may include it under
+## the same distribution terms that you use for the rest of that program.
+
+# serial 46 AC_PROG_LIBTOOL
+AC_DEFUN([AC_PROG_LIBTOOL],
+[AC_REQUIRE([_AC_PROG_LIBTOOL])dnl
+dnl If AC_PROG_CXX has already been expanded, run AC_LIBTOOL_CXX
+dnl immediately, otherwise, hook it in at the end of AC_PROG_CXX.
+  AC_PROVIDE_IFELSE([AC_PROG_CXX],
+    [AC_LIBTOOL_CXX],
+    [define([AC_PROG_CXX], defn([AC_PROG_CXX])[AC_LIBTOOL_CXX
+])])
+  AC_PROVIDE_IFELSE([AC_PROG_F77],
+    [AC_LIBTOOL_F77],
+    [define([AC_PROG_F77], defn([AC_PROG_F77])[AC_LIBTOOL_F77
+])])
+
+dnl Quote A][M_PROG_GCJ so that aclocal doesn't bring it in needlessly.
+dnl If either AC_PROG_GCJ or A][M_PROG_GCJ have already been expanded, run
+dnl AC_LIBTOOL_GCJ immediately, otherwise, hook it in at the end of both.
+  AC_PROVIDE_IFELSE([AC_PROG_GCJ],
+    [AC_LIBTOOL_GCJ],
+    [AC_PROVIDE_IFELSE([A][M_PROG_GCJ],
+        [AC_LIBTOOL_GCJ],
+       [AC_PROVIDE_IFELSE([LT_AC_PROG_GCJ],
+         [AC_LIBTOOL_GCJ],
+       [ifdef([AC_PROG_GCJ],
+              [define([AC_PROG_GCJ], defn([AC_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])
+        ifdef([A][M_PROG_GCJ],
+              [define([A][M_PROG_GCJ], defn([A][M_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])
+        ifdef([LT_AC_PROG_GCJ],
+              [define([LT_AC_PROG_GCJ], defn([LT_AC_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])])])])])
+
+AC_DEFUN([_AC_PROG_LIBTOOL],
+[AC_REQUIRE([AC_LIBTOOL_SETUP])dnl
+AC_BEFORE([$0],[AC_LIBTOOL_CXX])dnl
+AC_BEFORE([$0],[AC_LIBTOOL_GCJ])dnl
+
+# Save cache, so that ltconfig can load it
+AC_CACHE_SAVE
+
+# Actually configure libtool.  ac_aux_dir is where install-sh is found.
+AR="$AR" LTCC="$CC" CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig --no-reexec \
+$libtool_flags --no-verify --build="$build" $ac_aux_dir/ltmain.sh $host \
+|| AC_MSG_ERROR([libtool configure failed])
+
+# Reload the cache, which may have been modified by ltconfig
+AC_CACHE_LOAD
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltconfig $ac_aux_dir/ltmain.sh $ac_aux_dir/ltcf-c.sh"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+AC_SUBST(LIBTOOL)dnl
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+AC_DEFUN([AC_LIBTOOL_SETUP],
+[AC_PREREQ(2.13)dnl
+AC_REQUIRE([AC_ENABLE_SHARED])dnl
+AC_REQUIRE([AC_ENABLE_STATIC])dnl
+AC_REQUIRE([AC_ENABLE_FAST_INSTALL])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_PROG_LD])dnl
+AC_REQUIRE([AC_PROG_LD_RELOAD_FLAG])dnl
+AC_REQUIRE([AC_PROG_NM])dnl
+AC_REQUIRE([AC_PROG_LN_S])dnl
+AC_REQUIRE([AC_DEPLIBS_CHECK_METHOD])dnl
+# Autoconf 2.13's AC_OBJEXT and AC_EXEEXT macros only work for C compilers!
+AC_REQUIRE([AC_OBJEXT])dnl
+AC_REQUIRE([AC_EXEEXT])dnl
+dnl
+
+# Only perform the check for the file program if the check method requires it
+case $deplibs_check_method in
+file_magic*)
+  if test "$file_magic_cmd" = '$MAGIC_CMD'; then
+    AC_PATH_MAGIC
+  fi
+  ;;
+esac
+
+AC_CHECK_TOOL(RANLIB, ranlib, :)
+AC_CHECK_TOOL(STRIP, strip, :)
+
+# Check for any special flags to pass to ltconfig.
+libtool_flags="--cache-file=$cache_file"
+test "$enable_shared" = no && libtool_flags="$libtool_flags --disable-shared"
+test "$enable_static" = no && libtool_flags="$libtool_flags --disable-static"
+test "$enable_fast_install" = no && libtool_flags="$libtool_flags --disable-fast-install"
+test "$GCC" = yes && libtool_flags="$libtool_flags --with-gcc"
+test "$lt_cv_prog_gnu_ld" = yes && libtool_flags="$libtool_flags --with-gnu-ld"
+ifdef([AC_PROVIDE_AC_LIBTOOL_DLOPEN],
+[libtool_flags="$libtool_flags --enable-dlopen"])
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[libtool_flags="$libtool_flags --enable-win32-dll"])
+AC_ARG_ENABLE(libtool-lock,
+  [  --disable-libtool-lock        avoid locking (might break parallel builds)])
+test "x$enable_libtool_lock" = xno && libtool_flags="$libtool_flags --disable-lock"
+test x"$silent" = xyes && libtool_flags="$libtool_flags --silent"
+
+AC_ARG_WITH(pic,
+  [  --with-pic                    try to use only PIC/non-PIC [default=both]],
+     pic_mode="$withval", pic_mode=default)
+test x"$pic_mode" = xyes && libtool_flags="$libtool_flags --prefer-pic"
+test x"$pic_mode" = xno && libtool_flags="$libtool_flags --prefer-non-pic"
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case $host in
+*-*-irix6*)
+  # Find out which ABI we are using.
+  echo '[#]line __oline__ "configure"' > conftest.$ac_ext
+  if AC_TRY_EVAL(ac_compile); then
+    case `/usr/bin/file conftest.$ac_objext` in
+    *32-bit*)
+      LD="${LD-ld} -32"
+      ;;
+    *N32*)
+      LD="${LD-ld} -n32"
+      ;;
+    *64-bit*)
+      LD="${LD-ld} -64"
+      ;;
+    esac
+  fi
+  rm -rf conftest*
+  ;;
+
+*-*-sco3.2v5*)
+  # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+  SAVE_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -belf"
+  AC_CACHE_CHECK([whether the C compiler needs -belf], lt_cv_cc_needs_belf,
+    [AC_LANG_SAVE
+     AC_LANG_C
+     AC_TRY_LINK([],[],[lt_cv_cc_needs_belf=yes],[lt_cv_cc_needs_belf=no])
+     AC_LANG_RESTORE])
+  if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+    # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+    CFLAGS="$SAVE_CFLAGS"
+  fi
+  ;;
+
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[*-*-cygwin* | *-*-mingw* | *-*-pw32*)
+  AC_CHECK_TOOL(DLLTOOL, dlltool, false)
+  AC_CHECK_TOOL(AS, as, false)
+  AC_CHECK_TOOL(OBJDUMP, objdump, false)
+
+  # recent cygwin and mingw systems supply a stub DllMain which the user
+  # can override, but on older systems we have to supply one
+  AC_CACHE_CHECK([if libtool should supply DllMain function], lt_cv_need_dllmain,
+    [AC_TRY_LINK([],
+      [extern int __attribute__((__stdcall__)) DllMain(void*, int, void*);
+      DllMain (0, 0, 0);],
+      [lt_cv_need_dllmain=no],[lt_cv_need_dllmain=yes])])
+
+  case $host/$CC in
+  *-*-cygwin*/gcc*-mno-cygwin*|*-*-mingw*)
+    # old mingw systems require "-dll" to link a DLL, while more recent ones
+    # require "-mdll"
+    SAVE_CFLAGS="$CFLAGS"
+    CFLAGS="$CFLAGS -mdll"
+    AC_CACHE_CHECK([how to link DLLs], lt_cv_cc_dll_switch,
+      [AC_TRY_LINK([], [], [lt_cv_cc_dll_switch=-mdll],[lt_cv_cc_dll_switch=-dll])])
+    CFLAGS="$SAVE_CFLAGS" ;;
+  *-*-cygwin* | *-*-pw32*)
+    # cygwin systems need to pass --dll to the linker, and not link
+    # crt.o which will require a WinMain@16 definition.
+    lt_cv_cc_dll_switch="-Wl,--dll -nostartfiles" ;;
+  esac
+  ;;
+  ])
+esac
+])
+
+# AC_LIBTOOL_DLOPEN - enable checks for dlopen support
+AC_DEFUN([AC_LIBTOOL_DLOPEN], [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])])
+
+# AC_LIBTOOL_WIN32_DLL - declare package support for building win32 dll's
+AC_DEFUN([AC_LIBTOOL_WIN32_DLL], [AC_BEFORE([$0], [AC_LIBTOOL_SETUP])])
+
+# AC_ENABLE_SHARED - implement the --enable-shared flag
+# Usage: AC_ENABLE_SHARED[(DEFAULT)]
+#   Where DEFAULT is either `yes' or `no'.  If omitted, it defaults to
+#   `yes'.
+AC_DEFUN([AC_ENABLE_SHARED],
+[define([AC_ENABLE_SHARED_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(shared,
+changequote(<<, >>)dnl
+<<  --enable-shared[=PKGS]        build shared libraries [default=>>AC_ENABLE_SHARED_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)
+  enable_shared=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_shared=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac],
+enable_shared=AC_ENABLE_SHARED_DEFAULT)dnl
+])
+
+# AC_DISABLE_SHARED - set the default shared flag to --disable-shared
+AC_DEFUN([AC_DISABLE_SHARED], [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_SHARED(no)])
+
+# AC_ENABLE_STATIC - implement the --enable-static flag
+# Usage: AC_ENABLE_STATIC[(DEFAULT)]
+#   Where DEFAULT is either `yes' or `no'.  If omitted, it defaults to
+#   `yes'.
+AC_DEFUN([AC_ENABLE_STATIC],
+[define([AC_ENABLE_STATIC_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(static,
+changequote(<<, >>)dnl
+<<  --enable-static[=PKGS]        build static libraries [default=>>AC_ENABLE_STATIC_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)
+  enable_static=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_static=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac],
+enable_static=AC_ENABLE_STATIC_DEFAULT)dnl
+])
+
+# AC_DISABLE_STATIC - set the default static flag to --disable-static
+AC_DEFUN([AC_DISABLE_STATIC],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_STATIC(no)])
+
+
+# AC_ENABLE_FAST_INSTALL - implement the --enable-fast-install flag
+# Usage: AC_ENABLE_FAST_INSTALL[(DEFAULT)]
+#   Where DEFAULT is either `yes' or `no'.  If omitted, it defaults to
+#   `yes'.
+AC_DEFUN([AC_ENABLE_FAST_INSTALL],
+[define([AC_ENABLE_FAST_INSTALL_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(fast-install,
+changequote(<<, >>)dnl
+<<  --enable-fast-install[=PKGS]  optimize for fast installation [default=>>AC_ENABLE_FAST_INSTALL_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_fast_install=yes ;;
+no) enable_fast_install=no ;;
+*)
+  enable_fast_install=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_fast_install=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac],
+enable_fast_install=AC_ENABLE_FAST_INSTALL_DEFAULT)dnl
+])
+
+# AC_DISABLE_FAST_INSTALL - set the default to --disable-fast-install
+AC_DEFUN([AC_DISABLE_FAST_INSTALL],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_FAST_INSTALL(no)])
+
+# AC_LIBTOOL_PICMODE - implement the --with-pic flag
+# Usage: AC_LIBTOOL_PICMODE[(MODE)]
+#   Where MODE is either `yes' or `no'.  If omitted, it defaults to
+#   `both'.
+AC_DEFUN([AC_LIBTOOL_PICMODE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+pic_mode=ifelse($#,1,$1,default)])
+
+
+# AC_PATH_TOOL_PREFIX - find a file program which can recognise a shared library
+AC_DEFUN([AC_PATH_TOOL_PREFIX],
+[AC_MSG_CHECKING([for $1])
+AC_CACHE_VAL(lt_cv_path_MAGIC_CMD,
+[case $MAGIC_CMD in
+  /*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+  ;;
+  ?:/*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a dos path.
+  ;;
+  *)
+  ac_save_MAGIC_CMD="$MAGIC_CMD"
+  IFS="${IFS=   }"; ac_save_ifs="$IFS"; IFS=":"
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word.  This closes a longstanding sh security hole.
+  ac_dummy="ifelse([$2], , $PATH, [$2])"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$1; then
+      lt_cv_path_MAGIC_CMD="$ac_dir/$1"
+      if test -n "$file_magic_test_file"; then
+       case $deplibs_check_method in
+       "file_magic "*)
+         file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+         MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+         if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+           egrep "$file_magic_regex" > /dev/null; then
+           :
+         else
+           cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such.  This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem.  Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+         fi ;;
+       esac
+      fi
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  MAGIC_CMD="$ac_save_MAGIC_CMD"
+  ;;
+esac])
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+  AC_MSG_RESULT($MAGIC_CMD)
+else
+  AC_MSG_RESULT(no)
+fi
  ])
  
  
+# AC_PATH_MAGIC - find a file program which can recognise a shared library
+AC_DEFUN([AC_PATH_MAGIC],
+[AC_REQUIRE([AC_CHECK_TOOL_PREFIX])dnl
+AC_PATH_TOOL_PREFIX(${ac_tool_prefix}file, /usr/bin:$PATH)
+if test -z "$lt_cv_path_MAGIC_CMD"; then
+  if test -n "$ac_tool_prefix"; then
+    AC_PATH_TOOL_PREFIX(file, /usr/bin:$PATH)
+  else
+    MAGIC_CMD=:
+  fi
+fi
+])
  
  
+# AC_PROG_LD - find the path to the GNU or non-GNU linker
+AC_DEFUN([AC_PROG_LD],
+[AC_ARG_WITH(gnu-ld,
+[  --with-gnu-ld                 assume the C compiler uses GNU ld [default=no]],
+test "$withval" = no || with_gnu_ld=yes, with_gnu_ld=no)
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+ac_prog=ld
+if test "$GCC" = yes; then
+  # Check if gcc -print-prog-name=ld gives a path.
+  AC_MSG_CHECKING([for ld used by GCC])
+  case $host in
+  *-*-mingw*)
+    # gcc leaves a trailing carriage return which upsets mingw
+    ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+  *)
+    ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+  esac
+  case $ac_prog in
+    # Accept absolute paths.
+    [[\\/]* | [A-Za-z]:[\\/]*)]
+      re_direlt=['/[^/][^/]*/\.\./']
+      # Canonicalize the path of ld
+      ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+      while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+       ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+      done
+      test -z "$LD" && LD="$ac_prog"
+      ;;
+  "")
+    # If it fails, then pretend we aren't using GCC.
+    ac_prog=ld
+    ;;
+  *)
+    # If it is relative, then search for the first ld in PATH.
+    with_gnu_ld=unknown
+    ;;
+  esac
+elif test "$with_gnu_ld" = yes; then
+  AC_MSG_CHECKING([for GNU ld])
+else
+  AC_MSG_CHECKING([for non-GNU ld])
+fi
+AC_CACHE_VAL(lt_cv_path_LD,
+[if test -z "$LD"; then
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+  for ac_dir in $PATH; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+      lt_cv_path_LD="$ac_dir/$ac_prog"
+      # Check to see if the program is GNU ld.  I'd rather use --version,
+      # but apparently some GNU ld's only accept -v.
+      # Break only if it was the GNU/non-GNU ld that we prefer.
+      if "$lt_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+       test "$with_gnu_ld" != no && break
+      else
+       test "$with_gnu_ld" != yes && break
+      fi
+    fi
+  done
+  IFS="$ac_save_ifs"
+else
+  lt_cv_path_LD="$LD" # Let the user override the test with a path.
+fi])
+LD="$lt_cv_path_LD"
+if test -n "$LD"; then
+  AC_MSG_RESULT($LD)
+else
+  AC_MSG_RESULT(no)
+fi
+test -z "$LD" && AC_MSG_ERROR([no acceptable ld found in \$PATH])
+AC_PROG_LD_GNU
+])
  
+AC_DEFUN([AC_PROG_LD_GNU],
+[AC_CACHE_CHECK([if the linker ($LD) is GNU ld], lt_cv_prog_gnu_ld,
+[# I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+  lt_cv_prog_gnu_ld=yes
+else
+  lt_cv_prog_gnu_ld=no
+fi])
+with_gnu_ld=$lt_cv_prog_gnu_ld
+])
  
+# AC_PROG_LD_RELOAD_FLAG - find reload flag for linker
+#   -- PORTME Some linkers may need a different reload flag.
+AC_DEFUN([AC_PROG_LD_RELOAD_FLAG],
+[AC_CACHE_CHECK([for $LD option to reload object files], lt_cv_ld_reload_flag,
+[lt_cv_ld_reload_flag='-r'])
+reload_flag=$lt_cv_ld_reload_flag
+test -n "$reload_flag" && reload_flag=" $reload_flag"
+])
+
+# AC_DEPLIBS_CHECK_METHOD - how to check for library dependencies
+#  -- PORTME fill in with the dynamic library characteristics
+AC_DEFUN([AC_DEPLIBS_CHECK_METHOD],
+[AC_CACHE_CHECK([how to recognise dependant libraries],
+lt_cv_deplibs_check_method,
+[lt_cv_file_magic_cmd='$MAGIC_CMD'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [regex]' -- check by looking for files in library path
+# which respond to the $file_magic_cmd with a given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case $host_os in
+aix*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+beos*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+bsdi4*)
+  lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)']
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  ;;
+
+cygwin* | mingw* |pw32*)
+  lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+  lt_cv_file_magic_cmd='$OBJDUMP -f'
+  ;;
+
+darwin* | rhapsody*)
+  lt_cv_deplibs_check_method='file_magic Mach-O dynamically linked shared library'
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  case "$host_os" in
+  rhapsody* | darwin1.[012])
+    lt_cv_file_magic_test_file='/System/Library/Frameworks/System.framework/System'
+    ;;
+  *) # Darwin 1.3 on
+    lt_cv_file_magic_test_file='/usr/lib/libSystem.dylib'
+    ;;
+  esac
+  ;;
+
+freebsd* )
+  if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+    case $host_cpu in
+    i*86 )
+      # Not sure whether the presence of OpenBSD here was a mistake.
+      # Let's accept both of them until this is cleared up.
+      lt_cv_deplibs_check_method=['file_magic (FreeBSD|OpenBSD)/i[3-9]86 (compact )?demand paged shared library']
+      lt_cv_file_magic_cmd=/usr/bin/file
+      lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+      ;;
+    esac
+  else
+    lt_cv_deplibs_check_method=pass_all
+  fi
+  ;;
+
+gnu*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+hpux10.20*|hpux11*)
+  lt_cv_deplibs_check_method=['file_magic (s[0-9][0-9][0-9]|PA-RISC[0-9].[0-9]) shared library']
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libc.sl
+  ;;
+
+irix5* | irix6*)
+  case $host_os in
+  irix5*)
+    # this will be overridden with pass_all, but let us keep it just in case
+    lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+    ;;
+  *)
+    case $LD in
+    *-32|*"-32 ") libmagic=32-bit;;
+    *-n32|*"-n32 ") libmagic=N32;;
+    *-64|*"-64 ") libmagic=64-bit;;
+    *) libmagic=never-match;;
+    esac
+    # this will be overridden with pass_all, but let us keep it just in case
+    lt_cv_deplibs_check_method=["file_magic ELF ${libmagic} MSB mips-[1234] dynamic lib MIPS - version 1"]
+    ;;
+  esac
+  lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+  case $host_cpu in
+  alpha* | hppa* | i*86 | powerpc* | sparc* | ia64* )
+    lt_cv_deplibs_check_method=pass_all ;;
+  *)
+    # glibc up to 2.1.1 does not perform some relocations on ARM
+    lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )'] ;;
+  esac
+  lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+  ;;
+
+netbsd*)
+  if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+    [lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so\.[0-9]+\.[0-9]+$']
+  else
+    [lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so$']
+  fi
+  ;;
+
+newsos6)
+  [lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (executable|dynamic lib)']
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libnls.so
+  ;;
+
+osf3* | osf4* | osf5*)
+  # this will be overridden with pass_all, but let us keep it just in case
+  lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sco3.2v5*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+solaris*)
+  lt_cv_deplibs_check_method=pass_all
+  lt_cv_file_magic_test_file=/lib/libc.so
+  ;;
+
+[sysv5uw[78]* | sysv4*uw2*)]
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+  case $host_vendor in
+  ncr)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  motorola)
+    lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]']
+    lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+    ;;
+  esac
+  ;;
+esac
+])
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+])
+
+
+# AC_PROG_NM - find the path to a BSD-compatible name lister
+AC_DEFUN([AC_PROG_NM],
+[AC_MSG_CHECKING([for BSD-compatible nm])
+AC_CACHE_VAL(lt_cv_path_NM,
+[if test -n "$NM"; then
+  # Let the user override the test.
+  lt_cv_path_NM="$NM"
+else
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+  for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do
+    test -z "$ac_dir" && ac_dir=.
+    tmp_nm=$ac_dir/${ac_tool_prefix}nm
+    if test -f $tmp_nm || test -f $tmp_nm$ac_exeext ; then
+      # Check to see if the nm accepts a BSD-compat flag.
+      # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+      #   nm: unknown option "B" ignored
+      # Tru64's nm complains that /dev/null is an invalid object file
+      if ($tmp_nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep '(/dev/null|Invalid file or object type)' >/dev/null; then
+       lt_cv_path_NM="$tmp_nm -B"
+       break
+      elif ($tmp_nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+       lt_cv_path_NM="$tmp_nm -p"
+       break
+      else
+       lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
+       continue # so that we can try to find one that supports BSD flags
+      fi
+    fi
+  done
+  IFS="$ac_save_ifs"
+  test -z "$lt_cv_path_NM" && lt_cv_path_NM=nm
+fi])
+NM="$lt_cv_path_NM"
+AC_MSG_RESULT([$NM])
+])
+
+# AC_CHECK_LIBM - check for math library
+AC_DEFUN([AC_CHECK_LIBM],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+LIBM=
+case $host in
+*-*-beos* | *-*-cygwin* | *-*-pw32*)
+  # These systems don't have libm
+  ;;
+*-ncr-sysv4.3*)
+  AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM="-lmw")
+  AC_CHECK_LIB(m, main, LIBM="$LIBM -lm")
+  ;;
+*)
+  AC_CHECK_LIB(m, main, LIBM="-lm")
+  ;;
+esac
+])
+
+# AC_LIBLTDL_CONVENIENCE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl convenience library and INCLTDL to the include flags for
+# the libltdl header and adds --enable-ltdl-convenience to the
+# configure arguments.  Note that LIBLTDL and INCLTDL are not
+# AC_SUBSTed, nor is AC_CONFIG_SUBDIRS called.  If DIR is not
+# provided, it is assumed to be `libltdl'.  LIBLTDL will be prefixed
+# with '${top_builddir}/' and INCLTDL will be prefixed with
+# '${top_srcdir}/' (note the single quotes!).  If your package is not
+# flat and you're not using automake, define top_builddir and
+# top_srcdir appropriately in the Makefiles.
+AC_DEFUN([AC_LIBLTDL_CONVENIENCE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+  case $enable_ltdl_convenience in
+  no) AC_MSG_ERROR([this package needs a convenience libltdl]) ;;
+  "") enable_ltdl_convenience=yes
+      ac_configure_args="$ac_configure_args --enable-ltdl-convenience" ;;
+  esac
+  LIBLTDL='${top_builddir}/'ifelse($#,1,[$1],['libltdl'])/libltdlc.la
+  INCLTDL='-I${top_srcdir}/'ifelse($#,1,[$1],['libltdl'])
+])
+
+# AC_LIBLTDL_INSTALLABLE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl installable library and INCLTDL to the include flags for
+# the libltdl header and adds --enable-ltdl-install to the configure
+# arguments.  Note that LIBLTDL and INCLTDL are not AC_SUBSTed, nor is
+# AC_CONFIG_SUBDIRS called.  If DIR is not provided and an installed
+# libltdl is not found, it is assumed to be `libltdl'.  LIBLTDL will
+# be prefixed with '${top_builddir}/' and INCLTDL will be prefixed
+# with '${top_srcdir}/' (note the single quotes!).  If your package is
+# not flat and you're not using automake, define top_builddir and
+# top_srcdir appropriately in the Makefiles.
+# In the future, this macro may have to be called after AC_PROG_LIBTOOL.
+AC_DEFUN([AC_LIBLTDL_INSTALLABLE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+  AC_CHECK_LIB(ltdl, main,
+  [test x"$enable_ltdl_install" != xyes && enable_ltdl_install=no],
+  [if test x"$enable_ltdl_install" = xno; then
+     AC_MSG_WARN([libltdl not installed, but installation disabled])
+   else
+     enable_ltdl_install=yes
+   fi
+  ])
+  if test x"$enable_ltdl_install" = x"yes"; then
+    ac_configure_args="$ac_configure_args --enable-ltdl-install"
+    LIBLTDL='${top_builddir}/'ifelse($#,1,[$1],['libltdl'])/libltdl.la
+    INCLTDL='-I${top_srcdir}/'ifelse($#,1,[$1],['libltdl'])
+  else
+    ac_configure_args="$ac_configure_args --enable-ltdl-install=no"
+    LIBLTDL="-lltdl"
+    INCLTDL=
+  fi
+])
+
+# If this macro is not defined by Autoconf, define it here.
+ifdef([AC_PROVIDE_IFELSE],
+      [],
+      [define([AC_PROVIDE_IFELSE],
+              [ifdef([AC_PROVIDE_$1],
+                     [$2], [$3])])])
+
+# AC_LIBTOOL_F77 - enable support for fortran libraries
+AC_DEFUN([AC_LIBTOOL_F77], [AC_REQUIRE([_AC_LIBTOOL_F77])])
+
+AC_DEFUN([_AC_LIBTOOL_F77],
+[AC_REQUIRE([AC_PROG_F77])
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-f77.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the fortran compiler.
+AR="$AR" LTCC="$CC" CC="$F77" F77="$F77" CFLAGS="$FFLAGS" CPPFLAGS="" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=F77 $ac_aux_dir/ltcf-f77.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+# AC_LIBTOOL_CXX - enable support for C++ libraries
+AC_DEFUN([AC_LIBTOOL_CXX], [AC_REQUIRE([_AC_LIBTOOL_CXX])])
+
+AC_DEFUN([_AC_LIBTOOL_CXX],
+[AC_REQUIRE([AC_PROG_CXX])
+AC_REQUIRE([AC_PROG_CXXCPP])
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-cxx.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the C++ compiler.
+AR="$AR" LTCC="$CC" CC="$CXX" CXX="$CXX" CFLAGS="$CXXFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=CXX $ac_aux_dir/ltcf-cxx.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+# AC_LIBTOOL_GCJ - enable support for GCJ libraries
+AC_DEFUN([AC_LIBTOOL_GCJ],[AC_REQUIRE([_AC_LIBTOOL_GCJ])])
+
+AC_DEFUN([_AC_LIBTOOL_GCJ],
+[AC_REQUIRE([AC_PROG_LIBTOOL])
+AC_PROVIDE_IFELSE([AC_PROG_GCJ],[],
+  [AC_PROVIDE_IFELSE([A][M_PROG_GCJ],[],
+    [AC_PROVIDE_IFELSE([LT_AC_PROG_GCJ],[],
+      [ifdef([AC_PROG_GCJ],[AC_REQUIRE([AC_PROG_GCJ])],
+         [ifdef([A][M_PROG_GCJ],[AC_REQUIRE([A][M_PROG_GCJ])],
+           [AC_REQUIRE([A][C_PROG_GCJ_OR_A][M_PROG_GCJ])])])])])])
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-gcj.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the GCJ compiler.
+AR="$AR" LTCC="$CC" CC="$GCJ" CFLAGS="$GCJFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=GCJ $ac_aux_dir/ltcf-gcj.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+dnl old names
+AC_DEFUN([AM_PROG_LIBTOOL],   [AC_PROG_LIBTOOL])
+AC_DEFUN([AM_ENABLE_SHARED],  [AC_ENABLE_SHARED($@)])
+AC_DEFUN([AM_ENABLE_STATIC],  [AC_ENABLE_STATIC($@)])
+AC_DEFUN([AM_DISABLE_SHARED], [AC_DISABLE_SHARED($@)])
+AC_DEFUN([AM_DISABLE_STATIC], [AC_DISABLE_STATIC($@)])
+AC_DEFUN([AM_PROG_LD],        [AC_PROG_LD])
+AC_DEFUN([AM_PROG_NM],        [AC_PROG_NM])
+
+dnl This is just to silence aclocal about these macros not being used
+ifelse([AC_DISABLE_FAST_INSTALL])dnl
+ifelse([AC_DISABLE_SHARED])dnl
+
+AC_DEFUN([LT_AC_PROG_GCJ],
+[AC_CHECK_TOOL(GCJ, gcj, no)
+  test "x${GCJFLAGS+set}" = xset || GCJFLAGS="-g -O2"
+  AC_SUBST(GCJFLAGS)
+])
diff --git a/aclocal.m4 b/aclocal.m4

index 6649ec70d49510d247e0e17424979b224ef7ebf5..5f5384193048d2efc1d3923c16a41ae0b9e04f80 100644 (file)
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1,15 +1,14 @@
-# aclocal.m4 generated automatically by aclocal 1.4d
+dnl aclocal.m4 generated automatically by aclocal 1.4
  
-# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000
-# Free Software Foundation, Inc.
-# This file is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
+dnl Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc.
+dnl This file is free software; the Free Software Foundation
+dnl gives unlimited permission to copy and/or distribute it,
+dnl with or without modifications, as long as this notice is preserved.
  
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
-# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE.
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+dnl even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+dnl PARTICULAR PURPOSE.
  
   
  # ACX_CHECK_FFTW()
@@ -44,7 +43,7 @@ AC_MSG_RESULT(no))
  
  
  if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
  AC_TRY_COMPILE([#include <$fftwname.h>],
  [int _array_ [1 - 2 * !((sizeof(fftw_real)) == $2)]; ],
  ok="yes",ok="no")
@@ -59,29 +58,40 @@ if test "$ok" != "yes"; then
  [
  AC_MSG_RESULT(no)
  AC_MSG_ERROR([Cannot find any $prec precision $fftwname.h or $xfftwname.h]
-[Do you have $prec precision FFTW installed? You can find it at www.fftw.org] 
-[Note that the default FFTW setup is double precision. You change the]
-[FFTW configuration to single with --enable-float and turn on MPI support]
-[with --enable-mpi. It is a good idea to install both single & double.] 
+[Do you have $prec precision FFTW installed? If you are using packages,]
+[note that you also need fftw-devel to compile GROMACS. You can find the ]
+[software at www.fftw.org, and detailed instructions at www.gromacs.org.]
+[If you compiled FFTW yourself:                                        ]
+[Note that the default FFTW setup is double precision. Change the FFTW]
+[configuration to single with --enable-float. If you want MPI support,]
+[use --enable-mpi. It is a good idea to install both single & double.] 
  [If your sysadm doesn't want to install it you can do it to a location]
-[in your home directory and provide Gromacs configure with the correct]
-[paths by setting the CPPFLAGS and LDFLAGS environment variables.]
-[Check the Gromacs INSTALL file for additional information.])
+[in your home directory and provide the correct paths in the CPPFLAGS]
+[and LDFLAGS environment variables before running configure.]
+[That is also necessary to do if your compiler doesn't search]
+[/usr/local/include and /usr/local/lib by default.]
+[You can find information at www.gromacs.org, or in the INSTALL file.])
  ])
  AC_TRY_COMPILE([#include <$xfftwname.h>],[int _array_ [1 - 2 * !((sizeof(fftw_real)) == $2)];],
  [
  fftwname=$xfftwname 
  usedprefix=$fftwcheckprefix
  ],
+[
  AC_MSG_ERROR([Cannot find any $prec precision $fftwname.h or $xfftwname.h]
-[Do you have $prec precision FFTW installed? You can find it at www.fftw.org] 
-[Note that the default FFTW setup is double precision. You change the]
-[FFTW configuration to single with --enable-float and turn on MPI support]
-[with --enable-mpi. It is a good idea to install both single & double.] 
+[Do you have $prec precision FFTW installed? If you are using packages,]
+[note that you also need fftw-devel to compile GROMACS. You can find the ]
+[software at www.fftw.org, and detailed instructions at www.gromacs.org.]
+[If you compiled FFTW yourself:                                       ]
+[Note that the default FFTW setup is double precision. Change the FFTW]
+[configuration to single with --enable-float. If you want MPI support,]
+[use --enable-mpi. It is a good idea to install both single & double.] 
  [If your sysadm doesn't want to install it you can do it to a location]
-[in your home directory and provide Gromacs configure with the correct]
-[paths by setting the CPPFLAGS and LDFLAGS environment variables.]
-[Check the Gromacs INSTALL file for additional information.]))
+[in your home directory and provide the correct paths in the CPPFLAGS]
+[and LDFLAGS environment variables before running configure.]
+[That is also necessary to do if your compiler doesn't search]
+[/usr/local/include and /usr/local/lib by default.]
+[You can find information at www.gromacs.org, or in the INSTALL file.])])
  fi
  
  AC_CHECK_LIB($fftwname,main,,AC_MSG_ERROR([Can't find a library to match the $fftwname header]))
@@ -240,11 +250,11 @@ then
  fi
  
  AC_ARG_WITH(motif-includes,
-[  --with-motif-includes=DIR    Motif include files are in DIR],
+[  --with-motif-includes=DIR     Motif include files are in DIR],
  motif_includes="$withval")
  
  AC_ARG_WITH(motif-libraries,
-[  --with-motif-libraries=DIR   Motif libraries are in DIR],
+[  --with-motif-libraries=DIR    Motif libraries are in DIR],
  motif_libraries="$withval")
  
  AC_MSG_CHECKING(for Motif)
@@ -548,6 +558,8 @@ AC_REQUIRE([AC_CANONICAL_HOST])
  # determine our suggested choices for both C and fortran, and then possibly
  # override them with user choices.
  
+cc_vendor="unknown"
+
  case "${host_cpu}-${host_os}" in
  
    *-solaris2*) 
@@ -718,6 +730,7 @@ case "${host_cpu}-${host_os}" in
      esac
      if $CC -V 2>  /dev/null | grep Compaq > /dev/null 2>&1; then
        xCFLAGS="$tmpCFLAGS"
+      cc_vendor="Compaq"
      fi
      if test "$enable_fortran" = "yes"; then
        if $F77 -V 2>  /dev/null | grep Compaq > /dev/null 2>&1; then
@@ -741,7 +754,7 @@ case "${host_cpu}-${host_os}" in
        xCFLAGS="$pgiopt -fast -Minfo=loop -pc 32"
      fi
      if test "$enable_fortran" = "yes"; then
-      if $F77 -V 2>  /dev/null | grep Portland /dev/null 2>&1; then
+      if $F77 -V 2>  /dev/null | grep Portland > /dev/null 2>&1; then
         xFFLAGS="$xCFLAGS -Mneginfo=loop"
        fi       
      fi
@@ -765,7 +778,9 @@ if test $enable_fortran = yes; then
  fi
  
  CPU_FLAGS=""
+
  if test "$GCC" = "yes"; then
+  AM_CONDITIONAL(GNU_CC,true)
    # try to guess correct CPU flags, at least for linux
    case "${host_cpu}" in
      # i586/i686 cpu flags don't improve speed, thus no need to use them.
@@ -786,6 +801,8 @@ if test "$GCC" = "yes"; then
         ACX_CHECK_CC_FLAGS(-mpowerpc,m_powerpc,CPU_FLAGS=-mpowerpc)
        fi
     esac
+else
+  AM_CONDITIONAL(GNU_CC,false)
  fi
  
  if test -n "$CPU_FLAGS"; then
@@ -806,7 +823,7 @@ if test "$ac_test_CFLAGS" != "set"; then
      echo "*******************************************************************"
      echo "* WARNING: No special optimization settings found for the C       *"
      echo "* compiler. Use  make CFLAGS=..., or edit the top level Makefile. *"
-    echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it)  *"
+    echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it!)*"
      echo "*******************************************************************"
      CFLAGS="-O3"
    fi
@@ -826,12 +843,11 @@ fi
  if test "$enable_fortran" = "yes"; then        
    if test "$ac_test_FFLAGS" != "set"; then
      FFLAGS="$xFFLAGS"
-    
      if test -z "$FFLAGS"; then
        echo "*******************************************************************"
        echo "* WARNING: No special optimization settings found for the fortran *"
        echo "* compiler. Use  make FFLAGS=..., or edit the top level Makefile. *"
-      echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it) *"
+      echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it!)*"
        echo "*******************************************************************"
        FFLAGS="-O3"
      fi
@@ -848,6 +864,7 @@ if test "$enable_fortran" = "yes"; then
      echo "******************************************"
    fi
  fi
+  
  ])
  
  
@@ -856,28 +873,753 @@ fi
  
  
  
-# Do all the work for Automake.  This macro actually does too much --
-# some checks are only needed if your package does certain things.
-# But this isn't really a big deal.
+# serial 46 AC_PROG_LIBTOOL
+AC_DEFUN([AC_PROG_LIBTOOL],
+[AC_REQUIRE([_AC_PROG_LIBTOOL])dnl
+dnl If AC_PROG_CXX has already been expanded, run AC_LIBTOOL_CXX
+dnl immediately, otherwise, hook it in at the end of AC_PROG_CXX.
+  AC_PROVIDE_IFELSE([AC_PROG_CXX],
+    [AC_LIBTOOL_CXX],
+    [define([AC_PROG_CXX], defn([AC_PROG_CXX])[AC_LIBTOOL_CXX
+])])
+  AC_PROVIDE_IFELSE([AC_PROG_F77],
+    [AC_LIBTOOL_F77],
+    [define([AC_PROG_F77], defn([AC_PROG_F77])[AC_LIBTOOL_F77
+])])
+
+dnl Quote A][M_PROG_GCJ so that aclocal doesn't bring it in needlessly.
+dnl If either AC_PROG_GCJ or A][M_PROG_GCJ have already been expanded, run
+dnl AC_LIBTOOL_GCJ immediately, otherwise, hook it in at the end of both.
+  AC_PROVIDE_IFELSE([AC_PROG_GCJ],
+    [AC_LIBTOOL_GCJ],
+    [AC_PROVIDE_IFELSE([A][M_PROG_GCJ],
+        [AC_LIBTOOL_GCJ],
+       [AC_PROVIDE_IFELSE([LT_AC_PROG_GCJ],
+         [AC_LIBTOOL_GCJ],
+       [ifdef([AC_PROG_GCJ],
+              [define([AC_PROG_GCJ], defn([AC_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])
+        ifdef([A][M_PROG_GCJ],
+              [define([A][M_PROG_GCJ], defn([A][M_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])
+        ifdef([LT_AC_PROG_GCJ],
+              [define([LT_AC_PROG_GCJ], defn([LT_AC_PROG_GCJ])[AC_LIBTOOL_GCJ
+])])])])])])
+
+AC_DEFUN([_AC_PROG_LIBTOOL],
+[AC_REQUIRE([AC_LIBTOOL_SETUP])dnl
+AC_BEFORE([$0],[AC_LIBTOOL_CXX])dnl
+AC_BEFORE([$0],[AC_LIBTOOL_GCJ])dnl
+
+# Save cache, so that ltconfig can load it
+AC_CACHE_SAVE
+
+# Actually configure libtool.  ac_aux_dir is where install-sh is found.
+AR="$AR" LTCC="$CC" CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig --no-reexec \
+$libtool_flags --no-verify --build="$build" $ac_aux_dir/ltmain.sh $host \
+|| AC_MSG_ERROR([libtool configure failed])
+
+# Reload cache, that may have been modified by ltconfig
+AC_CACHE_LOAD
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltconfig $ac_aux_dir/ltmain.sh $ac_aux_dir/ltcf-c.sh"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+AC_SUBST(LIBTOOL)dnl
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+AC_DEFUN([AC_LIBTOOL_SETUP],
+[AC_PREREQ(2.13)dnl
+AC_REQUIRE([AC_ENABLE_SHARED])dnl
+AC_REQUIRE([AC_ENABLE_STATIC])dnl
+AC_REQUIRE([AC_ENABLE_FAST_INSTALL])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_PROG_LD])dnl
+AC_REQUIRE([AC_PROG_LD_RELOAD_FLAG])dnl
+AC_REQUIRE([AC_PROG_NM])dnl
+AC_REQUIRE([AC_PROG_LN_S])dnl
+AC_REQUIRE([AC_DEPLIBS_CHECK_METHOD])dnl
+# Autoconf 2.13's AC_OBJEXT and AC_EXEEXT macros only work for C compilers!
+AC_REQUIRE([AC_OBJEXT])dnl
+AC_REQUIRE([AC_EXEEXT])dnl
+dnl
+
+# Only perform the check for file, if the check method requires it
+case $deplibs_check_method in
+file_magic*)
+  if test "$file_magic_cmd" = '$MAGIC_CMD'; then
+    AC_PATH_MAGIC
+  fi
+  ;;
+esac
  
-# serial 5
+AC_CHECK_TOOL(RANLIB, ranlib, :)
+AC_CHECK_TOOL(STRIP, strip, :)
+
+# Check for any special flags to pass to ltconfig.
+libtool_flags="--cache-file=$cache_file"
+test "$enable_shared" = no && libtool_flags="$libtool_flags --disable-shared"
+test "$enable_static" = no && libtool_flags="$libtool_flags --disable-static"
+test "$enable_fast_install" = no && libtool_flags="$libtool_flags --disable-fast-install"
+test "$GCC" = yes && libtool_flags="$libtool_flags --with-gcc"
+test "$lt_cv_prog_gnu_ld" = yes && libtool_flags="$libtool_flags --with-gnu-ld"
+ifdef([AC_PROVIDE_AC_LIBTOOL_DLOPEN],
+[libtool_flags="$libtool_flags --enable-dlopen"])
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[libtool_flags="$libtool_flags --enable-win32-dll"])
+AC_ARG_ENABLE(libtool-lock,
+  [  --disable-libtool-lock        avoid locking (might break parallel builds)])
+test "x$enable_libtool_lock" = xno && libtool_flags="$libtool_flags --disable-lock"
+test x"$silent" = xyes && libtool_flags="$libtool_flags --silent"
+
+AC_ARG_WITH(pic,
+  [  --with-pic                    try to use only PIC/non-PIC [default=both]],
+     pic_mode="$withval", pic_mode=default)
+test x"$pic_mode" = xyes && libtool_flags="$libtool_flags --prefer-pic"
+test x"$pic_mode" = xno && libtool_flags="$libtool_flags --prefer-non-pic"
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case $host in
+*-*-irix6*)
+  # Find out which ABI we are using.
+  echo '[#]line __oline__ "configure"' > conftest.$ac_ext
+  if AC_TRY_EVAL(ac_compile); then
+    case `/usr/bin/file conftest.$ac_objext` in
+    *32-bit*)
+      LD="${LD-ld} -32"
+      ;;
+    *N32*)
+      LD="${LD-ld} -n32"
+      ;;
+    *64-bit*)
+      LD="${LD-ld} -64"
+      ;;
+    esac
+  fi
+  rm -rf conftest*
+  ;;
  
-# There are a few dirty hacks below to avoid letting `AC_PROG_CC' be
-# written in clear, in which case automake, when reading aclocal.m4,
-# will think it sees a *use*, and therefore will trigger all it's
-# C support machinery.  Also note that it means that autoscan, seeing
-# CC etc. in the Makefile, will ask for an AC_PROG_CC use...
+*-*-sco3.2v5*)
+  # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+  SAVE_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -belf"
+  AC_CACHE_CHECK([whether the C compiler needs -belf], lt_cv_cc_needs_belf,
+    [AC_LANG_SAVE
+     AC_LANG_C
+     AC_TRY_LINK([],[],[lt_cv_cc_needs_belf=yes],[lt_cv_cc_needs_belf=no])
+     AC_LANG_RESTORE])
+  if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+    # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+    CFLAGS="$SAVE_CFLAGS"
+  fi
+  ;;
  
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[*-*-cygwin* | *-*-mingw* | *-*-pw32*)
+  AC_CHECK_TOOL(DLLTOOL, dlltool, false)
+  AC_CHECK_TOOL(AS, as, false)
+  AC_CHECK_TOOL(OBJDUMP, objdump, false)
+
+  # recent cygwin and mingw systems supply a stub DllMain which the user
+  # can override, but on older systems we have to supply one
+  AC_CACHE_CHECK([if libtool should supply DllMain function], lt_cv_need_dllmain,
+    [AC_TRY_LINK([],
+      [extern int __attribute__((__stdcall__)) DllMain(void*, int, void*);
+      DllMain (0, 0, 0);],
+      [lt_cv_need_dllmain=no],[lt_cv_need_dllmain=yes])])
+
+  case $host/$CC in
+  *-*-cygwin*/gcc*-mno-cygwin*|*-*-mingw*)
+    # old mingw systems require "-dll" to link a DLL, while more recent ones
+    # require "-mdll"
+    SAVE_CFLAGS="$CFLAGS"
+    CFLAGS="$CFLAGS -mdll"
+    AC_CACHE_CHECK([how to link DLLs], lt_cv_cc_dll_switch,
+      [AC_TRY_LINK([], [], [lt_cv_cc_dll_switch=-mdll],[lt_cv_cc_dll_switch=-dll])])
+    CFLAGS="$SAVE_CFLAGS" ;;
+  *-*-cygwin* | *-*-pw32*)
+    # cygwin systems need to pass --dll to the linker, and not link
+    # crt.o which will require a WinMain@16 definition.
+    lt_cv_cc_dll_switch="-Wl,--dll -nostartfiles" ;;
+  esac
+  ;;
+  ])
+esac
+])
  
-# We require 2.13 because we rely on SHELL being computed by configure.
-AC_PREREQ([2.13])
+# AC_LIBTOOL_DLOPEN - enable checks for dlopen support
+AC_DEFUN([AC_LIBTOOL_DLOPEN], [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])])
+
+# AC_LIBTOOL_WIN32_DLL - declare package support for building win32 dll's
+AC_DEFUN([AC_LIBTOOL_WIN32_DLL], [AC_BEFORE([$0], [AC_LIBTOOL_SETUP])])
+
+# AC_ENABLE_SHARED - implement the --enable-shared flag
+# Usage: AC_ENABLE_SHARED[(DEFAULT)]
+#   Where DEFAULT is either `yes' or `no'.  If omitted, it defaults to
+#   `yes'.
+AC_DEFUN([AC_ENABLE_SHARED],
+[define([AC_ENABLE_SHARED_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(shared,
+changequote(<<, >>)dnl
+<<  --enable-shared[=PKGS]        build shared libraries [default=>>AC_ENABLE_SHARED_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)
+  enable_shared=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_shared=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac],
+enable_shared=AC_ENABLE_SHARED_DEFAULT)dnl
+])
+
+# AC_DISABLE_SHARED - set the default shared flag to --disable-shared
+AC_DEFUN([AC_DISABLE_SHARED], [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_SHARED(no)])
+
+# AC_ENABLE_STATIC - implement the --enable-static flag
+# Usage: AC_ENABLE_STATIC[(DEFAULT)]
+#   Where DEFAULT is either `yes' or `no'.  If omitted, it defaults to
+#   `yes'.
+AC_DEFUN([AC_ENABLE_STATIC],
+[define([AC_ENABLE_STATIC_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(static,
+changequote(<<, >>)dnl
+<<  --enable-static[=PKGS]        build static libraries [default=>>AC_ENABLE_STATIC_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)
+  enable_static=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_static=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac],
+enable_static=AC_ENABLE_STATIC_DEFAULT)dnl
+])
+
+# AC_DISABLE_STATIC - set the default static flag to --disable-static
+AC_DEFUN([AC_DISABLE_STATIC],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_STATIC(no)])
+
+
+# AC_ENABLE_FAST_INSTALL - implement the --enable-fast-install flag
+# Usage: AC_ENABLE_FAST_INSTALL[(DEFAULT)]
+#   Where DEFAULT is either `yes' or `no'.  If omitted, it defaults to
+#   `yes'.
+AC_DEFUN([AC_ENABLE_FAST_INSTALL],
+[define([AC_ENABLE_FAST_INSTALL_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(fast-install,
+changequote(<<, >>)dnl
+<<  --enable-fast-install[=PKGS]  optimize for fast installation [default=>>AC_ENABLE_FAST_INSTALL_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_fast_install=yes ;;
+no) enable_fast_install=no ;;
+*)
+  enable_fast_install=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_fast_install=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac],
+enable_fast_install=AC_ENABLE_FAST_INSTALL_DEFAULT)dnl
+])
+
+# AC_DISABLE_FAST_INSTALL - set the default to --disable-fast-install
+AC_DEFUN([AC_DISABLE_FAST_INSTALL],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_FAST_INSTALL(no)])
+
+# AC_LIBTOOL_PICMODE - implement the --with-pic flag
+# Usage: AC_LIBTOOL_PICMODE[(MODE)]
+#   Where MODE is either `yes' or `no'.  If omitted, pic_mode is set to
+#   `default', i.e. both PIC and non-PIC are tried.
+AC_DEFUN([AC_LIBTOOL_PICMODE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+pic_mode=ifelse($#,1,$1,default)])
+
+
+# AC_PATH_TOOL_PREFIX - find a file program which can recognise a shared library
+AC_DEFUN([AC_PATH_TOOL_PREFIX],
+[AC_MSG_CHECKING([for $1])
+AC_CACHE_VAL(lt_cv_path_MAGIC_CMD,
+[case $MAGIC_CMD in
+  /*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+  ;;
+  ?:/*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a dos path.
+  ;;
+  *)
+  ac_save_MAGIC_CMD="$MAGIC_CMD"
+  IFS="${IFS=   }"; ac_save_ifs="$IFS"; IFS=":"
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word.  This closes a longstanding sh security hole.
+  ac_dummy="ifelse([$2], , $PATH, [$2])"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$1; then
+      lt_cv_path_MAGIC_CMD="$ac_dir/$1"
+      if test -n "$file_magic_test_file"; then
+       case $deplibs_check_method in
+       "file_magic "*)
+         file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+         MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+         if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+           egrep "$file_magic_regex" > /dev/null; then
+           :
+         else
+           cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such.  This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem.  Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+         fi ;;
+       esac
+      fi
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  MAGIC_CMD="$ac_save_MAGIC_CMD"
+  ;;
+esac])
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+  AC_MSG_RESULT($MAGIC_CMD)
+else
+  AC_MSG_RESULT(no)
+fi
+])
+
+
+# AC_PATH_MAGIC - find a file program which can recognise a shared library
+AC_DEFUN([AC_PATH_MAGIC],
+[AC_REQUIRE([AC_CHECK_TOOL_PREFIX])dnl
+AC_PATH_TOOL_PREFIX(${ac_tool_prefix}file, /usr/bin:$PATH)
+if test -z "$lt_cv_path_MAGIC_CMD"; then
+  if test -n "$ac_tool_prefix"; then
+    AC_PATH_TOOL_PREFIX(file, /usr/bin:$PATH)
+  else
+    MAGIC_CMD=:
+  fi
+fi
+])
+
+
+# AC_PROG_LD - find the path to the GNU or non-GNU linker
+#   Honours --with-gnu-ld and a linker preset in LD.  When GCC is the
+#   compiler, the name printed by its -print-prog-name=ld option is tried
+#   first.  The chosen path is cached in lt_cv_path_LD and stored in LD;
+#   configure aborts if no linker is found.  Finishes by running
+#   AC_PROG_LD_GNU on the result.
+AC_DEFUN([AC_PROG_LD],
+[AC_ARG_WITH(gnu-ld,
+[  --with-gnu-ld                 assume the C compiler uses GNU ld [default=no]],
+test "$withval" = no || with_gnu_ld=yes, with_gnu_ld=no)
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+ac_prog=ld
+if test "$GCC" = yes; then
+  # Check if gcc -print-prog-name=ld gives a path.
+  AC_MSG_CHECKING([for ld used by GCC])
+  case $host in
+  *-*-mingw*)
+    # gcc leaves a trailing carriage return which upsets mingw
+    ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+  *)
+    ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+  esac
+  case $ac_prog in
+    # Accept absolute paths.
+    [[\\/]* | [A-Za-z]:[\\/]*)]
+      re_direlt=['/[^/][^/]*/\.\./']
+      # Canonicalize the path of ld
+      ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+      while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+       ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+      done
+      # A linker the user already put in LD takes precedence.
+      test -z "$LD" && LD="$ac_prog"
+      ;;
+  "")
+    # If it fails, then pretend we aren't using GCC.
+    ac_prog=ld
+    ;;
+  *)
+    # If it is relative, then search for the first ld in PATH.
+    # The value unknown makes the PATH loop below accept either flavour.
+    with_gnu_ld=unknown
+    ;;
+  esac
+elif test "$with_gnu_ld" = yes; then
+  AC_MSG_CHECKING([for GNU ld])
+else
+  AC_MSG_CHECKING([for non-GNU ld])
+fi
+AC_CACHE_VAL(lt_cv_path_LD,
+[if test -z "$LD"; then
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+  for ac_dir in $PATH; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+      lt_cv_path_LD="$ac_dir/$ac_prog"
+      # Check to see if the program is GNU ld.  I'd rather use --version,
+      # but apparently some GNU ld's only accept -v.
+      # Break only if it was the GNU/non-GNU ld that we prefer.
+      if "$lt_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+       test "$with_gnu_ld" != no && break
+      else
+       test "$with_gnu_ld" != yes && break
+      fi
+    fi
+  done
+  IFS="$ac_save_ifs"
+else
+  lt_cv_path_LD="$LD" # Let the user override the test with a path.
+fi])
+LD="$lt_cv_path_LD"
+if test -n "$LD"; then
+  AC_MSG_RESULT($LD)
+else
+  AC_MSG_RESULT(no)
+fi
+test -z "$LD" && AC_MSG_ERROR([no acceptable ld found in \$PATH])
+AC_PROG_LD_GNU
+])
+
+# AC_PROG_LD_GNU - check whether the linker named by LD is GNU ld
+#   The yes/no answer is cached in lt_cv_prog_gnu_ld and then copied to
+#   with_gnu_ld, replacing whatever value that variable held before.
+AC_DEFUN([AC_PROG_LD_GNU],
+[AC_CACHE_CHECK([if the linker ($LD) is GNU ld], lt_cv_prog_gnu_ld,
+[# I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+  lt_cv_prog_gnu_ld=yes
+else
+  lt_cv_prog_gnu_ld=no
+fi])
+with_gnu_ld=$lt_cv_prog_gnu_ld
+])
+
+# AC_PROG_LD_RELOAD_FLAG - find reload flag for linker
+#   -- PORTME Some linkers may need a different reload flag.
+#   No probing is done: the cached value defaults to -r.  A non-empty
+#   flag is stored in reload_flag with one leading space.
+AC_DEFUN([AC_PROG_LD_RELOAD_FLAG],
+[AC_CACHE_CHECK([for $LD option to reload object files], lt_cv_ld_reload_flag,
+[lt_cv_ld_reload_flag='-r'])
+reload_flag=$lt_cv_ld_reload_flag
+test -n "$reload_flag" && reload_flag=" $reload_flag"
+])
+
+# AC_DEPLIBS_CHECK_METHOD - how to check for library dependencies
+#  -- PORTME fill in with the dynamic library characteristics
+#  Selects a method per host_os and caches it in
+#  lt_cv_deplibs_check_method.  Some branches also set
+#  lt_cv_file_magic_cmd and lt_cv_file_magic_test_file.  On exit the
+#  method and command are copied to deplibs_check_method and
+#  file_magic_cmd.  Hosts not matched by any branch keep the method
+#  unknown.
+AC_DEFUN([AC_DEPLIBS_CHECK_METHOD],
+[AC_CACHE_CHECK([how to recognise dependant libraries],
+lt_cv_deplibs_check_method,
+[lt_cv_file_magic_cmd='$MAGIC_CMD'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [regex]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case $host_os in
+aix*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+beos*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+bsdi4*)
+  lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)']
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  ;;
+
+cygwin* | mingw* |pw32*)
+  lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+  lt_cv_file_magic_cmd='$OBJDUMP -f'
+  ;;
+
+darwin* | rhapsody*)
+  lt_cv_deplibs_check_method='file_magic Mach-O dynamically linked shared library'
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  case "$host_os" in
+  rhapsody* | darwin1.[012])
+    lt_cv_file_magic_test_file='/System/Library/Frameworks/System.framework/System'
+    ;;
+  *) # Darwin 1.3 on
+    lt_cv_file_magic_test_file='/usr/lib/libSystem.dylib'
+    ;;
+  esac
+  ;;
+
+freebsd* )
+  # The preprocessor output still contains __ELF__ only when the compiler
+  # does not define it.  That branch is handled with file_magic, and only
+  # for i*86; any other cpu there keeps the default unknown.
+  if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+    case $host_cpu in
+    i*86 )
+      # Not sure whether the presence of OpenBSD here was a mistake.
+      # Let's accept both of them until this is cleared up.
+      lt_cv_deplibs_check_method=['file_magic (FreeBSD|OpenBSD)/i[3-9]86 (compact )?demand paged shared library']
+      lt_cv_file_magic_cmd=/usr/bin/file
+      lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+      ;;
+    esac
+  else
+    lt_cv_deplibs_check_method=pass_all
+  fi
+  ;;
+
+gnu*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+hpux10.20*|hpux11*)
+  lt_cv_deplibs_check_method=['file_magic (s[0-9][0-9][0-9]|PA-RISC[0-9].[0-9]) shared library']
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libc.sl
+  ;;
+
+irix5* | irix6*)
+  case $host_os in
+  irix5*)
+    # this will be overridden with pass_all, but let us keep it just in case
+    lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+    ;;
+  *)
+    case $LD in
+    *-32|*"-32 ") libmagic=32-bit;;
+    *-n32|*"-n32 ") libmagic=N32;;
+    *-64|*"-64 ") libmagic=64-bit;;
+    *) libmagic=never-match;;
+    esac
+    # this will be overridden with pass_all, but let us keep it just in case
+    lt_cv_deplibs_check_method=["file_magic ELF ${libmagic} MSB mips-[1234] dynamic lib MIPS - version 1"]
+    ;;
+  esac
+  # NOTE(review): libsuff is not assigned anywhere in this macro, so it
+  # expands to whatever the caller left in it -- confirm that is intended.
+  lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+  case $host_cpu in
+  alpha* | hppa* | i*86 | powerpc* | sparc* | ia64* )
+    lt_cv_deplibs_check_method=pass_all ;;
+  *)
+    # glibc up to 2.1.1 does not perform some relocations on ARM
+    lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )'] ;;
+  esac
+  lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+  ;;
+
+netbsd*)
+  # Same __ELF__ probe as for freebsd; the two branches differ only in
+  # whether the library name must end in a two-part version number.
+  if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+    [lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so\.[0-9]+\.[0-9]+$']
+  else
+    [lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so$']
+  fi
+  ;;
+
+newsos6)
+  [lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (executable|dynamic lib)']
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libnls.so
+  ;;
+
+osf3* | osf4* | osf5*)
+  # this will be overridden with pass_all, but let us keep it just in case
+  lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sco3.2v5*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+solaris*)
+  lt_cv_deplibs_check_method=pass_all
+  lt_cv_file_magic_test_file=/lib/libc.so
+  ;;
+
+[sysv5uw[78]* | sysv4*uw2*)]
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+  # Only the ncr and motorola vendors are handled; every other vendor
+  # keeps the default unknown.
+  case $host_vendor in
+  ncr)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  motorola)
+    lt_cv_deplibs_check_method=['file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]']
+    lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+    ;;
+  esac
+  ;;
+esac
+])
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+])
+
+
+# AC_PROG_NM - find the path to a BSD-compatible name lister
+#   A value already present in NM is used unchanged.  Otherwise the
+#   directories of $PATH followed by /usr/ccs/bin, /usr/ucb and /bin are
+#   searched for ${ac_tool_prefix}nm.  An nm accepting -B is preferred,
+#   then one accepting -p; failing both, the first nm seen is kept, and
+#   plain nm is the last resort.  The result is cached in lt_cv_path_NM
+#   and stored in NM.
+AC_DEFUN([AC_PROG_NM],
+[AC_MSG_CHECKING([for BSD-compatible nm])
+AC_CACHE_VAL(lt_cv_path_NM,
+[if test -n "$NM"; then
+  # Let the user override the test.
+  lt_cv_path_NM="$NM"
+else
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+  for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do
+    test -z "$ac_dir" && ac_dir=.
+    tmp_nm=$ac_dir/${ac_tool_prefix}nm
+    if test -f $tmp_nm || test -f $tmp_nm$ac_exeext ; then
+      # Check to see if the nm accepts a BSD-compat flag.
+      # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+      #   nm: unknown option "B" ignored
+      # Tru64's nm complains that /dev/null is an invalid object file
+      if ($tmp_nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep '(/dev/null|Invalid file or object type)' >/dev/null; then
+       lt_cv_path_NM="$tmp_nm -B"
+       break
+      elif ($tmp_nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+       lt_cv_path_NM="$tmp_nm -p"
+       break
+      else
+       lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
+       continue # so that we can try to find one that supports BSD flags
+      fi
+    fi
+  done
+  IFS="$ac_save_ifs"
+  # No nm was found in any searched directory: rely on a bare nm.
+  test -z "$lt_cv_path_NM" && lt_cv_path_NM=nm
+fi])
+NM="$lt_cv_path_NM"
+AC_MSG_RESULT([$NM])
+])
+
+# AC_CHECK_LIBM - check for math library
+#   Sets LIBM to the link flags needed for the math library, or leaves it
+#   empty when none is needed or none is found.
+AC_DEFUN([AC_CHECK_LIBM],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+LIBM=
+case $host in
+*-*-beos* | *-*-cygwin* | *-*-pw32*)
+  # These systems don't have libm, so LIBM stays empty
+  ;;
+*-ncr-sysv4.3*)
+  # Try libmw first, then add libm after it when that is present too.
+  AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM="-lmw")
+  AC_CHECK_LIB(m, main, LIBM="$LIBM -lm")
+  ;;
+*)
+  AC_CHECK_LIB(m, main, LIBM="-lm")
+  ;;
+esac
+])
+
+# AC_LIBLTDL_CONVENIENCE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl convenience library and INCLTDL to the include flags for
+# the libltdl header and adds --enable-ltdl-convenience to the
+# configure arguments.  Note that LIBLTDL and INCLTDL are not
+# AC_SUBSTed, nor is AC_CONFIG_SUBDIRS called.  If DIR is not
+# provided, it is assumed to be `libltdl'.  LIBLTDL will be prefixed
+# with '${top_builddir}/' and INCLTDL will be prefixed with
+# '${top_srcdir}/' (note the single quotes!).  If your package is not
+# flat and you're not using automake, define top_builddir and
+# top_srcdir appropriately in the Makefiles.
+# Configure aborts when enable_ltdl_convenience is already set to no.
+AC_DEFUN([AC_LIBLTDL_CONVENIENCE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+  case $enable_ltdl_convenience in
+  no) AC_MSG_ERROR([this package needs a convenience libltdl]) ;;
+  # Unset so far: turn it on and add the option to ac_configure_args.
+  "") enable_ltdl_convenience=yes
+      ac_configure_args="$ac_configure_args --enable-ltdl-convenience" ;;
+  esac
+  LIBLTDL='${top_builddir}/'ifelse($#,1,[$1],['libltdl'])/libltdlc.la
+  INCLTDL='-I${top_srcdir}/'ifelse($#,1,[$1],['libltdl'])
+])
+
+# AC_LIBLTDL_INSTALLABLE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl installable library and INCLTDL to the include flags for
+# the libltdl header and adds --enable-ltdl-install to the configure
+# arguments.  Note that LIBLTDL and INCLTDL are not AC_SUBSTed, nor is
+# AC_CONFIG_SUBDIRS called.  If DIR is not provided and an installed
+# libltdl is not found, it is assumed to be `libltdl'.  LIBLTDL will
+# be prefixed with '${top_builddir}/' and INCLTDL will be prefixed
+# with '${top_srcdir}/' (note the single quotes!).  If your package is
+# not flat and you're not using automake, define top_builddir and
+# top_srcdir appropriately in the Makefiles.
+# In the future, this macro may have to be called after AC_PROG_LIBTOOL.
+# When an installed libltdl is found and installation was not requested
+# explicitly, LIBLTDL is just -lltdl and INCLTDL is left empty.
+AC_DEFUN([AC_LIBLTDL_INSTALLABLE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+  AC_CHECK_LIB(ltdl, main,
+  [test x"$enable_ltdl_install" != xyes && enable_ltdl_install=no],
+  [if test x"$enable_ltdl_install" = xno; then
+     AC_MSG_WARN([libltdl not installed, but installation disabled])
+   else
+     enable_ltdl_install=yes
+   fi
+  ])
+  # Use the bundled copy when installing it, the system library otherwise.
+  if test x"$enable_ltdl_install" = x"yes"; then
+    ac_configure_args="$ac_configure_args --enable-ltdl-install"
+    LIBLTDL='${top_builddir}/'ifelse($#,1,[$1],['libltdl'])/libltdl.la
+    INCLTDL='-I${top_srcdir}/'ifelse($#,1,[$1],['libltdl'])
+  else
+    ac_configure_args="$ac_configure_args --enable-ltdl-install=no"
+    LIBLTDL="-lltdl"
+    INCLTDL=
+  fi
+])
  
-# AC_PROVIDE_IFELSE(MACRO-NAME, IF-PROVIDED, IF-NOT-PROVIDED)
-# -----------------------------------------------------------
-# If MACRO-NAME is provided do IF-PROVIDED, else IF-NOT-PROVIDED.
-# The purpose of this macro is to provide the user with a means to
-# check macros which are provided without letting her know how the
-# information is coded.
  # If this macro is not defined by Autoconf, define it here.
  ifdef([AC_PROVIDE_IFELSE],
        [],
@@ -885,88 +1627,184 @@ ifdef([AC_PROVIDE_IFELSE],
                [ifdef([AC_PROVIDE_$1],
                       [$2], [$3])])])
  
+# AC_LIBTOOL_F77 - enable support for fortran libraries
+#   Thin wrapper: the work is done by _AC_LIBTOOL_F77, which is pulled in
+#   through AC_REQUIRE.
+AC_DEFUN([AC_LIBTOOL_F77], [AC_REQUIRE([_AC_LIBTOOL_F77])])
+
+# _AC_LIBTOOL_F77 - run ltconfig with --add-tag=F77 and ltcf-f77.sh
+#   For that one command CC is set to the Fortran compiler, CFLAGS to
+#   FFLAGS and CPPFLAGS to the empty string, while LTCC carries the C
+#   compiler.  Configure aborts if ltconfig fails.  ltcf-f77.sh is
+#   appended to LIBTOOL_DEPS.
+AC_DEFUN([_AC_LIBTOOL_F77],
+[AC_REQUIRE([AC_PROG_F77])
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-f77.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the fortran compiler.
+AR="$AR" LTCC="$CC" CC="$F77" F77="$F77" CFLAGS="$FFLAGS" CPPFLAGS="" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=F77 $ac_aux_dir/ltcf-f77.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
  
-# AM_INIT_AUTOMAKE(PACKAGE,VERSION, [NO-DEFINE])
-# ----------------------------------------------
-AC_DEFUN([AM_INIT_AUTOMAKE],
-[AC_REQUIRE([AC_PROG_INSTALL])dnl
-# test to see if srcdir already configured
-if test "`CDPATH=:; cd $srcdir && pwd`" != "`pwd`" &&
-   test -f $srcdir/config.status; then
-  AC_MSG_ERROR([source directory already configured; run \"make distclean\" there first])
-fi
+# AC_LIBTOOL_CXX - enable support for C++ libraries
+#   Thin wrapper: the work is done by _AC_LIBTOOL_CXX, which is pulled in
+#   through AC_REQUIRE.
+AC_DEFUN([AC_LIBTOOL_CXX], [AC_REQUIRE([_AC_LIBTOOL_CXX])])
+
+# _AC_LIBTOOL_CXX - run ltconfig with --add-tag=CXX and ltcf-cxx.sh
+#   For that one command CC is set to the C++ compiler and CFLAGS to
+#   CXXFLAGS, while LTCC carries the C compiler.  Configure aborts if
+#   ltconfig fails.  ltcf-cxx.sh is appended to LIBTOOL_DEPS.
+AC_DEFUN([_AC_LIBTOOL_CXX],
+[AC_REQUIRE([AC_PROG_CXX])
+AC_REQUIRE([AC_PROG_CXXCPP])
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-cxx.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the C++ compiler.
+AR="$AR" LTCC="$CC" CC="$CXX" CXX="$CXX" CFLAGS="$CXXFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=CXX $ac_aux_dir/ltcf-cxx.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
  
-# Define the identity of the package.
-PACKAGE=$1
-AC_SUBST(PACKAGE)dnl
-VERSION=$2
-AC_SUBST(VERSION)dnl
-ifelse([$3],,
-[AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
-AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])
-
-# Autoconf 2.50 wants to disallow AM_ names.  We explicitly allow
-# the ones we care about.
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_CFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_CPPFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_CXXFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_OBJCFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_FFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_RFLAGS])])
-ifdef([m4_pattern_allow], [m4_pattern_allow([AM_GCJFLAGS])])
-
-# Some tools Automake needs.
-AC_REQUIRE([AM_SANITY_CHECK])dnl
-AC_REQUIRE([AC_ARG_PROGRAM])dnl
-AM_MISSING_PROG(ACLOCAL, aclocal)
-AM_MISSING_PROG(AUTOCONF, autoconf)
-AM_MISSING_PROG(AUTOMAKE, automake)
-AM_MISSING_PROG(AUTOHEADER, autoheader)
-AM_MISSING_PROG(MAKEINFO, makeinfo)
-AM_MISSING_PROG(AMTAR, tar)
-AM_MISSING_INSTALL_SH
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
-AC_REQUIRE([AC_PROG_AWK])dnl
-AC_REQUIRE([AC_PROG_MAKE_SET])dnl
-AC_REQUIRE([AM_DEP_TRACK])dnl
-AC_REQUIRE([AM_SET_DEPDIR])dnl
-AC_PROVIDE_IFELSE([AC_PROG_][CC],
-                  [AM_DEPENDENCIES(CC)],
-                  [define([AC_PROG_][CC],
-                          defn([AC_PROG_][CC])[AM_DEPENDENCIES(CC)])])dnl
-AC_PROVIDE_IFELSE([AC_PROG_][CXX],
-                  [AM_DEPENDENCIES(CXX)],
-                  [define([AC_PROG_][CXX],
-                          defn([AC_PROG_][CXX])[AM_DEPENDENCIES(CXX)])])dnl
+# AC_LIBTOOL_GCJ - enable support for GCJ libraries
+#   Thin wrapper: the work is done by _AC_LIBTOOL_GCJ, which is pulled in
+#   through AC_REQUIRE.
+AC_DEFUN([AC_LIBTOOL_GCJ],[AC_REQUIRE([_AC_LIBTOOL_GCJ])])
+
+# _AC_LIBTOOL_GCJ - run ltconfig with --add-tag=GCJ and ltcf-gcj.sh
+#   Unless one of the three GCJ detection macros has already been
+#   provided, the first one that is defined gets required.  For the
+#   ltconfig command CC is set to GCJ and CFLAGS to GCJFLAGS, while LTCC
+#   carries the C compiler.  Configure aborts if ltconfig fails.
+AC_DEFUN([_AC_LIBTOOL_GCJ],
+[AC_REQUIRE([AC_PROG_LIBTOOL])
+AC_PROVIDE_IFELSE([AC_PROG_GCJ],[],
+  [AC_PROVIDE_IFELSE([A][M_PROG_GCJ],[],
+    [AC_PROVIDE_IFELSE([LT_AC_PROG_GCJ],[],
+      [ifdef([AC_PROG_GCJ],[AC_REQUIRE([AC_PROG_GCJ])],
+         [ifdef([A][M_PROG_GCJ],[AC_REQUIRE([A][M_PROG_GCJ])],
+           [AC_REQUIRE([A][C_PROG_GCJ_OR_A][M_PROG_GCJ])])])])])])
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-gcj.sh"
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+dnl Make sure LTCC is set to the C compiler, i.e. set LTCC before CC
+dnl is set to the GCJ compiler.
+AR="$AR" LTCC="$CC" CC="$GCJ" CFLAGS="$GCJFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=GCJ $ac_aux_dir/ltcf-gcj.sh $host \
+|| AC_MSG_ERROR([libtool tag configuration failed])
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+dnl Old names: each AM_ macro below simply expands to its AC_ counterpart,
+dnl forwarding its arguments where the AC_ macro is called with them.
+AC_DEFUN([AM_PROG_LIBTOOL],   [AC_PROG_LIBTOOL])
+AC_DEFUN([AM_ENABLE_SHARED],  [AC_ENABLE_SHARED($@)])
+AC_DEFUN([AM_ENABLE_STATIC],  [AC_ENABLE_STATIC($@)])
+AC_DEFUN([AM_DISABLE_SHARED], [AC_DISABLE_SHARED($@)])
+AC_DEFUN([AM_DISABLE_STATIC], [AC_DISABLE_STATIC($@)])
+AC_DEFUN([AM_PROG_LD],        [AC_PROG_LD])
+AC_DEFUN([AM_PROG_NM],        [AC_PROG_NM])
+
+dnl This is just to silence aclocal about these macros not being used.
+dnl Each ifelse gets one quoted argument and therefore expands to nothing.
+ifelse([AC_DISABLE_FAST_INSTALL])dnl
+ifelse([AC_DISABLE_SHARED])dnl
+
+# LT_AC_PROG_GCJ - look for the gcj tool and give GCJFLAGS a default
+#   GCJ is set to no when the tool is not found.  GCJFLAGS becomes
+#   "-g -O2" only when the variable is not set at all; it is substituted.
+AC_DEFUN([LT_AC_PROG_GCJ],
+[AC_CHECK_TOOL(GCJ, gcj, no)
+  test "x${GCJFLAGS+set}" = xset || GCJFLAGS="-g -O2"
+  AC_SUBST(GCJFLAGS)
  ])
  
+# Define a conditional.
+# Usage: AM_CONDITIONAL(NAME, SHELL-CONDITION)
+# Substitutes NAME_TRUE and NAME_FALSE.  When SHELL-CONDITION succeeds,
+# NAME_TRUE is empty and NAME_FALSE is a '#'; otherwise the two values
+# are swapped.
+
+AC_DEFUN(AM_CONDITIONAL,
+[AC_SUBST($1_TRUE)
+AC_SUBST($1_FALSE)
+if $2; then
+  $1_TRUE=
+  $1_FALSE='#'
+else
+  $1_TRUE='#'
+  $1_FALSE=
+fi])
+
+# Do all the work for Automake.  This macro actually does too much --
+# some checks are only needed if your package does certain things.
+# But this isn't really a big deal.
+#
+# Sets and substitutes PACKAGE and VERSION, refuses to run when the
+# source directory was already configured in place, defines PACKAGE and
+# VERSION in the config header unless a third argument is given, and
+# locates the maintainer tools, falling back to the missing script.
+
+# serial 1
+
+dnl Usage:
+dnl AM_INIT_AUTOMAKE(package,version, [no-define])
+
+AC_DEFUN(AM_INIT_AUTOMAKE,
+[AC_REQUIRE([AC_PROG_INSTALL])
+PACKAGE=[$1]
+AC_SUBST(PACKAGE)
+VERSION=[$2]
+AC_SUBST(VERSION)
+dnl test to see if srcdir already configured
+dnl CDPATH is neutralised inside the command substitution so that cd can
+dnl never print the directory it entered and corrupt the captured path.
+if test "`CDPATH=:; cd $srcdir && pwd`" != "`pwd`" && test -f $srcdir/config.status; then
+  AC_MSG_ERROR([source directory already configured; run "make distclean" there first])
+fi
+ifelse([$3],,
+AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
+AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package]))
+AC_REQUIRE([AM_SANITY_CHECK])
+AC_REQUIRE([AC_ARG_PROGRAM])
+dnl FIXME This is truly gross.
+dnl Same CDPATH guard as above: missing_dir must hold exactly one path.
+missing_dir=`CDPATH=:; cd $ac_aux_dir && pwd`
+AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir)
+AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir)
+AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir)
+AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir)
+AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir)
+AC_REQUIRE([AC_PROG_MAKE_SET])])
+
  #
  # Check to make sure that the build environment is sane.
  #
  
-# serial 3
-
-# AM_SANITY_CHECK
-# ---------------
-AC_DEFUN([AM_SANITY_CHECK],
+AC_DEFUN(AM_SANITY_CHECK,
  [AC_MSG_CHECKING([whether build environment is sane])
  # Just in case
  sleep 1
-echo timestamp > conftest.file
+echo timestamp > conftestfile
  # Do `set' in a subshell so we don't clobber the current shell's
  # arguments.  Must try -L first in case configure is actually a
  # symlink; some systems play weird games with the mod time of symlinks
  # (eg FreeBSD returns the mod time of the symlink's containing
  # directory).
  if (
-   set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null`
-   if test "$[*]" = "X"; then
+   set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null`
+   if test "[$]*" = "X"; then
        # -L didn't work.
-      set X `ls -t $srcdir/configure conftest.file`
+      set X `ls -t $srcdir/configure conftestfile`
     fi
-   if test "$[*]" != "X $srcdir/configure conftest.file" \
-      && test "$[*]" != "X conftest.file $srcdir/configure"; then
+   if test "[$]*" != "X $srcdir/configure conftestfile" \
+      && test "[$]*" != "X conftestfile $srcdir/configure"; then
  
        # If neither matched, then we have a broken ls.  This can happen
        # if, for instance, CONFIG_SHELL is bash and it inherits a
@@ -976,7 +1814,7 @@ if (
  alias in your environment])
     fi
  
-   test "$[2]" = conftest.file
+   test "[$]2" = conftestfile
     )
  then
     # Ok.
@@ -988,305 +1826,42 @@ fi
  rm -f conftest*
  AC_MSG_RESULT(yes)])
  
-
-# serial 2
-
-# AM_MISSING_PROG(NAME, PROGRAM)
-# ------------------------------
-AC_DEFUN([AM_MISSING_PROG],
-[AC_REQUIRE([AM_MISSING_HAS_RUN])
-$1=${$1-"${am_missing_run}$2"}
-AC_SUBST($1)])
-
-
-# AM_MISSING_INSTALL_SH
-# ---------------------
-# Like AM_MISSING_PROG, but only looks for install-sh.
-AC_DEFUN([AM_MISSING_INSTALL_SH],
-[AC_REQUIRE([AM_MISSING_HAS_RUN])
-if test -z "$install_sh"; then
-   for install_sh in "$ac_aux_dir/install-sh" \
-                     "$ac_aux_dir/install.sh" \
-                     "${am_missing_run}${ac_auxdir}/install-sh";
-   do
-     test -f "$install_sh" && break
-   done
-   # FIXME: an evil hack: we remove the SHELL invocation from
-   # install_sh because automake adds it back in.  Sigh.
-   install_sh=`echo $install_sh | sed -e 's/\${SHELL}//'`
-fi
-AC_SUBST(install_sh)])
-
-
-# AM_MISSING_HAS_RUN
-# ------------------
-# Define MISSING if not defined so far and test if it supports --run.
-# If it does, set am_missing_run to use it, otherwise, to nothing.
-AC_DEFUN([AM_MISSING_HAS_RUN],
-[test x"${MISSING+set}" = xset ||
-  MISSING="\${SHELL} `CDPATH=:; cd $ac_aux_dir && pwd`/missing"
-# Use eval to expand $SHELL
-if eval "$MISSING --run :"; then
-  am_missing_run="$MISSING --run "
-else
-  am_missing_run=
-  am_backtick='`'
-  AC_MSG_WARN([${am_backtick}missing' script is too old or missing])
-fi
-])
-
-# serial 3
-
-# There are a few dirty hacks below to avoid letting `AC_PROG_CC' be
-# written in clear, in which case automake, when reading aclocal.m4,
-# will think it sees a *use*, and therefore will trigger all it's
-# C support machinery.  Also note that it means that autoscan, seeing
-# CC etc. in the Makefile, will ask for an AC_PROG_CC use...
-
-# AM_DEPENDENCIES(NAME)
-# ---------------------
-# See how the compiler implements dependency checking.
-# NAME is "CC", "CXX" or "OBJC".
-# We try a few techniques and use that to set a single cache variable.
-AC_DEFUN([AM_DEPENDENCIES],
-[AC_REQUIRE([AM_SET_DEPDIR])dnl
-AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl
-ifelse([$1], CC,
-       [AC_REQUIRE([AC_PROG_][CC])dnl
-AC_REQUIRE([AC_PROG_][CPP])
-depcc="$CC"
-depcpp="$CPP"],
-       [$1], CXX, [AC_REQUIRE([AC_PROG_][CXX])dnl
-AC_REQUIRE([AC_PROG_][CXXCPP])
-depcc="$CXX"
-depcpp="$CXXCPP"],
-       [$1], OBJC, [am_cv_OBJC_dependencies_compiler_type=gcc],
-       [AC_REQUIRE([AC_PROG_][$1])dnl
-depcc="$$1"
-depcpp=""])
-
-AC_REQUIRE([AM_MAKE_INCLUDE])
-
-AC_CACHE_CHECK([dependency style of $depcc],
-               [am_cv_$1_dependencies_compiler_type],
-[if test -z "$AMDEP"; then
-  # We make a subdir and do the tests there.  Otherwise we can end up
-  # making bogus files that we don't know about and never remove.  For
-  # instance it was reported that on HP-UX the gcc test will end up
-  # making a dummy file named `D' -- because `-MD' means `put the output
-  # in D'.
-  mkdir confdir
-  # Copy depcomp to subdir because otherwise we won't find it if we're
-  # using a relative directory.
-  cp "$am_depcomp" confdir
-  cd confdir
-
-  am_cv_$1_dependencies_compiler_type=none
-  for depmode in `sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < "./depcomp"`; do
-    # We need to recreate these files for each test, as the compiler may
-    # overwrite some of them when testing with obscure command lines.
-    # This happens at least with the AIX C compiler.
-    echo '#include "conftest.h"' > conftest.c
-    echo 'int i;' > conftest.h
-
-    case "$depmode" in
-    nosideeffect)
-      # after this tag, mechanisms are not by side-effect, so they'll
-      # only be used when explicitly requested
-      if test "x$enable_dependency_tracking" = xyes; then
-       continue
-      else
-       break
-      fi
-      ;;
-    none) break ;;
-    esac
-    # We check with `-c' and `-o' for the sake of the "dashmstdout"
-    # mode.  It turns out that the SunPro C++ compiler does not properly
-    # handle `-M -o', and we need to detect this.
-    if depmode="$depmode" \
-       source=conftest.c object=conftest.o \
-       depfile=conftest.Po tmpdepfile=conftest.TPo \
-       $SHELL ./depcomp $depcc -c conftest.c -o conftest.o >/dev/null 2>&1 &&
-       grep conftest.h conftest.Po > /dev/null 2>&1; then
-      am_cv_$1_dependencies_compiler_type="$depmode"
-      break
-    fi
-  done
-
-  cd ..
-  rm -rf confdir
-else
-  am_cv_$1_dependencies_compiler_type=none
-fi
-])
-$1DEPMODE="depmode=$am_cv_$1_dependencies_compiler_type"
-AC_SUBST([$1DEPMODE])
-])
-
-
-# AM_SET_DEPDIR
-# -------------
-# Choose a directory name for dependency files.
-# This macro is AC_REQUIREd in AM_DEPENDENCIES
-AC_DEFUN([AM_SET_DEPDIR],
-[if test -d .deps || mkdir .deps 2> /dev/null || test -d .deps; then
-  DEPDIR=.deps
-  # We redirect because .deps might already exist and be populated.
-  # In this situation we don't want to see an error.
-  rmdir .deps > /dev/null 2>&1
-else
-  DEPDIR=_deps
-fi
-AC_SUBST(DEPDIR)
-])
-
-
-# AM_DEP_TRACK
-# ------------
-AC_DEFUN([AM_DEP_TRACK],
-[AC_ARG_ENABLE(dependency-tracking,
-[  --disable-dependency-tracking Speeds up one-time builds
-  --enable-dependency-tracking  Do not reject slow dependency extractors])
-if test "x$enable_dependency_tracking" = xno; then
-  AMDEP="#"
+dnl AM_MISSING_PROG(NAME, PROGRAM, DIRECTORY)
+dnl The program must properly implement --version.
+AC_DEFUN(AM_MISSING_PROG,
+[AC_MSG_CHECKING(for working $2)
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf.  Sigh.
+if ($2 --version) < /dev/null > /dev/null 2>&1; then
+   $1=$2
+   AC_MSG_RESULT(found)
  else
-  am_depcomp="$ac_aux_dir/depcomp"
-  if test ! -f "$am_depcomp"; then
-    AMDEP="#"
-  else
-    AMDEP=
-  fi
+   $1="$3/missing $2"
+   AC_MSG_RESULT(missing)
  fi
-AC_SUBST(AMDEP)
-if test -z "$AMDEP"; then
-  AMDEPBACKSLASH='\'
-else
-  AMDEPBACKSLASH=
-fi
-pushdef([subst], defn([AC_SUBST]))
-subst(AMDEPBACKSLASH)
-popdef([subst])
-])
-
-# Generate code to set up dependency tracking.
-# This macro should only be invoked once -- use via AC_REQUIRE.
-# Usage:
-# AM_OUTPUT_DEPENDENCY_COMMANDS
-
-#
-# This code is only required when automatic dependency tracking
-# is enabled.  FIXME.  This creates each `.P' file that we will
-# need in order to bootstrap the dependency handling code.
-AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],[
-AC_OUTPUT_COMMANDS([
-test x"$AMDEP" != x"" ||
-for mf in $CONFIG_FILES; do
-  case "$mf" in
-  Makefile) dirpart=.;;
-  */Makefile) dirpart=`echo "$mf" | sed -e 's|/[^/]*$||'`;;
-  *) continue;;
-  esac
-  grep '^DEP_FILES *= *[^ #]' < "$mf" > /dev/null || continue
-  # Extract the definition of DEP_FILES from the Makefile without
-  # running `make'.
-  DEPDIR=`sed -n -e '/^DEPDIR = / s///p' < "$mf"`
-  test -z "$DEPDIR" && continue
-  # When using ansi2knr, U may be empty or an underscore; expand it
-  U=`sed -n -e '/^U = / s///p' < "$mf"`
-  test -d "$dirpart/$DEPDIR" || mkdir "$dirpart/$DEPDIR"
-  # We invoke sed twice because it is the simplest approach to
-  # changing $(DEPDIR) to its actual value in the expansion.
-  for file in `sed -n -e '
-    /^DEP_FILES = .*\\\\$/ {
-      s/^DEP_FILES = //
-      :loop
-       s/\\\\$//
-       p
-       n
-       /\\\\$/ b loop
-      p
-    }
-    /^DEP_FILES = / s/^DEP_FILES = //p' < "$mf" | \
-       sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
-    # Make sure the directory exists.
-    test -f "$dirpart/$file" && continue
-    fdir=`echo "$file" | sed -e 's|/[^/]*$||'`
-    $ac_aux_dir/mkinstalldirs "$dirpart/$fdir" > /dev/null 2>&1
-    # echo "creating $dirpart/$file"
-    echo '# dummy' > "$dirpart/$file"
-  done
-done
-], [AMDEP="$AMDEP"
-ac_aux_dir="$ac_aux_dir"])])
-
-# AM_MAKE_INCLUDE()
-# -----------------
-# Check to see how make treats includes.
-AC_DEFUN([AM_MAKE_INCLUDE],
-[am_make=${MAKE-make}
-# BSD make uses .include
-cat > confinc << 'END'
-doit:
-       @echo done
-END
-# If we don't find an include directive, just comment out the code.
-AC_MSG_CHECKING([for style of include used by $am_make])
-_am_include='#'
-for am_inc in include .include; do
-   echo "$am_inc confinc" > confmf
-   if test "`$am_make -f confmf 2> /dev/null`" = "done"; then
-      _am_include=$am_inc
-      break
-   fi
-done
-AC_SUBST(_am_include)
-AC_MSG_RESULT($_am_include)
-rm -f confinc confmf
-])
+AC_SUBST($1)])
  
  # Like AC_CONFIG_HEADER, but automatically create stamp file.
  
-# serial 3
-
-# When config.status generates a header, we must update the stamp-h file.
-# This file resides in the same directory as the config header
-# that is generated.  We must strip everything past the first ":",
-# and everything past the last "/".
-
-AC_PREREQ([2.12])
-
-AC_DEFUN([AM_CONFIG_HEADER],
-[AC_CONFIG_HEADER([$1])
-  AC_OUTPUT_COMMANDS(
-   ifelse(patsubst([$1], [[^ ]], []),
-         [],
-         [test -z "$CONFIG_HEADERS" || echo timestamp >dnl
-          patsubst([$1], [^\([^:]*/\)?.*], [\1])stamp-h]),
-  [am_indx=1
-  for am_file in $1; do
-    case " $CONFIG_HEADERS " in
-    *" $am_file "*)
-      echo timestamp > `echo $am_file | sed 's%:.*%%;s%[^/]*$%%'`stamp-h$am_indx
-      ;;
-    esac
-    am_indx=\`expr \$am_indx + 1\`
-  done])
-])
-
-# serial 2
-
-# AM_CONDITIONAL(NAME, SHELL-CONDITION)
-# -------------------------------------
-# Define a conditional.
-AC_DEFUN([AM_CONDITIONAL],
-[AC_SUBST([$1_TRUE])
-AC_SUBST([$1_FALSE])
-if $2; then
-  $1_TRUE=
-  $1_FALSE='#'
-else
-  $1_TRUE='#'
-  $1_FALSE=
-fi])
+# AM_CONFIG_HEADER(HEADERS)
+#   Passes HEADERS on to AC_CONFIG_HEADER and registers output commands
+#   that write a timestamp into a stamp-h file next to each generated
+#   header.
+AC_DEFUN(AM_CONFIG_HEADER,
+[AC_PREREQ([2.12])
+AC_CONFIG_HEADER([$1])
+dnl When config.status generates a header, we must update the stamp-h file.
+dnl This file resides in the same directory as the config header
+dnl that is generated.  We must strip everything past the first ":",
+dnl and everything past the last "/".
+dnl An argument without any space is treated as a single header and gets
+dnl a plain stamp-h.  Otherwise the headers are walked in list order with
+dnl a counter starting at 1, and each one that config.status is
+dnl regenerating gets a stamp file numbered with its position.
+AC_OUTPUT_COMMANDS(changequote(<<,>>)dnl
+ifelse(patsubst(<<$1>>, <<[^ ]>>, <<>>), <<>>,
+<<test -z "<<$>>CONFIG_HEADERS" || echo timestamp > patsubst(<<$1>>, <<^\([^:]*/\)?.*>>, <<\1>>)stamp-h<<>>dnl>>,
+<<am_indx=1
+for am_file in <<$1>>; do
+  case " <<$>>CONFIG_HEADERS " in
+  *" <<$>>am_file "*<<)>>
+    echo timestamp > `echo <<$>>am_file | sed -e 's%:.*%%' -e 's%[^/]*$%%'`stamp-h$am_indx
+    ;;
+  esac
+  am_indx=`expr "<<$>>am_indx" + 1`
+done<<>>dnl>>)
+changequote([,]))])
  
diff --git a/config/config.guess b/config/config.guess

index 54936b2edf88d980dd07516c86b1d33c01e47344..41bd1a3708394951a6071b48e5d0b9f5e3461cef 100755 (executable)
--- a/config/config.guess
+++ b/config/config.guess
@@ -208,7 +208,7 @@ main:
         jsr \$26,exit
         .end main
  EOF
-       $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null
+       $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null >/dev/null
         if test "$?" = 0 ; then
                 case `./$dummy` in
                         0-0)
@@ -592,7 +592,7 @@ EOF
                    exit (0);
                }
  EOF
-       (CCOPTS= $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null ) && HP_ARCH=`./$dummy`
+       (CCOPTS= $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null > /dev/null) && HP_ARCH=`./$dummy`
         if test -z "$HP_ARCH"; then HP_ARCH=hppa; fi
         rm -f $dummy.c $dummy
         fi ;;
@@ -831,7 +831,7 @@ main(argc, argv)
  }
  EOF
                 LIBC=""
-               $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null
+               $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null >/dev/null
                 if test "$?" = 0 ; then
                         ./$dummy | grep 1\.99 > /dev/null
                         if test "$?" = 0 ; then
@@ -875,7 +875,7 @@ EOF
                         .end main
  EOF
                 LIBC=""
-               $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null
+               $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null >/dev/null
                 if test "$?" = 0 ; then
                         case `./$dummy` in
                         0-0)
@@ -923,7 +923,7 @@ EOF
    return 0;
  }
  EOF
-         $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
+         $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null >/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
           rm -f $dummy.c $dummy
         elif test "${UNAME_MACHINE}" = "s390"; then
           echo s390-ibm-linux && exit 0
@@ -985,7 +985,7 @@ EOF
    return 0;
  }
  EOF
-         $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
+         $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null >/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
           rm -f $dummy.c $dummy
           test x"${TENTATIVE}" != x && echo "${TENTATIVE}" && exit 0
         fi ;;
diff --git a/config/ltcf-c.sh b/config/ltcf-c.sh

new file mode 100644 (file)

index 0000000..d9bbae9
--- /dev/null
+++ b/config/ltcf-c.sh
@@ -0,0 +1,798 @@
+#### This script is meant to be sourced by ltconfig.
+
+# ltcf-c.sh - Create a C compiler specific configuration
+#
+# Copyright (C) 1996-2000, 2001 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+
+# Source file extension for C test sources.
+ac_ext=c
+
+# Object file extension for compiled C test sources.
+objext=o
+
+# Code to be used in simple compile tests
+lt_simple_compile_test_code="int some_variable = 0;"
+
+# Code to be used in simple link tests
+lt_simple_link_test_code='main(){return(0);}'
+
+## Linker Characteristics
+case $host_os in
+cygwin* | mingw*)
+  # FIXME: the MSVC++ port hasn't been tested in a loooong time
+  # When not using gcc, we currently assume that we are using
+  # Microsoft Visual C++.
+  if test "$with_gcc" != yes; then
+    with_gnu_ld=no
+  fi
+  ;;
+
+esac
+
+ld_shlibs=yes
+if test "$with_gnu_ld" = yes; then
+  # If archive_cmds runs LD, not CC, wlarc should be empty
+  wlarc='${wl}'
+
+  # See if GNU ld supports shared libraries.
+  case $host_os in
+  aix3* | aix4* | aix5*)
+    # On AIX, the GNU linker is very broken
+    ld_shlibs=no
+    cat <<EOF 1>&2
+
+*** Warning: the GNU linker, at least up to release 2.9.1, is reported
+*** to be unable to reliably create shared libraries on AIX.
+*** Therefore, libtool is disabling shared libraries support.  If you
+*** really care for shared libraries, you may want to modify your PATH
+*** so that a non-GNU linker is found, and then restart.
+
+EOF
+    ;;
+
+  amigaos*)
+    archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+
+    # Samuel A. Falvo II <kc5tja@dolphin.openprojects.net> reports
+    # that the semantics of dynamic libraries on AmigaOS, at least up
+    # to version 4, is to share data among multiple programs linked
+    # with the same dynamic library.  Since this doesn't match the
+    # behavior of shared libraries on other platforms, we cannot use
+    # them.
+    ld_shlibs=no
+    ;;
+
+  beos*)
+    if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      allow_undefined_flag=unsupported
+      # Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+      # support --undefined.  This deserves some investigation.  FIXME
+      archive_cmds='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+
+  cygwin* | mingw*)
+    # hardcode_libdir_flag_spec is actually meaningless, as there is
+    # no search path for DLLs.
+    hardcode_libdir_flag_spec='-L$libdir'
+    allow_undefined_flag=unsupported
+    always_export_symbols=yes
+
+    extract_expsyms_cmds='test -f $output_objdir/impgen.c || \
+      sed -e "/^# \/\* impgen\.c starts here \*\//,/^# \/\* impgen.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/impgen.c~
+      test -f $output_objdir/impgen.exe || (cd $output_objdir && \
+      if test "x$HOST_CC" != "x" ; then $HOST_CC -o impgen impgen.c ; \
+      else $CC -o impgen impgen.c ; fi)~
+      $output_objdir/impgen $dir/$soroot > $output_objdir/$soname-def'
+
+    old_archive_from_expsyms_cmds='$DLLTOOL --as=$AS --dllname $soname --def $output_objdir/$soname-def --output-lib $output_objdir/$newlib'
+
+    # cygwin and mingw dlls have different entry points and sets of symbols
+    # to exclude.
+    # FIXME: what about values for MSVC?
+    dll_entry=__cygwin_dll_entry@12
+    dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12~
+    case $host_os in
+    mingw*)
+      # mingw values
+      dll_entry=_DllMainCRTStartup@12
+      dll_exclude_symbols=DllMain@12,DllMainCRTStartup@12,DllEntryPoint@12~
+      ;;
+    esac
+
+    # mingw and cygwin differ, and it's simplest to just exclude the union
+    # of the two symbol sets.
+    dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12,DllMainCRTStartup@12,DllEntryPoint@12
+
+    # recent cygwin and mingw systems supply a stub DllMain which the user
+    # can override, but on older systems we have to supply one (in ltdll.c)
+    if test "x$lt_cv_need_dllmain" = "xyes"; then
+      ltdll_obj='$output_objdir/$soname-ltdll.'"$objext "
+      ltdll_cmds='test -f $output_objdir/$soname-ltdll.c || sed -e "/^# \/\* ltdll\.c starts here \*\//,/^# \/\* ltdll.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/$soname-ltdll.c~
+       test -f $output_objdir/$soname-ltdll.$objext || (cd $output_objdir && $CC -c $soname-ltdll.c)~'
+    else
+      ltdll_obj=
+      ltdll_cmds=
+    fi
+
+    # Extract the symbol export list from an `--export-all' def file,
+    # then regenerate the def file from the symbol export list, so that
+    # the compiled dll only exports the symbol export list.
+    # Be careful not to strip the DATA tag left by newer dlltools.
+    export_symbols_cmds="$ltdll_cmds"'
+      $DLLTOOL --export-all --exclude-symbols '$dll_exclude_symbols' --output-def $output_objdir/$soname-def '$ltdll_obj'$libobjs $convenience~
+      sed -e "1,/EXPORTS/d" -e "s/ @ [0-9]*//" -e "s/ *;.*$//" < $output_objdir/$soname-def > $export_symbols'
+
+    # If the export-symbols file already is a .def file (1st line
+    # is EXPORTS), use it as is.
+    # If DATA tags from a recent dlltool are present, honour them!
+    archive_expsym_cmds='if test "x`head -1 $export_symbols`" = xEXPORTS; then
+        cp $export_symbols $output_objdir/$soname-def;
+      else
+        echo EXPORTS > $output_objdir/$soname-def;
+        _lt_hint=1;
+        cat $export_symbols | while read symbol; do
+         set dummy \$symbol;
+         case \[$]# in
+           2) echo "   \[$]2 @ \$_lt_hint ; " >> $output_objdir/$soname-def;;
+           *) echo "     \[$]2 @ \$_lt_hint \[$]3 ; " >> $output_objdir/$soname-def;;
+         esac;
+         _lt_hint=`expr 1 + \$_lt_hint`;
+        done;
+      fi~
+      '"$ltdll_cmds"'
+      $CC -Wl,--base-file,$output_objdir/$soname-base '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+      $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~
+      $CC -Wl,--base-file,$output_objdir/$soname-base $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+      $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp --output-lib $output_objdir/$libname.dll.a~
+      $CC $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags'
+    ;;
+
+  darwin* | rhapsody*)
+    allow_undefined_flag='-undefined suppress'
+    archive_cmds='$CC `test .$module = .yes && echo -bundle || echo -dynamiclib` $allow_undefined_flag -o $lib $libobjs $deplibs $linkopts -install_name $rpath/$soname `test -n "$verstring" -a x$verstring != x0.0 && echo $verstring`'
+    # We need to add '_' to the symbols in $export_symbols first
+    #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    whole_archive_flag_spec='-all_load $convenience'
+    ;;
+
+  netbsd*)
+    if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+      archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
+      wlarc=
+    else
+      archive_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    fi
+    ;;
+
+  solaris* | sysv5*)
+    if $LD -v 2>&1 | egrep 'BFD 2\.8' > /dev/null; then
+      ld_shlibs=no
+      cat <<EOF 1>&2
+
+*** Warning: The releases 2.8.* of the GNU linker cannot reliably
+*** create shared libraries on Solaris systems.  Therefore, libtool
+*** is disabling shared libraries support.  We urge you to upgrade GNU
+*** binutils to release 2.9.1 or newer.  Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+EOF
+    elif $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+
+  sunos4*)
+    archive_cmds='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+    wlarc=
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  *)
+    if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+  esac
+
+  if test "$ld_shlibs" = yes; then
+    runpath_var=LD_RUN_PATH
+    hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir'
+    export_dynamic_flag_spec='${wl}--export-dynamic'
+    case $host_os in
+    cygwin* | mingw*)
+      # dlltool doesn't understand --whole-archive et al.
+      whole_archive_flag_spec=
+      ;;
+    *)
+      # ancient GNU ld didn't support --whole-archive et al.
+      if $LD --help 2>&1 | egrep 'no-whole-archive' > /dev/null; then
+       whole_archive_flag_spec="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+      else
+       whole_archive_flag_spec=
+      fi
+      ;;
+    esac
+  fi
+else
+  # PORTME fill in a description of your system's linker (not GNU ld)
+  case $host_os in
+  aix3*)
+    allow_undefined_flag=unsupported
+    always_export_symbols=yes
+    archive_expsym_cmds='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR $AR_FLAGS $lib $output_objdir/$soname'
+    # Note: this linker hardcodes the directories in LIBPATH if there
+    # are no directories specified by -L.
+    hardcode_minus_L=yes
+    if test "$with_gcc" = yes && test -z "$link_static_flag"; then
+      # Neither direct hardcoding nor static linking is supported with a
+      # broken collect2.
+      hardcode_direct=unsupported
+    fi
+    ;;
+
+  aix4* | aix5*)
+    hardcode_direct=yes
+    hardcode_libdir_separator=':'
+    link_all_deplibs=yes
+    # When large executables or shared objects are built, AIX ld can
+    # have problems creating the table of contents.  If linking a library
+    # or program results in "error TOC overflow" add -mminimal-toc to
+    # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
+    # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
+    if test "$with_gcc" = yes; then
+      case $host_os in aix4.[012]|aix4.[012].*)
+      # We only want to do this on AIX 4.2 and lower, the check
+      # below for broken collect2 doesn't work under 4.3+
+        collect2name=`${CC} -print-prog-name=collect2`
+        if test -f "$collect2name" && \
+          strings "$collect2name" | grep resolve_lib_name >/dev/null
+        then
+         # We have reworked collect2
+         hardcode_direct=yes
+        else
+         # We have old collect2
+         hardcode_direct=unsupported
+         # It fails to find uninstalled libraries when the uninstalled
+         # path is not listed in the libpath.  Setting hardcode_minus_L
+         # to unsupported forces relinking
+         hardcode_minus_L=yes
+         hardcode_libdir_flag_spec='-L$libdir'
+         hardcode_libdir_separator=
+        fi
+      esac
+      shared_flag='-shared'
+    else
+      # not using gcc
+      if test "$host_cpu" = ia64; then
+        shared_flag='${wl}-G'
+      else
+        shared_flag='${wl}-bM:SRE'
+      fi
+    fi
+
+    if test "$host_cpu" = ia64; then
+      # On IA64, the linker does run time linking by default, so we don't
+      # have to do anything special.
+      aix_use_runtimelinking=no
+      exp_sym_flag='-Bexport'
+      no_entry_flag=""
+    else
+      # Test if we are trying to use run time linking, or normal AIX style linking.
+      # If -brtl is somewhere in LDFLAGS, we need to do run time linking.
+      aix_use_runtimelinking=no
+      for ld_flag in $LDFLAGS; do
+        if (test $ld_flag = "-brtl" || test $ld_flag = "-Wl,-brtl" ); then
+          aix_use_runtimelinking=yes
+          break
+        fi
+      done
+      exp_sym_flag='-bexport'
+      no_entry_flag='-bnoentry'
+    fi
+    # -bexpall does not export symbols beginning with underscore (_)
+    always_export_symbols=yes
+    if test "$aix_use_runtimelinking" = yes; then
+      # Warning - without using the other run time loading flags (-brtl), -berok will
+      #           link without error, but may produce a broken library.
+      allow_undefined_flag=' ${wl}-berok'
+      hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:/usr/lib:/lib'
+      archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"
+    else
+      if test "$host_cpu" = ia64; then
+        hardcode_libdir_flag_spec='${wl}-R $libdir:/usr/lib:/lib'
+        allow_undefined_flag="-z nodefs"
+        archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"
+      else
+        allow_undefined_flag=' ${wl}-berok'
+        # -bexpall does not export symbols beginning with underscore (_)
+        always_export_symbols=yes
+        # Exported symbols can be pulled into shared objects from archives
+        whole_archive_flag_spec=' '
+        build_libtool_need_lc=yes
+        hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:/usr/lib:/lib'
+        # This is similar to how AIX traditionally builds its shared libraries.
+        archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}-bE:$export_symbols ${wl}-bnoentry${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
+      fi
+    fi
+    ;;
+
+  amigaos*)
+    archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+    # see comment about different semantics on the GNU ld section
+    ld_shlibs=no
+    ;;
+
+  cygwin* | mingw*)
+    # When not using gcc, we currently assume that we are using
+    # Microsoft Visual C++.
+    # hardcode_libdir_flag_spec is actually meaningless, as there is
+    # no search path for DLLs.
+    hardcode_libdir_flag_spec=' '
+    allow_undefined_flag=unsupported
+    # Tell ltmain to make .lib files, not .a files.
+    libext=lib
+    # FIXME: Setting linknames here is a bad hack.
+    archive_cmds='$CC -o $lib $libobjs $compiler_flags `echo "$deplibs" | sed -e '\''s/ -lc$//'\''` -link -dll~linknames='
+    # The linker will automatically build a .lib file if we build a DLL.
+    old_archive_from_new_cmds='true'
+    # FIXME: Should let the user specify the lib program.
+    old_archive_cmds='lib /OUT:$oldlib$oldobjs$old_deplibs'
+    fix_srcfile_path='`cygpath -w "$srcfile"`'
+    ;;
+
+  freebsd1*)
+    ld_shlibs=no
+    ;;
+
+  # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
+  # support.  Future versions do this automatically, but an explicit c++rt0.o
+  # does not break anything, and helps significantly (at the cost of a little
+  # extra space).
+  freebsd2.2*)
+    archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  # Unfortunately, older versions of FreeBSD 2 do not have this feature.
+  freebsd2*)
+    archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_direct=yes
+    hardcode_minus_L=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
+  freebsd*)
+    archive_cmds='$CC -shared -o $lib $libobjs $deplibs $compiler_flags'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  hpux9* | hpux10* | hpux11*)
+    case $host_os in
+    hpux9*) archive_cmds='$rm $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib' ;;
+    *) archive_cmds='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags' ;;
+    esac
+    hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
+    hardcode_libdir_separator=:
+    hardcode_direct=yes
+    hardcode_minus_L=yes # Not in the search PATH, but as the default
+                        # location of the library.
+    export_dynamic_flag_spec='${wl}-E'
+    ;;
+
+  irix5* | irix6*)
+    if test "$with_gcc" = yes; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+    else
+      archive_cmds='$LD -shared $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+    fi
+    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    hardcode_libdir_separator=:
+    link_all_deplibs=yes
+    ;;
+
+  netbsd*)
+    if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+      archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'  # a.out
+    else
+      archive_cmds='$LD -shared -o $lib $libobjs $deplibs $linker_flags'      # ELF
+    fi
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  newsos6)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linkopts'
+    hardcode_direct=yes
+    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    hardcode_libdir_separator=:
+    hardcode_shlibpath_var=no
+    ;;
+
+  openbsd*)
+    archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  os2*)
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+    allow_undefined_flag=unsupported
+    archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$echo "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~$echo DATA >> $output_objdir/$libname.def~$echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~$echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
+    old_archive_from_new_cmds='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+    ;;
+
+  osf3*)
+    if test "$with_gcc" = yes; then
+      allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+      archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+    else
+      allow_undefined_flag=' -expect_unresolved \*'
+      archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+    fi
+    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    hardcode_libdir_separator=:
+    ;;
+
+  osf4* | osf5*)       # as osf3* with the addition of -msym flag
+    if test "$with_gcc" = yes; then
+      allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+      archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+      hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    else
+      allow_undefined_flag=' -expect_unresolved \*'
+      archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -msym -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+      archive_expsym_cmds='for i in `cat $export_symbols`; do printf "-exported_symbol " >> $lib.exp; echo "\$i" >> $lib.exp; done; echo "-hidden">> $lib.exp~
+      $LD -shared${allow_undefined_flag} -input $lib.exp $linker_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib~$rm $lib.exp'
+
+      # cc supports -rpath directly
+      hardcode_libdir_flag_spec='-rpath $libdir'
+    fi
+    hardcode_libdir_separator=:
+    ;;
+
+  sco3.2v5*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_shlibpath_var=no
+    runpath_var=LD_RUN_PATH
+    hardcode_runpath_var=yes
+    ;;
+
+  solaris*)
+    no_undefined_flag=' -z defs'
+    # $CC -shared without GNU ld will not create a library from C++
+    # object files and a static libstdc++, better avoid it by now
+    archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+               $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_shlibpath_var=no
+    case $host_os in
+    solaris2.[0-5] | solaris2.[0-5].*) ;;
+    *) # Supported since Solaris 2.6 (maybe 2.5.1?)
+      whole_archive_flag_spec='-z allextract$convenience -z defaultextract' ;;
+    esac
+    link_all_deplibs=yes
+    ;;
+
+  sunos4*)
+    archive_cmds='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_direct=yes
+    hardcode_minus_L=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  sysv4)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    runpath_var='LD_RUN_PATH'
+    hardcode_shlibpath_var=no
+    hardcode_direct=no #Motorola manual says yes, but my tests say they lie
+    ;;
+
+  sysv4.3*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_shlibpath_var=no
+    export_dynamic_flag_spec='-Bexport'
+    ;;
+
+  sysv5*)
+    no_undefined_flag=' -z text'
+    # $CC -shared without GNU ld will not create a library from C++
+    # object files and a static libstdc++, better avoid it by now
+    archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+               $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+    hardcode_libdir_flag_spec=
+    hardcode_shlibpath_var=no
+    runpath_var='LD_RUN_PATH'
+    ;;
+
+  uts4*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_shlibpath_var=no
+    ;;
+
+  dgux*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_shlibpath_var=no
+    ;;
+
+  sysv4*MP*)
+    if test -d /usr/nec; then
+      archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_shlibpath_var=no
+      runpath_var=LD_RUN_PATH
+      hardcode_runpath_var=yes
+      ld_shlibs=yes
+    fi
+    ;;
+
+  sysv4.2uw2*)
+    archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_direct=yes
+    hardcode_minus_L=no
+    hardcode_shlibpath_var=no
+    hardcode_runpath_var=yes
+    runpath_var=LD_RUN_PATH
+    ;;
+
+  sysv5uw7* | unixware7*)
+    no_undefined_flag='${wl}-z ${wl}text'
+    if test "$GCC" = yes; then
+      archive_cmds='$CC -shared ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+    else
+      archive_cmds='$CC -G ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+    fi
+    runpath_var='LD_RUN_PATH'
+    hardcode_shlibpath_var=no
+    ;;
+
+  *)
+    ld_shlibs=no
+    ;;
+  esac
+fi
+
+## Compiler Characteristics: PIC flags, static flags, etc
+if test "X${ac_cv_prog_cc_pic+set}" = Xset; then
+  :
+else
+  ac_cv_prog_cc_pic=
+  ac_cv_prog_cc_shlib=
+  ac_cv_prog_cc_wl=
+  ac_cv_prog_cc_static=
+  ac_cv_prog_cc_no_builtin=
+  ac_cv_prog_cc_can_build_shared=$can_build_shared
+
+  if test "$with_gcc" = yes; then
+    ac_cv_prog_cc_wl='-Wl,'
+    ac_cv_prog_cc_static='-static'
+
+    case $host_os in
+    aix*)
+      # All AIX code is PIC.
+      if test "$host_cpu" = ia64; then
+        # AIX 5 now supports IA64 processor
+        lt_cv_prog_cc_static='-Bstatic'
+      else
+        lt_cv_prog_cc_static='-bnso -bI:/lib/syscalls.exp'
+      fi
+      ;;
+    amigaos*)
+      # FIXME: we need at least 68020 code to build shared libraries, but
+      # adding the `-m68020' flag to GCC prevents building anything better,
+      # like `-m68040'.
+      ac_cv_prog_cc_pic='-m68020 -resident32 -malways-restore-a4'
+      ;;
+    beos* | irix5* | irix6* | osf3* | osf4* | osf5*)
+      # PIC is the default for these OSes.
+      ;;
+    cygwin* | mingw* | os2*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      ac_cv_prog_cc_pic='-DDLL_EXPORT'
+      ;;
+    darwin* | rhapsody*)
+      # PIC is the default on this platform
+      # Common symbols not allowed in MH_DYLIB files
+      lt_cv_prog_cc_pic='-fno-common'
+      ;;
+    *djgpp*)
+      # DJGPP does not support shared libraries at all
+      ac_cv_prog_cc_pic=
+      ;;
+    sysv4*MP*)
+      if test -d /usr/nec; then
+        ac_cv_prog_cc_pic=-Kconform_pic
+      fi
+      ;;
+    *)
+      ac_cv_prog_cc_pic='-fPIC'
+      ;;
+    esac
+  else
+    # PORTME Check for PIC flags for the system compiler.
+    case $host_os in
+    aix*)
+     # All AIX code is PIC.
+      ac_cv_prog_cc_static="$ac_cv_prog_cc_static ${ac_cv_prog_cc_wl}-lC"
+      ;;
+
+    hpux9* | hpux10* | hpux11*)
+      # Is there a better ac_cv_prog_cc_static that works with the bundled CC?
+      ac_cv_prog_cc_wl='-Wl,'
+      ac_cv_prog_cc_static="${ac_cv_prog_cc_wl}-a ${ac_cv_prog_cc_wl}archive"
+      ac_cv_prog_cc_pic='+Z'
+      ;;
+
+    irix5* | irix6*)
+      ac_cv_prog_cc_wl='-Wl,'
+      ac_cv_prog_cc_static='-non_shared'
+      # PIC (with -KPIC) is the default.
+      ;;
+
+    cygwin* | mingw* | os2*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      ac_cv_prog_cc_pic='-DDLL_EXPORT'
+      ;;
+
+    newsos6)
+      ac_cv_prog_cc_pic='-KPIC'
+      ac_cv_prog_cc_static='-Bstatic'
+      ;;
+
+    osf3* | osf4* | osf5*)
+      # All OSF/1 code is PIC.
+      ac_cv_prog_cc_wl='-Wl,'
+      ac_cv_prog_cc_static='-non_shared'
+      ;;
+
+    sco3.2v5*)
+      ac_cv_prog_cc_pic='-Kpic'
+      ac_cv_prog_cc_static='-dn'
+      ac_cv_prog_cc_shlib='-belf'
+      ;;
+
+    solaris*)
+      ac_cv_prog_cc_pic='-KPIC'
+      ac_cv_prog_cc_static='-Bstatic'
+      ac_cv_prog_cc_wl='-Wl,'
+      ;;
+
+    sunos4*)
+      ac_cv_prog_cc_pic='-PIC'
+      ac_cv_prog_cc_static='-Bstatic'
+      ac_cv_prog_cc_wl='-Qoption ld '
+      ;;
+
+    sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+      ac_cv_prog_cc_pic='-KPIC'
+      ac_cv_prog_cc_static='-Bstatic'
+      ac_cv_prog_cc_wl='-Wl,'
+      ;;
+
+    uts4*)
+      ac_cv_prog_cc_pic='-pic'
+      ac_cv_prog_cc_static='-Bstatic'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec ;then
+       ac_cv_prog_cc_pic='-Kconform_pic'
+       ac_cv_prog_cc_static='-Bstatic'
+      fi
+      ;;
+
+    *)
+      ac_cv_prog_cc_can_build_shared=no
+      ;;
+    esac
+  fi
+  case "$host_os" in
+      # Platforms which do not support PIC and -DPIC is meaningless
+      # on them:
+      *djgpp*)
+        ac_cv_prog_cc_pic=
+        ;;
+      *)
+        ac_cv_prog_cc_pic="$ac_cv_prog_cc_pic -DPIC"
+        ;;
+  esac
+fi
+
+need_lc=yes
+if test "$enable_shared" = yes && test "$with_gcc" = yes; then
+  case $archive_cmds in
+  *'~'*)
+    # FIXME: we may have to deal with multi-command sequences.
+    ;;
+  '$CC '*)
+    # Test whether the compiler implicitly links with -lc since on some
+    # systems, -lgcc has to come before -lc. If gcc already passes -lc
+    # to ld, don't add -lc before -lgcc.
+    echo $ac_n "checking whether -lc should be explicitly linked in... $ac_c" 1>&6
+    if eval "test \"`echo '$''{'ac_cv_archive_cmds_needs_lc'+set}'`\" = set"; then
+      echo $ac_n "(cached) $ac_c" 1>&6
+      need_lc=$ac_cv_archive_cmds_needs_lc
+    else
+      $rm conftest*
+      echo "static int dummy;" > conftest.$ac_ext
+      if { (eval echo ltcf-c.sh:need_lc: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; }; then
+       # Append any warnings to the config.log.
+       cat conftest.err 1>&5
+       soname=conftest
+       lib=conftest
+       libobjs=conftest.$ac_objext
+       deplibs=
+       wl=$ac_cv_prog_cc_wl
+       compiler_flags=-v
+       linker_flags=-v
+       verstring=
+       output_objdir=.
+       libname=conftest
+       save_allow_undefined_flag=$allow_undefined_flag
+       allow_undefined_flag=
+       if { (eval echo ltcf-c.sh:need_lc: \"$archive_cmds\") 1>&5; (eval $archive_cmds) 2>&1 | grep " -lc " 1>&5 ; }; then
+         need_lc=no
+       fi
+       allow_undefined_flag=$save_allow_undefined_flag
+      else
+       cat conftest.err 1>&5
+      fi
+    fi
+    $rm conftest*
+    echo "$ac_t$need_lc" 1>&6
+    ;;
+  esac
+fi
+ac_cv_archive_cmds_needs_lc=$need_lc
diff --git a/config/ltcf-f77.sh b/config/ltcf-f77.sh

new file mode 100644 (file)

index 0000000..42d3077
--- /dev/null
+++ b/config/ltcf-f77.sh
@@ -0,0 +1,816 @@
+#### This script is meant to be sourced by ltconfig.
+
+# ltcf-f77.sh - Create a fortran compiler specific configuration
+#
+# Copyright (C) 1996-1999, 2000, 2001 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# NB: This is somewhat of a hack to support fortran for the stuff
+# we need, don't trust it as a complete implementation!
+#
+# Original C++ support by:Gary V. Vaughan <gvv@techie.com>
+#    Alexandre Oliva <oliva@lsd.ic.unicamp.br>
+#    Ossama Othman <ossama@debian.org>
+#    Thomas Thanner <tanner@gmx.de>
+#
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+
+# Source file extension for fortran test sources.
+ac_ext=f
+# Source extension for f77 files to be preprocessed
+ac_pre_ext=F
+
+# Object file extension for compiled Fortran test sources.
+objext=o
+
+# Code to be used in simple compile tests
+# (Each statement is indented six spaces.  The \n sequences are stored
+# literally in the variable, so whatever writes the test file has to
+# expand them.)
+lt_simple_compile_test_code="      subroutine t\n      return\n      end\n"
+
+# Code to be used in simple link tests
+lt_simple_link_test_code="      program t\n      end\n"
+
+# fortran compiler
+# (Taken from $F77 if that is set, otherwise defaults to "f77".)
+F77=${F77-f77}
+
+# ltmain only uses $CC for tagged configurations so we simply trust CC is set....
+
+## Linker Characteristics
+# On cygwin/mingw without gcc, force the non-GNU-ld code path below.
+case $host_os in
+cygwin* | mingw*)
+  # FIXME: the MSVC++ port hasn't been tested in a loooong time
+  # When not using gcc, we currently assume that we are using
+  # Microsoft Visual C++.
+  if test "$with_gcc" != yes; then
+    with_gnu_ld=no
+  fi
+  ;;
+esac
+
+ld_shlibs=yes
+if test "$with_gnu_ld" = yes; then
+  # If archive_cmds runs LD, not CC, wlarc should be empty
+  wlarc='${wl}'
+
+  # See if GNU ld supports shared libraries.
+  case $host_os in
+  aix3* | aix4* | aix5*)
+    # On AIX, the GNU linker is very broken
+    ld_shlibs=no
+    cat <<EOF 1>&2
+
+*** Warning: the GNU linker, at least up to release 2.9.1, is reported
+*** to be unable to reliably create shared libraries on AIX.
+*** Therefore, libtool is disabling shared libraries support.  If you
+*** really care for shared libraries, you may want to modify your PATH
+*** so that a non-GNU linker is found, and then restart.
+
+EOF
+    ;;
+
+  amigaos*)
+    archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+
+    # Samuel A. Falvo II <kc5tja@dolphin.openprojects.net> reports
+    # that the semantics of dynamic libraries on AmigaOS, at least up
+    # to version 4, is to share data among multiple programs linked
+    # with the same dynamic library.  Since this doesn't match the
+    # behavior of shared libraries on other platforms, we cannot use
+    # them, so shared library support is switched off here.
+    ld_shlibs=no
+    ;;
+
+  beos*)
+    if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      allow_undefined_flag=unsupported
+      # Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+      # support --undefined.  This deserves some investigation.  FIXME
+      archive_cmds='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+
+  cygwin* | mingw*)
+    # DLL building with GNU ld: the command templates below are stored as
+    # strings ('~' separates the individual commands) and are not run here.
+    # hardcode_libdir_flag_spec is actually meaningless, as there is
+    # no search path for DLLs.
+    hardcode_libdir_flag_spec='-L$libdir'
+    allow_undefined_flag=unsupported
+    always_export_symbols=yes
+
+    extract_expsyms_cmds='test -f $output_objdir/impgen.c || \
+      sed -e "/^# \/\* impgen\.c starts here \*\//,/^# \/\* impgen.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/impgen.c~
+      test -f $output_objdir/impgen.exe || (cd $output_objdir && \
+      if test "x$HOST_CC" != "x" ; then $HOST_CC -o impgen impgen.c ; \
+      else $CC -o impgen impgen.c ; fi)~
+      $output_objdir/impgen $dir/$soroot > $output_objdir/$soname-def'
+
+    old_archive_from_expsyms_cmds='$DLLTOOL --as=$AS --dllname $soname --def $output_objdir/$soname-def --output-lib $output_objdir/$newlib'
+
+    # cygwin and mingw dlls have different entry points and sets of symbols
+    # to exclude.
+    # FIXME: what about values for MSVC?
+    # NOTE: only dll_entry from this per-platform selection is used further
+    # down.  The dll_exclude_symbols values assigned here (both end in a
+    # trailing '~') are replaced by the unconditional assignment that
+    # follows the inner case statement.
+    dll_entry=__cygwin_dll_entry@12
+    dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12~
+    case $host_os in
+    mingw*)
+      # mingw values
+      dll_entry=_DllMainCRTStartup@12
+      dll_exclude_symbols=DllMain@12,DllMainCRTStartup@12,DllEntryPoint@12~
+      ;;
+    esac
+
+    # mingw and cygwin differ, and it's simplest to just exclude the union
+    # of the two symbol sets.
+    dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12,DllMainCRTStartup@12,DllEntryPoint@12
+
+    # recent cygwin and mingw systems supply a stub DllMain which the user
+    # can override, but on older systems we have to supply one (in ltdll.c)
+    if test "x$lt_cv_need_dllmain" = "xyes"; then
+      ltdll_obj='$output_objdir/$soname-ltdll.'"$objext "
+      ltdll_cmds='test -f $output_objdir/$soname-ltdll.c || sed -e "/^# \/\* ltdll\.c starts here \*\//,/^# \/\* ltdll.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/$soname-ltdll.c~
+       test -f $output_objdir/$soname-ltdll.$objext || (cd $output_objdir && $CC -c $soname-ltdll.c)~'
+    else
+      ltdll_obj=
+      ltdll_cmds=
+    fi
+
+    # Extract the symbol export list from an `--export-all' def file,
+    # then regenerate the def file from the symbol export list, so that
+    # the compiled dll only exports the symbol export list.
+    # Be careful not to strip the DATA tag left by newer dlltools.
+    export_symbols_cmds="$ltdll_cmds"'
+      $DLLTOOL --export-all --exclude-symbols '$dll_exclude_symbols' --output-def $output_objdir/$soname-def '$ltdll_obj'$libobjs $convenience~
+      sed -e "1,/EXPORTS/d" -e "s/ @ [0-9]*//" -e "s/ *;.*$//" < $output_objdir/$soname-def > $export_symbols'
+
+    # If the export-symbols file already is a .def file (1st line
+    # is EXPORTS), use it as is.
+    # If DATA tags from a recent dlltool are present, honour them!
+    archive_expsym_cmds='if test "x`head -1 $export_symbols`" = xEXPORTS; then
+        cp $export_symbols $output_objdir/$soname-def;
+      else
+        echo EXPORTS > $output_objdir/$soname-def;
+        _lt_hint=1;
+        cat $export_symbols | while read symbol; do
+         set dummy \$symbol;
+         case \[$]# in
+           2) echo "   \[$]2 @ \$_lt_hint ; " >> $output_objdir/$soname-def;;
+           *) echo "     \[$]2 @ \$_lt_hint \[$]3 ; " >> $output_objdir/$soname-def;;
+         esac;
+         _lt_hint=`expr 1 + \$_lt_hint`;
+        done;
+      fi~
+      '"$ltdll_cmds"'
+      $CC -Wl,--base-file,$output_objdir/$soname-base '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+      $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~
+      $CC -Wl,--base-file,$output_objdir/$soname-base $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+      $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp --output-lib $output_objdir/$libname.dll.a~
+      $CC $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags'
+    ;;
+
+  darwin* | rhapsody*)
+    allow_undefined_flag='-undefined suppress'
+    archive_cmds='$CC `test .$module = .yes && echo -bundle || echo -dynamiclib` $allow_undefined_flag -o $lib $libobjs $deplibs $linkopts -install_name $rpath/$soname `test -n "$verstring" -a x$verstring != x0.0 && echo $verstring`'
+    # We need to add '_' to the symbols in $export_symbols first
+    #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    whole_archive_flag_spec='-all_load $convenience'
+    ;;
+
+  netbsd*)
+    if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+      archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
+      wlarc=
+    else
+      archive_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    fi
+    ;;
+
+  solaris* | sysv5*)
+    if $LD -v 2>&1 | egrep 'BFD 2\.8' > /dev/null; then
+      ld_shlibs=no
+      cat <<EOF 1>&2
+
+*** Warning: The releases 2.8.* of the GNU linker cannot reliably
+*** create shared libraries on Solaris systems.  Therefore, libtool
+*** is disabling shared libraries support.  We urge you to upgrade GNU
+*** binutils to release 2.9.1 or newer.  Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+EOF
+    elif $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+
+  sunos4*)
+    archive_cmds='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+    wlarc=
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  *)
+    if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+  esac
+
+  if test "$ld_shlibs" = yes; then
+    runpath_var=LD_RUN_PATH
+    hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir'
+    export_dynamic_flag_spec='${wl}--export-dynamic'
+    case $host_os in
+    cygwin* | mingw*)
+      # dlltool doesn't understand --whole-archive et. al.
+      whole_archive_flag_spec=
+      ;;
+    *)
+      # ancient GNU ld didn't support --whole-archive et. al.
+      if $LD --help 2>&1 | egrep 'no-whole-archive' > /dev/null; then
+       whole_archive_flag_spec="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+      else
+       whole_archive_flag_spec=
+      fi
+      ;;
+    esac
+  fi
+else
+  # PORTME fill in a description of your system's linker (not GNU ld)
+  case $host_os in
+  aix3*)
+    allow_undefined_flag=unsupported
+    always_export_symbols=yes
+    archive_expsym_cmds='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR $AR_FLAGS $lib $output_objdir/$soname'
+    # Note: this linker hardcodes the directories in LIBPATH if there
+    # are no directories specified by -L.
+    hardcode_minus_L=yes
+    if test "$with_gcc" = yes && test -z "$link_static_flag"; then
+      # Neither direct hardcoding nor static linking is supported with a
+      # broken collect2.
+      hardcode_direct=unsupported
+    fi
+    ;;
+
+  aix4* | aix5*)
+    # Settings for AIX 4 and 5: choose the shared-library flag, detect
+    # whether run time linking (-brtl) was requested, and build
+    # archive_expsym_cmds accordingly.
+    hardcode_direct=yes
+    hardcode_libdir_separator=':'
+    link_all_deplibs=yes
+    # When large executables or shared objects are built, AIX ld can
+    # have problems creating the table of contents.  If linking a library
+    # or program results in "error TOC overflow" add -mminimal-toc to
+    # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
+    # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
+    if test "$with_gcc" = yes; then
+      case $host_os in aix4.[012]|aix4.[012].*)
+      # We only want to do this on AIX 4.2 and lower, the check
+      # below for broken collect2 doesn't work under 4.3+
+        collect2name=`${CC} -print-prog-name=collect2`
+        if test -f "$collect2name" && \
+          strings "$collect2name" | grep resolve_lib_name >/dev/null
+        then
+         # We have reworked collect2
+         hardcode_direct=yes
+        else
+         # We have old collect2
+         hardcode_direct=unsupported
+         # It fails to find uninstalled libraries when the uninstalled
+         # path is not listed in the libpath.
+         # NOTE(review): the original comment said that setting
+         # hardcode_minus_L to "unsupported" forces relinking, but the
+         # assignment below sets it to "yes" -- confirm which is intended.
+         hardcode_minus_L=yes
+         hardcode_libdir_flag_spec='-L$libdir'
+         hardcode_libdir_separator=
+        fi
+      esac
+      shared_flag='-shared'
+    else
+      # not using gcc
+      if test "$host_cpu" = ia64; then
+        shared_flag='${wl}-G'
+      else
+        shared_flag='${wl}-bM:SRE'
+      fi
+    fi
+
+    if test "$host_cpu" = ia64; then
+      # On IA64, the linker does run time linking by default, so we don't
+      # have to do anything special.
+      aix_use_runtimelinking=no
+      exp_sym_flag='-Bexport'
+      no_entry_flag=""
+    else
+      # Test if we are trying to use run time linking, or normal AIX style linking.
+      # If -brtl is somewhere in LDFLAGS, we need to do run time linking.
+      aix_use_runtimelinking=no
+      for ld_flag in $LDFLAGS; do
+        if (test $ld_flag = "-brtl" || test $ld_flag = "-Wl,-brtl" ); then
+          aix_use_runtimelinking=yes
+          break
+        fi
+      done
+      exp_sym_flag='-bexport'
+      no_entry_flag='-bnoentry'
+    fi
+    # -bexpall does not export symbols beginning with underscore (_)
+    always_export_symbols=yes
+    if test "$aix_use_runtimelinking" = yes; then
+      # Warning - without using the other run time loading flags (-brtl), -berok will
+      #           link without error, but may produce a broken library.
+      allow_undefined_flag=' ${wl}-berok'
+      hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:/usr/lib:/lib'
+      archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"
+    else
+      if test "$host_cpu" = ia64; then
+        hardcode_libdir_flag_spec='${wl}-R $libdir:/usr/lib:/lib'
+        allow_undefined_flag="-z nodefs"
+        archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"
+      else
+        allow_undefined_flag=' ${wl}-berok'
+        # -bexpall does not export symbols beginning with underscore (_)
+        always_export_symbols=yes
+        # Exported symbols can be pulled into shared objects from archives
+        whole_archive_flag_spec=' '
+        build_libtool_need_lc=yes
+        hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:/usr/lib:/lib'
+        # This is similar to how AIX traditionally builds its shared libraries.
+        # (The shared object is linked first and then wrapped in an archive
+        # with $AR as a second command.)
+        archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}-bE:$export_symbols ${wl}-bnoentry${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
+      fi
+    fi
+    ;;
+
+  amigaos*)
+    archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+    # see comment about different semantics on the GNU ld section
+    ld_shlibs=no
+    ;;
+
+  cygwin* | mingw*)
+    # When not using gcc, we currently assume that we are using
+    # Microsoft Visual C++.
+    # hardcode_libdir_flag_spec is actually meaningless, as there is
+    # no search path for DLLs.
+    hardcode_libdir_flag_spec=' '
+    allow_undefined_flag=unsupported
+    # Tell ltmain to make .lib files, not .a files.
+    libext=lib
+    # FIXME: Setting linknames here is a bad hack.
+    archive_cmds='$CC -o $lib $libobjs $compiler_flags `echo "$deplibs" | sed -e '\''s/ -lc$//'\''` -link -dll~linknames='
+    # The linker will automatically build a .lib file if we build a DLL.
+    old_archive_from_new_cmds='true'
+    # FIXME: Should let the user specify the lib program.
+    old_archive_cmds='lib /OUT:$oldlib$oldobjs$old_deplibs'
+    fix_srcfile_path='`cygpath -w "$srcfile"`'
+    ;;
+
+  freebsd1*)
+    ld_shlibs=no
+    ;;
+
+  # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
+  # support.  Future versions do this automatically, but an explicit c++rt0.o
+  # does not break anything, and helps significantly (at the cost of a little
+  # extra space).
+  freebsd2.2*)
+    archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  # Unfortunately, older versions of FreeBSD 2 do not have this feature.
+  freebsd2*)
+    archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_direct=yes
+    hardcode_minus_L=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
+  freebsd*)
+    archive_cmds='$CC -shared -o $lib $libobjs $deplibs $compiler_flags'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  hpux9* | hpux10* | hpux11*)
+    case $host_os in
+    hpux9*) archive_cmds='$rm $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib' ;;
+    *) archive_cmds='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags' ;;
+    esac
+    hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
+    hardcode_libdir_separator=:
+    hardcode_direct=yes
+    hardcode_minus_L=yes # Not in the search PATH, but as the default
+                        # location of the library.
+    export_dynamic_flag_spec='${wl}-E'
+    ;;
+
+  irix5* | irix6*)
+    if test "$with_gcc" = yes; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+    else
+      archive_cmds='$LD -shared $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+    fi
+    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    hardcode_libdir_separator=:
+    link_all_deplibs=yes
+    ;;
+
+  netbsd*)
+    if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+      archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'  # a.out
+    else
+      archive_cmds='$LD -shared -o $lib $libobjs $deplibs $linker_flags'      # ELF
+    fi
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  newsos6)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linkopts'
+    hardcode_direct=yes
+    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    hardcode_libdir_separator=:
+    hardcode_shlibpath_var=no
+    ;;
+
+  openbsd*)
+    archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  os2*)
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+    allow_undefined_flag=unsupported
+    archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$echo "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~$echo DATA >> $output_objdir/$libname.def~$echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~$echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
+    old_archive_from_new_cmds='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+    ;;
+
+  osf3*)
+    if test "$with_gcc" = yes; then
+      allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+      archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+    else
+      allow_undefined_flag=' -expect_unresolved \*'
+      archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+    fi
+    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    hardcode_libdir_separator=:
+    ;;
+
+  osf4* | osf5*)       # as osf3* with the addition of -msym flag
+    if test "$with_gcc" = yes; then
+      allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+      archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${objdir}/so_locations -o $lib'
+      hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    else
+      allow_undefined_flag=' -expect_unresolved \*'
+      archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -msym -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib'
+      archive_expsym_cmds='for i in `cat $export_symbols`; do printf "-exported_symbol " >> $lib.exp; echo "\$i" >> $lib.exp; done; echo "-hidden">> $lib.exp~
+      $LD -shared${allow_undefined_flag} -input $lib.exp $linker_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib~$rm $lib.exp'
+
+      # cc supports -rpath directly
+      hardcode_libdir_flag_spec='-rpath $libdir'
+    fi
+    hardcode_libdir_separator=:
+    ;;
+
+  sco3.2v5*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_shlibpath_var=no
+    runpath_var=LD_RUN_PATH
+    hardcode_runpath_var=yes
+    ;;
+
+  solaris*)
+    no_undefined_flag=' -z defs'
+    # $CC -shared without GNU ld will not create a library from C++
+    # object files and a static libstdc++, better avoid it by now
+    archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+               $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_shlibpath_var=no
+    case $host_os in
+    solaris2.[0-5] | solaris2.[0-5].*) ;;
+    *) # Supported since Solaris 2.6 (maybe 2.5.1?)
+      whole_archive_flag_spec='-z allextract$convenience -z defaultextract' ;;
+    esac
+    link_all_deplibs=yes
+    ;;
+
+  sunos4*)
+    archive_cmds='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_direct=yes
+    hardcode_minus_L=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  sysv4)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    runpath_var='LD_RUN_PATH'
+    hardcode_shlibpath_var=no
+    hardcode_direct=no #Motorola manual says yes, but my tests say they lie
+    ;;
+
+  sysv4.3*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_shlibpath_var=no
+    export_dynamic_flag_spec='-Bexport'
+    ;;
+
+  sysv5*)
+    no_undefined_flag=' -z text'
+    # $CC -shared without GNU ld will not create a library from C++
+    # object files and a static libstdc++, better avoid it by now
+    archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+               $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+    hardcode_libdir_flag_spec=
+    hardcode_shlibpath_var=no
+    runpath_var='LD_RUN_PATH'
+    ;;
+
+  uts4*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_shlibpath_var=no
+    ;;
+
+  dgux*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_shlibpath_var=no
+    ;;
+
+  sysv4*MP*)
+    if test -d /usr/nec; then
+      archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_shlibpath_var=no
+      runpath_var=LD_RUN_PATH
+      hardcode_runpath_var=yes
+      ld_shlibs=yes
+    fi
+    ;;
+
+  sysv4.2uw2*)
+    archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_direct=yes
+    hardcode_minus_L=no
+    hardcode_shlibpath_var=no
+    hardcode_runpath_var=yes
+    runpath_var=LD_RUN_PATH
+    ;;
+
+  sysv5uw7* | unixware7*)
+    no_undefined_flag='${wl}-z ${wl}text'
+    if test "$GCC" = yes; then
+      archive_cmds='$CC -shared ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+    else
+      archive_cmds='$CC -G ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+    fi
+    runpath_var='LD_RUN_PATH'
+    hardcode_shlibpath_var=no
+    ;;
+
+  *)
+    ld_shlibs=no
+    ;;
+  esac
+fi
+
+
+
+
+#################################
+## Compiler Characteristics: PIC flags, static flags, etc
+# Skipped entirely when ac_cv_prog_cc_pic is already set (cached).  Otherwise
+# the ac_cv_prog_cc_* variables are reset and then filled in per host_os,
+# with one table for gcc and one for the system compiler.
+if test "X${ac_cv_prog_cc_pic+set}" = Xset; then
+  :
+else
+  ac_cv_prog_cc_pic=
+  ac_cv_prog_cc_shlib=
+  ac_cv_prog_cc_wl=
+  ac_cv_prog_cc_static=
+  ac_cv_prog_cc_no_builtin=
+  ac_cv_prog_cc_can_build_shared=$can_build_shared
+
+  if test "$with_gcc" = yes; then
+    ac_cv_prog_cc_wl='-Wl,'
+    ac_cv_prog_cc_static='-static'
+
+    case $host_os in
+    aix*)
+      # All AIX code is PIC.
+      # NOTE(review): the two assignments below (and the darwin one further
+      # down) use the lt_cv_ prefix, while everything else in this section
+      # initialises and sets ac_cv_prog_cc_* variables.  No lt_cv_ variable
+      # is read in this section, so these values have no effect here --
+      # confirm whether ac_cv_ was intended.
+      if test "$host_cpu" = ia64; then
+        # AIX 5 now supports IA64 processor
+        lt_cv_prog_cc_static='-Bstatic'
+      else
+        lt_cv_prog_cc_static='-bnso -bI:/lib/syscalls.exp'
+      fi
+      ;;
+    amigaos*)
+      # FIXME: we need at least 68020 code to build shared libraries, but
+      # adding the `-m68020' flag to GCC prevents building anything better,
+      # like `-m68040'.
+      ac_cv_prog_cc_pic='-m68020 -resident32 -malways-restore-a4'
+      ;;
+    beos* | irix5* | irix6* | osf3* | osf4* | osf5*)
+      # PIC is the default for these OSes.
+      ;;
+    cygwin* | mingw* | os2*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      ac_cv_prog_cc_pic='-DDLL_EXPORT'
+      ;;
+    darwin* | rhapsody*)
+      # PIC is the default on this platform
+      # Common symbols not allowed in MH_DYLIB files
+      # NOTE(review): lt_cv_ prefix again; ac_cv_prog_cc_pic stays empty
+      # for this platform until -DPIC is appended at the end.
+      lt_cv_prog_cc_pic='-fno-common'
+      ;;
+    *djgpp*)
+      # DJGPP does not support shared libraries at all
+      ac_cv_prog_cc_pic=
+      ;;
+    sysv4*MP*)
+      if test -d /usr/nec; then
+        ac_cv_prog_cc_pic=-Kconform_pic
+      fi
+      ;;
+    *)
+      ac_cv_prog_cc_pic='-fPIC'
+      ;;
+    esac
+  else
+    # PORTME Check for PIC flags for the system compiler.
+    case $host_os in
+    aix*)
+     # All AIX code is PIC.
+     # (Both variables expanded here are still empty in this branch, so
+     # the result is " -lC".)
+      ac_cv_prog_cc_static="$ac_cv_prog_cc_static ${ac_cv_prog_cc_wl}-lC"
+      ;;
+
+    hpux9* | hpux10* | hpux11*)
+      # Is there a better ac_cv_prog_cc_static that works with the bundled CC?
+      ac_cv_prog_cc_wl='-Wl,'
+      ac_cv_prog_cc_static="${ac_cv_prog_cc_wl}-a ${ac_cv_prog_cc_wl}archive"
+      ac_cv_prog_cc_pic='+Z'
+      ;;
+
+    irix5* | irix6*)
+      ac_cv_prog_cc_wl='-Wl,'
+      ac_cv_prog_cc_static='-non_shared'
+      # PIC (with -KPIC) is the default.
+      ;;
+
+    cygwin* | mingw* | os2*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      ac_cv_prog_cc_pic='-DDLL_EXPORT'
+      ;;
+
+    newsos6)
+      ac_cv_prog_cc_pic='-KPIC'
+      ac_cv_prog_cc_static='-Bstatic'
+      ;;
+
+    osf3* | osf4* | osf5*)
+      # All OSF/1 code is PIC.
+      ac_cv_prog_cc_wl='-Wl,'
+      ac_cv_prog_cc_static='-non_shared'
+      ;;
+
+    sco3.2v5*)
+      ac_cv_prog_cc_pic='-Kpic'
+      ac_cv_prog_cc_static='-dn'
+      ac_cv_prog_cc_shlib='-belf'
+      ;;
+
+    solaris*)
+      ac_cv_prog_cc_pic='-KPIC'
+      ac_cv_prog_cc_static='-Bstatic'
+      ac_cv_prog_cc_wl='-Wl,'
+      ;;
+
+    sunos4*)
+      ac_cv_prog_cc_pic='-PIC'
+      ac_cv_prog_cc_static='-Bstatic'
+      ac_cv_prog_cc_wl='-Qoption ld '
+      ;;
+
+    sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+      ac_cv_prog_cc_pic='-KPIC'
+      ac_cv_prog_cc_static='-Bstatic'
+      ac_cv_prog_cc_wl='-Wl,'
+      ;;
+
+    uts4*)
+      ac_cv_prog_cc_pic='-pic'
+      ac_cv_prog_cc_static='-Bstatic'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec ;then
+       ac_cv_prog_cc_pic='-Kconform_pic'
+       ac_cv_prog_cc_static='-Bstatic'
+      fi
+      ;;
+
+    *)
+      # Unknown system compiler: no PIC flag known, so shared libraries
+      # cannot be built.
+      ac_cv_prog_cc_can_build_shared=no
+      ;;
+    esac
+  fi
+  # Finally append -DPIC to whatever PIC flag was chosen above, except on
+  # djgpp where the PIC flag is forced to be empty.
+  case "$host_os" in
+      # Platforms which do not support PIC and -DPIC is meaningless
+      # on them:
+      *djgpp*)
+        ac_cv_prog_cc_pic=
+        ;;
+      *)
+        ac_cv_prog_cc_pic="$ac_cv_prog_cc_pic -DPIC"
+        ;;
+  esac
+fi
+
+need_lc=yes
+if test "$enable_shared" = yes && test "$with_gcc" = yes; then
+  case $archive_cmds in
+  *'~'*)
+    # FIXME: we may have to deal with multi-command sequences.
+    ;;
+  '$CC '*)
+    # Test whether the compiler implicitly links with -lc since on some
+    # systems, -lgcc has to come before -lc. If gcc already passes -lc
+    # to ld, don't add -lc before -lgcc.
+    echo $ac_n "checking whether -lc should be explicitly linked in... $ac_c" 1>&6
+    if eval "test \"`echo '$''{'ac_cv_archive_cmds_needs_lc'+set}'`\" = set"; then
+      echo $ac_n "(cached) $ac_c" 1>&6
+      need_lc=$ac_cv_archive_cmds_needs_lc
+    else
+      $rm conftest*
+      echo "static int dummy;" > conftest.$ac_ext
+      if { (eval echo ltcf-c.sh:need_lc: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; }; then
+       # Append any warnings to the config.log.
+       cat conftest.err 1>&5
+       soname=conftest
+       lib=conftest
+       libobjs=conftest.$ac_objext
+       deplibs=
+       wl=$ac_cv_prog_cc_wl
+       compiler_flags=-v
+       linker_flags=-v
+       verstring=
+       output_objdir=.
+       libname=conftest
+       save_allow_undefined_flag=$allow_undefined_flag
+       allow_undefined_flag=
+       if { (eval echo ltcf-c.sh:need_lc: \"$archive_cmds\") 1>&5; (eval $archive_cmds) 2>&1 | grep " -lc " 1>&5 ; }; then
+         need_lc=no
+       fi
+       allow_undefined_flag=$save_allow_undefined_flag
+      else
+       cat conftest.err 1>&5
+      fi
+    fi
+    $rm conftest*
+    echo "$ac_t$need_lc" 1>&6
+    ;;
+  esac
+fi
+ac_cv_archive_cmds_needs_lc=$need_lc
diff --git a/config/ltconfig b/config/ltconfig

new file mode 100755 (executable)

index 0000000..1eb4072
--- /dev/null
+++ b/config/ltconfig
@@ -0,0 +1,2794 @@
+#! /bin/sh
+
+# ltconfig - Create a system-specific libtool.
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001
+# Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# A lot of this script is taken from autoconf-2.10.
+
+# Check that we are running under the correct shell.
+SHELL=${CONFIG_SHELL-/bin/sh}
+echo=echo
+if test "X$1" = X--no-reexec; then
+  # Discard the --no-reexec flag, and continue.
+  shift
+elif test "X$1" = X--fallback-echo; then
+  # Avoid inline document here, it may be left over
+  :
+elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then
+  # Yippee, $echo works!
+  :
+else
+  # Restart under the correct shell.
+  exec "$SHELL" "$0" --no-reexec ${1+"$@"}
+fi
+
+if test "X$1" = X--fallback-echo; then
+  # used as fallback echo
+  shift
+  cat <<EOF
+$*
+EOF
+  exit 0
+fi
+
+# Find the correct PATH separator.  Usually this is `:', but
+# DJGPP uses `;' like DOS.
+if test "X${PATH_SEPARATOR+set}" != Xset; then
+  UNAME=${UNAME-`uname 2>/dev/null`}
+  case X$UNAME in
+    *-DOS) PATH_SEPARATOR=';' ;;
+    *)     PATH_SEPARATOR=':' ;;
+  esac
+fi
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+if test "X${echo_test_string+set}" != Xset; then
+  # find a string as large as possible, as long as the shell can cope with it
+  for cmd in 'sed 50q "$0"' 'sed 20q "$0"' 'sed 10q "$0"' 'sed 2q "$0"' 'echo test'; do
+    # expected sizes: less than 2Kb, 1Kb, 512 bytes, 16 bytes, ...
+    if (echo_test_string="`eval $cmd`") 2>/dev/null &&
+       echo_test_string="`eval $cmd`" &&
+       (test "X$echo_test_string" = "X$echo_test_string") 2>/dev/null; then
+      break
+    fi
+  done
+fi
+
+if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+   echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+   test "X$echo_testing_string" = "X$echo_test_string"; then
+  :
+else
+  # The Solaris, AIX, and Digital Unix default echo programs unquote
+  # backslashes.  This makes it impossible to quote backslashes using
+  #   echo "$something" | sed 's/\\/\\\\/g'
+  #
+  # So, first we look for a working echo in the user's PATH.
+
+  IFS="${IFS=  }"; save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR}"
+  for dir in $PATH /usr/ucb; do
+    if (test -f $dir/echo || test -f $dir/echo$ac_exeext) &&
+       test "X`($dir/echo '\t') 2>/dev/null`" = 'X\t' &&
+       echo_testing_string=`($dir/echo "$echo_test_string") 2>/dev/null` &&
+       test "X$echo_testing_string" = "X$echo_test_string"; then
+      echo="$dir/echo"
+      break
+    fi
+  done
+  IFS="$save_ifs"
+
+  if test "X$echo" = Xecho; then
+    # We didn't find a better echo, so look for alternatives.
+    if test "X`(print -r '\t') 2>/dev/null`" = 'X\t' &&
+       echo_testing_string=`(print -r "$echo_test_string") 2>/dev/null` &&
+       test "X$echo_testing_string" = "X$echo_test_string"; then
+      # This shell has a builtin print -r that does the trick.
+      echo='print -r'
+    elif (test -f /bin/ksh || test -f /bin/ksh$ac_exeext) &&
+        test "X$CONFIG_SHELL" != X/bin/ksh; then
+      # If we have ksh, try running ltconfig again with it.
+      ORIGINAL_CONFIG_SHELL="${CONFIG_SHELL-/bin/sh}"
+      export ORIGINAL_CONFIG_SHELL
+      CONFIG_SHELL=/bin/ksh
+      export CONFIG_SHELL
+      exec "$CONFIG_SHELL" "$0" --no-reexec ${1+"$@"}
+    else
+      # Try using printf.
+      echo='printf %s\n'
+      if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+        echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+        test "X$echo_testing_string" = "X$echo_test_string"; then
+       # Cool, printf works
+       :
+      elif echo_testing_string=`("$ORIGINAL_CONFIG_SHELL" "$0" --fallback-echo '\t') 2>/dev/null` &&
+          test "X$echo_testing_string" = 'X\t' &&
+          echo_testing_string=`("$ORIGINAL_CONFIG_SHELL" "$0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+          test "X$echo_testing_string" = "X$echo_test_string"; then
+       CONFIG_SHELL="$ORIGINAL_CONFIG_SHELL"
+       export CONFIG_SHELL
+       SHELL="$CONFIG_SHELL"
+       export SHELL
+       echo="$CONFIG_SHELL $0 --fallback-echo"
+      elif echo_testing_string=`("$CONFIG_SHELL" "$0" --fallback-echo '\t') 2>/dev/null` &&
+          test "X$echo_testing_string" = 'X\t' &&
+          echo_testing_string=`("$CONFIG_SHELL" "$0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+          test "X$echo_testing_string" = "X$echo_test_string"; then
+       echo="$CONFIG_SHELL $0 --fallback-echo"
+      else
+       # maybe with a smaller string...
+       prev=:
+
+       for cmd in 'echo test' 'sed 2q "$0"' 'sed 10q "$0"' 'sed 20q "$0"' 'sed 50q "$0"'; do
+         if (test "X$echo_test_string" = "X`eval $cmd`") 2>/dev/null; then
+           break
+         fi
+         prev="$cmd"
+       done
+
+       if test "$prev" != 'sed 50q "$0"'; then
+         echo_test_string=`eval $prev`
+         
+         export echo_test_string
+         exec "${ORIGINAL_CONFIG_SHELL-${CONFIG_SHELL-/bin/sh}}" "$0" ${1+"$@"}
+       else
+         # Oops.  We lost completely, so just stick with echo.
+         echo=echo
+       fi
+      fi
+    fi
+  fi
+fi
+
+# Sed substitution that helps us do robust quoting.  It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e s/^X//'
+sed_quote_subst='s/\([\\"\\`$\\\\]\)/\\\1/g'
+
+# Same as above, but do not quote variable references.
+double_quote_subst='s/\([\\"\\`\\\\]\)/\\\1/g'
+
+# Sed substitution to delay expansion of an escaped shell variable in a
+# double_quote_subst'ed string.
+delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g'
+
+# The name of this program.
+progname=`$echo "X$0" | $Xsed -e 's%^.*/%%'`
+
+# Constants:
+PROGRAM=ltconfig
+PACKAGE=libtool
+VERSION=1.4a
+TIMESTAMP=" (1.641.2.255 2001/05/22 10:39:30)"
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+rm="rm -f"
+
+help="Try \`$progname --help' for more information."
+
+# Global variables:
+default_ofile=libtool
+can_build_shared=yes
+enable_shared=yes
+# All known linkers require a `.a' archive for static linking (except M$VC,
+# which needs '.lib').
+enable_static=yes
+enable_fast_install=yes
+enable_dlopen=unknown
+enable_win32_dll=no
+pic_mode=default
+ltmain=
+silent=
+srcdir=
+ac_config_guess=
+ac_config_sub=
+host=
+build=NONE
+nonopt=NONE
+ofile="$default_ofile"
+verify_host=yes
+tagname=
+with_gcc=no
+with_gnu_ld=no
+need_locks=yes
+ac_ext=c
+libext=a
+cache_file=
+max_cmd_len=
+
+## Dependencies to place before and after the object being linked:
+predep_objects=
+postdep_objects=
+predeps=
+postdeps=
+compiler_lib_search_path=
+
+## Link characteristics:
+allow_undefined_flag=
+no_undefined_flag=
+need_lib_prefix=unknown
+need_version=unknown
+# when you set need_version to no, make sure it does not cause -set_version
+# flags to be left without arguments
+archive_cmds=
+archive_expsym_cmds=
+old_archive_from_new_cmds=
+old_archive_from_expsyms_cmds=
+striplib=
+old_striplib=
+export_dynamic_flag_spec=
+whole_archive_flag_spec=
+thread_safe_flag_spec=
+hardcode_into_libs=no
+hardcode_libdir_flag_spec=
+hardcode_libdir_separator=
+hardcode_direct=no
+hardcode_minus_L=no
+hardcode_shlibpath_var=unsupported
+runpath_var=
+link_all_deplibs=unknown
+always_export_symbols=no
+export_symbols_cmds='$NM $libobjs $convenience | $global_symbol_pipe | sed '\''s/.* //'\'' | sort | uniq > $export_symbols'
+# include_expsyms should be a list of space-separated symbols to be *always*
+# included in the symbol list
+include_expsyms=
+# exclude_expsyms can be an egrep regular expression of symbols to exclude
+# it will be wrapped by ` (' and `)$', so one must not match beginning or
+# end of line.  Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc',
+# as well as any symbol that contains `d'.
+exclude_expsyms="_GLOBAL_OFFSET_TABLE_"
+# Although _GLOBAL_OFFSET_TABLE_ is a valid symbol C name, most a.out
+# platforms (ab)use it in PIC code, but their linkers get confused if
+# the symbol is explicitly referenced.  Since portable code cannot
+# rely on this symbol name, it's probably fine to never include it in
+# preloaded symbol tables.
+extract_expsyms_cmds=
+
+## Tools:
+old_AR="$AR"
+old_AR_FLAGS="$AR_FLAGS"
+old_CC="$CC"
+old_CFLAGS="$CFLAGS"
+old_CPPFLAGS="$CPPFLAGS"
+old_LDFLAGS="$LDFLAGS"
+old_LIBS="$LIBS"
+old_MAGIC_CMD="$MAGIC_CMD"
+old_LD="$LD"
+old_LN_S="$LN_S"
+old_LTCC="$LTCC"
+old_NM="$NM"
+old_RANLIB="$RANLIB"
+old_STRIP="$STRIP"
+old_AS="$AS"
+old_DLLTOOL="$DLLTOOL"
+old_OBJDUMP="$OBJDUMP"
+old_OBJEXT="$OBJEXT"
+old_EXEEXT="$EXEEXT"
+old_reload_flag="$reload_flag"
+old_deplibs_check_method="$deplibs_check_method"
+old_file_magic_cmd="$file_magic_cmd"
+
+# Parse the command line options.
+args=
+prev=
+for option
+do
+  case $option in
+  -*=*) optarg=`echo "$option" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
+  *) optarg= ;;
+  esac
+
+  # If the previous option needs an argument, assign it.
+  if test -n "$prev"; then
+    eval "$prev=\$option"
+    prev=
+    continue
+  fi
+
+  case $option in
+  --help) cat <<EOM
+Usage: $progname [OPTION]... LTMAIN [HOST]
+
+Generate a system-specific libtool script.
+
+    --build                configure for building on BUILD [BUILD=HOST]
+    --debug                enable verbose shell tracing
+    --disable-shared       do not build shared libraries
+    --disable-static       do not build static libraries
+    --disable-fast-install do not optimize for fast installation
+    --enable-dlopen        enable dlopen support
+    --enable-win32-dll     enable building dlls on win32 hosts
+    --help                 display this help and exit
+    --no-verify            do not verify that HOST is a valid host type
+-o, --output=FILE          specify the output file [default=$default_ofile]
+    --quiet                same as \`--silent'
+    --silent               do not print informational messages
+    --srcdir=DIR           find \`config.guess' in DIR
+    --version              output version information and exit
+    --add-tag=TAG          append an alternate configuration
+    --with-gcc             assume that the GNU C compiler will be used
+    --with-gnu-ld          assume that the C compiler uses the GNU linker
+    --prefer-pic           try to use only PIC objects
+    --prefer-non-pic       try to use only non-PIC objects
+    --disable-lock         disable file locking
+    --cache-file=FILE      configure cache file
+
+LTMAIN is the \`ltmain.sh' shell script fragment or \`ltmain.c' program
+that provides basic libtool functionality.
+
+HOST is the canonical host system name [default=guessed].
+EOM
+  exit 0
+  ;;
+
+  --build) prev=build ;;
+  --build=*) build="$optarg" ;;
+
+  --debug)
+    echo "$progname: enabling shell trace mode"
+    set -x
+    ;;
+
+  --disable-shared) enable_shared=no ;;
+
+  --disable-static) enable_static=no ;;
+
+  --disable-fast-install) enable_fast_install=no ;;
+
+  --enable-dlopen) enable_dlopen=yes ;;
+
+  --enable-win32-dll) enable_win32_dll=yes ;;
+
+  --quiet | --silent) silent=yes ;;
+
+  --srcdir) prev=srcdir ;;
+  --srcdir=*) srcdir="$optarg" ;;
+
+  --no-verify) verify_host=no ;;
+
+  --output | -o) prev=ofile ;;
+  --output=*) ofile="$optarg" ;;
+
+  --version) echo "$PROGRAM (GNU $PACKAGE) $VERSION$TIMESTAMP"; exit 0 ;;
+
+  --add-tag) prev=tagname ;;
+  --add-tag=*) tagname="$optarg" ;;
+
+  --with-gcc) with_gcc=yes ;;
+  --with-gnu-ld) with_gnu_ld=yes ;;
+
+  --prefer-pic) pic_mode=yes ;;
+  --prefer-non-pic) pic_mode=no ;;
+
+  --disable-lock) need_locks=no ;;
+
+  --cache-file=*) cache_file="$optarg" ;;
+
+  -*)
+    echo "$progname: unrecognized option \`$option'" 1>&2
+    echo "$help" 1>&2
+    exit 1
+    ;;
+
+  *)
+    if test -z "$ltmain"; then
+      ltmain="$option"
+    elif test -z "$host"; then
+# This generates an unnecessary warning for sparc-sun-solaris4.1.3_U1
+#      if test -n "`echo $option| sed 's/[-a-z0-9.]//g'`"; then
+#        echo "$progname: warning \`$option' is not a valid host type" 1>&2
+#      fi
+      host="$option"
+    else
+      echo "$progname: too many arguments" 1>&2
+      echo "$help" 1>&2
+      exit 1
+    fi ;;
+  esac
+done
+
+if test -z "$ltmain"; then
+  echo "$progname: you must specify a LTMAIN file" 1>&2
+  echo "$help" 1>&2
+  exit 1
+fi
+
+if test ! -f "$ltmain"; then
+  echo "$progname: \`$ltmain' does not exist" 1>&2
+  echo "$help" 1>&2
+  exit 1
+fi
+
+if test -n "$tagname"; then
+  # Check whether tagname contains only valid characters
+  case `$echo "X$tagname" | $Xsed -e 's/[-_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890,/]//g'` in
+  "") ;;
+  *)
+    echo "$progname: invalid tag name: $tagname" 1>&2
+    exit 1
+    ;;
+  esac
+
+  if grep "^### BEGIN LIBTOOL TAG CONFIG: $tagname$" < "$ofile" > /dev/null; then
+    echo "$progname: tag name $tagname already exists" 1>&2
+    exit 1
+  fi
+
+  if test ! -f "$ofile"; then
+    echo "$progname: warning: output file \`$ofile' does not exist" 1>&2
+  fi
+
+  if test -z "$LTCC"; then
+    eval "`$SHELL $ofile --config | grep '^LTCC='`"
+    if test -z "$LTCC"; then
+      echo "$progname: warning: output file \`$ofile' does not look like a libtool script" 1>&2
+    else
+      echo "$progname: warning: using \`LTCC=$LTCC', extracted from \`$ofile'" 1>&2
+    fi
+  fi
+fi
+
+# Quote any args containing shell metacharacters.
+ltconfig_args=
+for arg
+do
+  case $arg in
+  *" "*|*"     "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*)
+  ltconfig_args="$ltconfig_args '$arg'" ;;
+  *) ltconfig_args="$ltconfig_args $arg" ;;
+  esac
+done
+
+# A relevant subset of AC_INIT.
+
+# File descriptor usage:
+# 0 standard input
+# 1 file creation
+# 2 errors and warnings
+# 3 some systems may open it to /dev/tty
+# 4 used on the Kubota Titan
+# 5 compiler messages saved in config.log
+# 6 checking for... messages and results
+if test "$silent" = yes; then
+  exec 6>/dev/null
+else
+  exec 6>&1
+fi
+exec 5>>./config.log
+
+# NLS nuisances.
+# Only set LANG and LC_ALL to C if already set.
+# These must not be set unconditionally because not all systems understand
+# e.g. LANG=C (notably SCO).
+if test "X${LC_ALL+set}" = Xset; then LC_ALL=C; export LC_ALL; fi
+if test "X${LANG+set}"   = Xset; then LANG=C;   export LANG;   fi
+
+if test -n "$cache_file" && test -r "$cache_file" && test -f "$cache_file"; then
+  echo "loading cache $cache_file within ltconfig"
+  . $cache_file
+fi
+
+if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
+  # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu.
+  if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
+    ac_n= ac_c='
+' ac_t='       '
+  else
+    ac_n=-n ac_c= ac_t=
+  fi
+else
+  ac_n= ac_c='\c' ac_t=
+fi
+
+if test -z "$srcdir"; then
+  # Assume the source directory is the same one as the path to LTMAIN.
+  srcdir=`$echo "X$ltmain" | $Xsed -e 's%/[^/]*$%%'`
+  test "$srcdir" = "$ltmain" && srcdir=.
+fi
+
+trap "$rm conftest*; exit 1" 1 2 15
+if test "$verify_host" = yes; then
+  # Check for config.guess and config.sub.
+  ac_aux_dir=
+  for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do
+    if test -f $ac_dir/config.guess; then
+      ac_aux_dir=$ac_dir
+      break
+    fi
+  done
+  if test -z "$ac_aux_dir"; then
+    echo "$progname: cannot find config.guess in $srcdir $srcdir/.. $srcdir/../.." 1>&2
+    echo "$help" 1>&2
+    exit 1
+  fi
+  ac_config_guess=$ac_aux_dir/config.guess
+  ac_config_sub=$ac_aux_dir/config.sub
+
+  # Make sure we can run config.sub.
+  if $SHELL $ac_config_sub sun4 >/dev/null 2>&1; then :
+  else
+    echo "$progname: cannot run $ac_config_sub" 1>&2
+    echo "$help" 1>&2
+    exit 1
+  fi
+
+  echo $ac_n "checking host system type""... $ac_c" 1>&6
+
+  host_alias=$host
+  case $host_alias in
+  "")
+    # Force config.guess to use the C compiler.
+    # CC_FOR_BUILD overrides the CC variable in config.guess but I had
+    # problems with it so do it this way for now.
+    CC="$LTCC"
+
+    if host_alias=`$SHELL $ac_config_guess`; then :
+    else
+      echo "$progname: cannot guess host type; you must specify one" 1>&2
+      echo "$help" 1>&2
+      exit 1
+    fi
+
+    # Restore the C compiler.
+    CC="$old_CC"
+    ;;
+  esac
+  host=`$SHELL $ac_config_sub $host_alias`
+  echo "$ac_t$host" 1>&6
+
+  # Make sure the host verified.
+  test -z "$host" && exit 1
+
+  # Check for the build system type
+  echo $ac_n "checking build system type... $ac_c" 1>&6
+
+  build_alias=$build
+  case $build_alias in
+  NONE)
+    case $nonopt in
+    NONE) build_alias=$host_alias ;;
+    *) build_alias=$nonopt ;;
+    esac ;;
+  esac
+
+  build=`$SHELL $ac_config_sub $build_alias`
+  build_cpu=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+  build_vendor=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+  build_os=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+  echo "$ac_t""$build" 1>&6
+
+elif test -z "$host"; then
+  echo "$progname: you must specify a host type if you use \`--no-verify'" 1>&2
+  echo "$help" 1>&2
+  exit 1
+else
+  host_alias=$host
+  build_alias=$host_alias
+  build=$host
+fi
+
+if test x"$host" != x"$build"; then
+  ac_tool_prefix=${host_alias}-
+else
+  ac_tool_prefix=
+fi
+
+host_cpu=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+host_vendor=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+host_os=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+
+# Transform linux* to *-*-linux-gnu*, to support old configure scripts.
+case $host_os in
+linux-gnu*) ;;
+linux*) host=`echo $host | sed 's/^\(.*-.*-linux\)\(.*\)$/\1-gnu\2/'`
+esac
+
+case $host_os in
+aix3*)
+  # AIX sometimes has problems with the GCC collect2 program.  For some
+  # reason, if we set the COLLECT_NAMES environment variable, the problems
+  # vanish in a puff of smoke.
+  if test "X${COLLECT_NAMES+set}" != Xset; then
+    COLLECT_NAMES=
+    export COLLECT_NAMES
+  fi
+  ;;
+esac
+
+# Determine commands to create old-style static archives.
+old_archive_cmds='$AR $AR_FLAGS $oldlib$oldobjs$old_deplibs'
+old_postinstall_cmds='chmod 644 $oldlib'
+old_postuninstall_cmds=
+
+if test -n "$RANLIB"; then
+  old_archive_cmds="$old_archive_cmds~\$RANLIB \$oldlib"
+  old_postinstall_cmds="\$RANLIB \$oldlib~$old_postinstall_cmds"
+fi
+
+# Source the script associated with the $tagname tag configuration.
+if test -n "$tagname"; then
+  . $ltmain
+else
+  # FIXME:  We should use a variable here
+  # Configure for a C compiler
+  . $srcdir/ltcf-c.sh
+fi
+
+# Set sane defaults for various variables
+test -z "$AR" && AR=ar
+test -z "$AR_FLAGS" && AR_FLAGS=cru
+test -z "$AS" && AS=as
+test -z "$CC" && CC=cc
+test -z "$DLLTOOL" && DLLTOOL=dlltool
+test -z "$MAGIC_CMD" && MAGIC_CMD=file
+test -z "$LD" && LD=ld
+test -z "$LN_S" && LN_S="ln -s"
+test -z "$NM" && NM=nm
+test -z "$OBJDUMP" && OBJDUMP=objdump
+test -z "$RANLIB" && RANLIB=:
+test -z "$STRIP" && STRIP=:
+test -z "$objext" && objext=o
+
+echo $ac_n "checking for objdir... $ac_c" 1>&6
+rm -f .libs 2>/dev/null
+mkdir .libs 2>/dev/null
+if test -d .libs; then
+  objdir=.libs
+else
+  # MS-DOS does not allow filenames that begin with a dot.
+  objdir=_libs
+fi
+rmdir .libs 2>/dev/null
+echo "$ac_t$objdir" 1>&6
+
+# If no C compiler was specified, use CC.
+LTCC=${LTCC-"$CC"}
+
+# Allow CC to be a program name with arguments.
+set dummy $CC
+compiler="$2"
+
+# We assume here that the value for ac_cv_prog_cc_pic will not be cached
+# in isolation, and that seeing it set (from the cache) indicates that
+# the associated values are set (in the cache) correctly too.
+echo $ac_n "checking for $compiler option to produce PIC... $ac_c" 1>&6
+echo "$progname:678:checking for $compiler option to produce PIC" 1>&5
+
+if test -z "$ac_cv_prog_cc_pic"; then
+  echo "$ac_t"none 1>&6
+else
+  echo "$ac_t""$ac_cv_prog_cc_pic" 1>&6
+
+  # Check to make sure the pic_flag actually works.
+  echo $ac_n "checking if $compiler PIC flag $ac_cv_prog_cc_pic works... $ac_c" 1>&6
+  echo "$progname:687:checking that $compiler PIC flag $ac_cv_prog_cc_pic works." 1>&5
+  if test "X${ac_cv_prog_cc_pic_works+set}" = Xset && \
+     test "X${ac_cv_prog_cc_pic_works}" != X; then
+    echo $ac_n "(cached) $ac_c" 1>&6
+  else
+    ac_cv_prog_cc_pic_works=yes
+    $rm conftest*
+    echo $lt_simple_compile_test_code > conftest.$ac_ext
+    save_CFLAGS="$CFLAGS"
+    CFLAGS="$CFLAGS $ac_cv_prog_cc_pic -DPIC"
+    if { (eval echo $progname:697: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.$objext; then
+      # Append any warnings to the config.log.
+      cat conftest.err 1>&5
+
+      case $host_os in
+      hpux9* | hpux10* | hpux11*)
+       # On HP-UX, both CC and GCC only warn that PIC is supported... then
+       # they create non-PIC objects.  So, if there were any warnings, we
+       # assume that PIC is not supported.
+       if test -s conftest.err; then
+         ac_cv_prog_cc_pic_works=no
+         ac_cv_prog_cc_can_build_shared=no
+         ac_cv_prog_cc_pic=
+       else
+         ac_cv_prog_cc_pic_works=yes
+         ac_cv_prog_cc_pic=" $ac_cv_prog_cc_pic"
+       fi
+       ;;
+      *)
+       ac_cv_prog_cc_pic_works=yes
+       ac_cv_prog_cc_pic=" $ac_cv_prog_cc_pic"
+       ;;
+      esac
+    else
+      # Append any errors to the config.log.
+      cat conftest.err 1>&5
+      ac_cv_prog_cc_pic_works=no
+      ac_cv_prog_cc_can_build_shared=no
+      ac_cv_prog_cc_pic=
+    fi
+    CFLAGS="$save_CFLAGS"
+    $rm conftest*
+  fi
+  # Belt *and* braces to stop my trousers falling down:
+  if test "X$ac_cv_prog_cc_pic_works" = Xno; then
+    ac_cv_prog_cc_pic=
+    ac_cv_prog_cc_can_build_shared=no
+  fi
+  echo "$ac_t""$ac_cv_prog_cc_pic_works" 1>&6
+fi
+
+# Check for any special shared library compilation flags.
+if test -n "$ac_cv_prog_cc_shlib"; then
+  echo "$progname: warning: \`$CC' requires \`$ac_cv_prog_cc_shlib' to build shared libraries" 1>&2
+  if echo "$old_CC $old_CFLAGS " | egrep -e "[         ]$ac_cv_prog_cc_shlib[  ]" >/dev/null; then :
+  else
+    echo "$progname: add \`$ac_cv_prog_cc_shlib' to the CC or CFLAGS env variable and reconfigure" 1>&2
+    ac_cv_prog_cc_can_build_shared=no
+  fi
+fi
+
+echo $ac_n "checking if $compiler static flag $ac_cv_prog_cc_static works... $ac_c" 1>&6
+echo "$progname:749: checking if $compiler static flag $ac_cv_prog_cc_static works" >&5
+if test "X${ac_cv_prog_cc_static_works+set}" = Xset && \
+   test "X${ac_cv_prog_cc_static_works}" != X; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  $rm conftest*
+  echo $lt_simple_link_test_code > conftest.$ac_ext
+  save_LDFLAGS="$LDFLAGS"
+  LDFLAGS="$LDFLAGS $ac_cv_prog_cc_static"
+  if { (eval echo $progname:758: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+    ac_cv_prog_cc_static_works=yes
+  else
+    ac_cv_prog_cc_static_works=no
+    ac_cv_prog_cc_static=
+  fi
+  LDFLAGS="$save_LDFLAGS"
+  $rm conftest*
+fi
+# Belt *and* braces to stop my trousers falling down:
+if test "X$ac_cv_prog_cc_static_works" = Xno; then
+  ac_cv_prog_cc_static=
+fi
+echo "$ac_t""$ac_cv_prog_cc_static_works" 1>&6
+pic_flag="$ac_cv_prog_cc_pic"
+special_shlib_compile_flags="$ac_cv_prog_cc_shlib"
+wl="$ac_cv_prog_cc_wl"
+link_static_flag="$ac_cv_prog_cc_static"
+no_builtin_flag="$ac_cv_prog_cc_no_builtin"
+can_build_shared="$ac_cv_prog_cc_can_build_shared"
+
+# find the maximum length of command line arguments
+echo "$progname:780: finding the maximum length of command line arguments" 1>&5
+echo $ac_n "finding the maximum length of command line arguments... $ac_c" 1>&6
+if test "${lt_cv_sys_max_cmd_len+set}" = set; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  i=0
+  testring="ABCD"
+  # If test is not a shell built-in, we'll probably end up computing a
+  # maximum length that is only half of the actual maximum length, but
+  # we can't tell.
+  while test "X"`$CONFIG_SHELL $0 --fallback-echo "X$testring" 2>/dev/null` \
+             = "XX$testring" &&
+          new_result=`expr "X$testring" : ".*" 2>&1` &&
+          lt_cv_sys_max_cmd_len=$new_result &&
+          test $i != 18 # 1 MB should be enough
+  do
+    i=`expr $i + 1`
+    testring=$testring$testring
+  done
+  testring=
+  # add a significant safety factor because C++ compilers can tack on massive amounts
+  # of additional arguments before passing them to the linker.  1/4 should be good.
+  len=`expr $lt_cv_sys_max_cmd_len \/ 4`
+  lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len - $len`
+fi
+echo "$progname:@lineno@: result: $lt_cv_sys_max_cmd_len" 1>&5
+echo "${ac_t}$lt_cv_sys_max_cmd_len" 1>&6
+
+if test -n $lt_cv_sys_max_cmd_len ; then
+  max_cmd_len=$lt_cv_sys_max_cmd_len
+else
+  max_cmd_len=none
+fi
+
+# Check to see if options -o and -c are simultaneously supported by compiler
+echo $ac_n "checking if $compiler supports -c -o file.$objext... $ac_c" 1>&6
+if test "${lt_cv_compiler_c_o+set}" = set; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  $rm -r conftest 2>/dev/null
+  mkdir conftest
+  cd conftest
+  $rm conftest*
+  echo $lt_simple_compile_test_code > conftest.$ac_ext
+  mkdir out
+  # According to Tom Tromey, Ian Lance Taylor reported there are C compilers
+  # that will create temporary files in the current directory regardless of
+  # the output directory.  Thus, making CWD read-only will cause this test
+  # to fail, enabling locking or at least warning the user not to do parallel
+  # builds.
+  chmod -w .
+  save_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -o out/conftest2.$objext"
+  echo "$progname:833: checking if $compiler supports -c -o file.$objext" >&5
+  if { (eval echo $progname:834: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$objext; then
+
+    # The compiler can only warn and ignore the option if not recognized
+    # So say no if there are warnings
+      if test -s out/conftest.err; then
+        lt_cv_compiler_c_o=no
+      else
+        lt_cv_compiler_c_o=yes
+      fi
+  else
+    # Append any errors to the config.log.
+    cat out/conftest.err 1>&5
+    lt_cv_compiler_c_o=no
+  fi
+  CFLAGS="$save_CFLAGS"
+  chmod u+w .
+  $rm conftest* out/*
+  rmdir out
+  cd ..
+  rmdir conftest
+  $rm -r conftest 2>/dev/null
+fi
+compiler_c_o=$lt_cv_compiler_c_o
+echo "${ac_t}$compiler_c_o" 1>&6
+
+# Check to see if we can do hard links to lock some files if needed
+hard_links="nottested"
+if test "$compiler_c_o" = no && test "$need_locks" != no; then
+  # do not overwrite the value of need_locks provided by the user
+  echo $ac_n "checking if we can lock with hard links... $ac_c" 1>&6
+  hard_links=yes
+  $rm conftest*
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  touch conftest.a
+  ln conftest.a conftest.b 2>&5 || hard_links=no
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  echo "$ac_t$hard_links" 1>&6
+  $rm conftest*
+  if test "$hard_links" = no; then
+    echo "*** WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&2
+    need_locks=warn
+  fi
+else
+  need_locks=no
+fi
+
+if test "$with_gcc" = yes; then
+  # Check to see if options -fno-rtti -fno-exceptions are supported by compiler
+  echo $ac_n "checking if $compiler supports -fno-rtti -fno-exceptions ... $ac_c" 1>&6
+  $rm conftest*
+  echo $lt_simple_compile_test_code > conftest.$ac_ext
+  save_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -fno-rtti -fno-exceptions -c conftest.$ac_ext"
+  echo "$progname:887: checking if $compiler supports -fno-rtti -fno-exceptions" >&5
+  if { (eval echo $progname:888: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.$objext; then
+
+    # The compiler can only warn and ignore the option if not recognized
+    # So say no if there are warnings
+      if test -s conftest.err; then
+       echo "$ac_t"no 1>&6
+       compiler_rtti_exceptions=no
+      else
+       echo "$ac_t"yes 1>&6
+       compiler_rtti_exceptions=yes
+      fi
+  else
+    # Append any errors to the config.log.
+    cat conftest.err 1>&5
+    compiler_rtti_exceptions=no
+    echo "$ac_t"no 1>&6
+  fi
+  CFLAGS="$save_CFLAGS"
+  $rm conftest*
+
+  if test "$compiler_rtti_exceptions" = "yes"; then
+    no_builtin_flag=' -fno-builtin -fno-rtti -fno-exceptions'
+  else
+    no_builtin_flag=' -fno-builtin'
+  fi
+  
+fi
+
+# See if the linker supports building shared libraries.
+echo $ac_n "checking whether the linker ($LD) supports shared libraries... $ac_c" 1>&6
+
+echo "$ac_t$ld_shlibs" 1>&6
+test "$ld_shlibs" = no && can_build_shared=no
+
+# Check hardcoding attributes.
+echo $ac_n "checking how to hardcode library paths into programs... $ac_c" 1>&6
+hardcode_action=
+if test -n "$hardcode_libdir_flag_spec" || \
+   test -n "$runpath_var"; then
+
+  # We can hardcode nonexistent directories.
+  if test "$hardcode_direct" != no &&
+     # If the only mechanism to avoid hardcoding is shlibpath_var, we
+     # have to relink, otherwise we might link with an installed library
+     # when we should be linking with a yet-to-be-installed one
+     ## test "$hardcode_shlibpath_var" != no &&
+     test "$hardcode_minus_L" != no; then
+    # Linking always hardcodes the temporary library directory.
+    hardcode_action=relink
+  else
+    # We can link without hardcoding, and we can hardcode nonexisting dirs.
+    hardcode_action=immediate
+  fi
+else
+  # We cannot hardcode anything, or else we can only hardcode existing
+  # directories.
+  hardcode_action=unsupported
+fi
+echo "$ac_t$hardcode_action" 1>&6
+
+echo $ac_n "checking whether stripping libraries is possible... $ac_c" 1>&6
+if test -n "$STRIP" && $STRIP -V 2>&1 | grep "GNU strip" >/dev/null; then
+  test -z "$old_striplib" && old_striplib="$STRIP --strip-debug"
+  test -z "$striplib" && striplib="$STRIP --strip-unneeded"
+  echo "${ac_t}yes" 1>&6
+else
+  echo "${ac_t}no" 1>&6
+fi
+
+case $reload_flag in
+"" | " "*) ;;
+*) reload_flag=" $reload_flag" ;;
+esac
+reload_cmds='$LD$reload_flag -o $output$reload_objs'
+test -z "$deplibs_check_method" && deplibs_check_method=unknown
+
+# PORTME Fill in your ld.so characteristics
+library_names_spec=
+libname_spec='lib$name'
+soname_spec=
+postinstall_cmds=
+postuninstall_cmds=
+finish_cmds=
+finish_eval=
+shlibpath_var=
+shlibpath_overrides_runpath=unknown
+version_type=none
+dynamic_linker="$host_os ld.so"
+sys_lib_dlsearch_path_spec="/lib /usr/lib"
+sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib"
+
+echo $ac_n "checking dynamic linker characteristics... $ac_c" 1>&6
+case $host_os in
+aix3*)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix $libname.a'
+  shlibpath_var=LIBPATH
+
+  # AIX 3 has no versioning support, so we append a major version to the name.
+  soname_spec='${libname}${release}.so$major'
+  ;;
+
+aix4* | aix5*)
+  if test "$host_cpu" = ia64; then
+    # AIX 5 supports IA64
+    library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+    shlibpath_var=LD_LIBRARY_PATH
+  else
+    # AIX (on Power*) has no versioning support, so currently we can not hardcode correct
+    # soname into executable. Probably we can add versioning support to
+    # collect2, so additional links can be useful in future.
+    # We preserve .a as extension for shared libraries though AIX4.2
+    # and later linker supports .so
+    if test "$aix_use_runtimelinking" = yes; then
+      # If using run time linking (on AIX 4.2 or later) use lib<name>.so instead of
+      # lib<name>.a to let people know that these are not typical AIX shared libraries.
+      library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+    else
+      # We preserve .a as extension for shared libraries though AIX4.2
+      # and later when we are not doing run time linking.
+      library_names_spec='${libname}${release}.a $libname.a'
+      soname_spec='${libname}${release}.so$major.o'
+    fi
+    # If we're using GNU nm, then we don't want the "-C" option.
+    # -C means demangle to AIX nm, but means don't demangle with GNU nm
+    if $NM -V 2>&1 | egrep '(GNU)' > /dev/null; then
+      export_symbols_cmds='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$2 == "T") || (\$2 == "D") || (\$2 == "B")) && (substr(\$3,1,1) != ".")) { print \$3 } }'\'' | sort -u > $export_symbols'
+    else
+      export_symbols_cmds='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$2 == "T") || (\$2 == "D") || (\$2 == "B")) && (substr(\$3,1,1) != ".")) { print \$3 } }'\'' | sort -u > $export_symbols'
+    fi
+    shlibpath_var=LIBPATH
+    deplibs_check_method=pass_all
+    case $host_os in
+    aix4 | aix4.[01] | aix4.[01].*)
+      if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
+          echo ' yes '
+          echo '#endif'; } | ${CC} -E - | grep yes > /dev/null; then
+       :
+      else
+       # With GCC up to 2.95.x, collect2 would create an import file
+       # for dependence libraries.  The import file would start with
+       # the line `#! .'.  This would cause the generated library to
+       # depend on `.', always an invalid library.  This was fixed in
+       # development snapshots of GCC prior to 3.0.
+        can_build_shared=no
+      fi
+      ;;
+    esac
+  fi
+  ;;
+
+amigaos*)
+  library_names_spec='$libname.ixlibrary $libname.a'
+  # Create ${libname}_ixlibrary.a entries in /sys/libs.
+  finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`$echo "X$lib" | $Xsed -e '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $rm /sys/libs/${libname}_ixlibrary.a; $show "(cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a)"; (cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a) || exit 1; done'
+  ;;
+
+beos*)
+  library_names_spec='${libname}.so'
+  dynamic_linker="$host_os ld.so"
+  shlibpath_var=LIBRARY_PATH
+  lt_cv_dlopen="load_add_on"
+  lt_cv_dlopen_libs=
+  lt_cv_dlopen_self=yes
+  ;;
+
+bsdi4*)
+  version_type=linux
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
+  sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
+  export_dynamic_flag_spec=-rdynamic
+  # the default ld.so.conf also contains /usr/contrib/lib and
+  # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
+  # libtool to hard-code these into programs
+  ;;
+
+cygwin* | mingw* | pw32*)
+  version_type=windows
+  need_version=no
+  need_lib_prefix=no
+  # DLL names depend on the compiler/host pair.  In every variant the dots
+  # in $release are turned into dashes; cygwin and pw32 additionally swap
+  # the "lib" prefix for "cyg" and "pw" respectively.
+  case $with_gcc,$host_os in
+  yes,cygwin*)
+    library_names_spec='$libname.dll.a'
+    # The second sed must not be wrapped in [ ]: with the brackets the shell
+    # looks for a command named "[sed" when this spec is expanded.  The
+    # mingw and pw32 specs below use the same bracket-free form.
+    soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll'
+    postinstall_cmds='dlpath=`bash 2>&1 -c '\''. $dir/${file}i; echo \$dlname'\''`~
+      dldir=$destdir/`dirname \$dlpath`~
+      test -d \$dldir || mkdir -p \$dldir~
+      $install_prog .libs/$dlname \$dldir/$dlname'
+    postuninstall_cmds='dldll=`bash 2>&1 -c '\''. $file; echo \$dlname'\''`~
+      dlpath=$dir/\$dldll; $rm \$dlpath'
+    ;;
+  yes,mingw*)
+    library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll'
+    # Take the library search path from the compiler itself, turning the
+    # ';' separators into spaces.
+    sys_lib_search_path_spec=`$CC -print-search-dirs | grep "^libraries:" | sed -e "s/^libraries://" -e "s/;/ /g"`
+    ;;
+  yes,pw32*)
+    library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll'
+    ;;
+  *)
+    library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll $libname.lib'
+    ;;
+  esac
+  dynamic_linker='Win32 ld.exe'
+  # FIXME: first we should search . and the directory the executable is in
+  shlibpath_var=PATH
+  lt_cv_dlopen="LoadLibrary"
+  lt_cv_dlopen_libs=
+  ;;
+
+darwin* | rhapsody*)
+  dynamic_linker="$host_os dyld"
+  version_type=darwin
+  need_lib_prefix=no
+  need_version=no
+  # Every name ends in "so" when $module is yes and in "dylib" otherwise.
+  # The specs are single-quoted, so the suffix test runs when a spec is
+  # expanded, not here.  There must be no '$' in front of a backtick
+  # substitution: it would end up as a literal '$' in the file name.
+  library_names_spec='${libname}${release}${versuffix}.`test .$module = .yes && echo so || echo dylib` ${libname}${release}${major}.`test .$module = .yes && echo so || echo dylib` ${libname}.`test .$module = .yes && echo so || echo dylib`'
+  soname_spec='${libname}${release}${major}.`test .$module = .yes && echo so || echo dylib`'
+  shlibpath_overrides_runpath=yes
+  shlibpath_var=DYLD_LIBRARY_PATH
+  ;;
+
+freebsd1*)
+  dynamic_linker=no
+  ;;
+
+freebsd*)
+  objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout`
+  version_type=freebsd-$objformat
+  case $version_type in
+    freebsd-elf*)
+      library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so'
+      need_version=no
+      need_lc=no
+      need_lib_prefix=no
+      ;;
+    freebsd-*)
+      library_names_spec='${libname}${release}.so$versuffix $libname.so$versuffix'
+      need_version=yes
+      ;;
+  esac
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_os in
+  freebsd2*)
+    shlibpath_overrides_runpath=yes
+    ;;
+  *)
+    shlibpath_overrides_runpath=no
+    hardcode_into_libs=yes
+    ;;
+  esac
+  ;;
+
+gnu*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so${major} ${libname}.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  hardcode_into_libs=yes
+  ;;
+
+hpux9* | hpux10* | hpux11*)
+  # Give a soname corresponding to the major version so that dld.sl refuses to
+  # link against other versions.
+  dynamic_linker="$host_os dld.sl"
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  shlibpath_var=SHLIB_PATH
+  shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
+  library_names_spec='${libname}${release}.sl$versuffix ${libname}${release}.sl$major $libname.sl'
+  soname_spec='${libname}${release}.sl$major'
+  # HP-UX runs *really* slowly unless shared libraries are mode 555.
+  postinstall_cmds='chmod 555 $lib'
+  ;;
+
+irix5* | irix6*)
+  version_type=irix
+  need_lib_prefix=no
+  need_version=no
+  soname_spec='${libname}${release}.so$major'
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major ${libname}${release}.so $libname.so'
+  case $host_os in
+  irix5*)
+    libsuff= shlibsuff=
+    ;;
+  *)
+    case $LD in # libtool.m4 will add one of these switches to LD
+    *-32|*"-32 ") libsuff= shlibsuff= libmagic=32-bit;;
+    *-n32|*"-n32 ") libsuff=32 shlibsuff=N32 libmagic=N32;;
+    *-64|*"-64 ") libsuff=64 shlibsuff=64 libmagic=64-bit;;
+    *) libsuff= shlibsuff= libmagic=never-match;;
+    esac
+    ;;
+  esac
+  shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
+  shlibpath_overrides_runpath=no
+  sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
+  sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
+  ;;
+
+# No shared lib support for Linux oldld, aout, or coff.
+linux-gnuoldld* | linux-gnuaout* | linux-gnucoff*)
+  dynamic_linker=no
+  ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  # This implies no fast_install, which is unacceptable.
+  # Some rework will be needed to allow for fast_install
+  # before this can be enabled.
+  hardcode_into_libs=yes
+
+  # We used to test for /lib/ld.so.1 and disable shared libraries on
+  # powerpc, because MkLinux only supported shared libraries with the
+  # GNU dynamic linker.  Since this was broken with cross compilers,
+  # most powerpc-linux boxes support dynamic linking these days and
+  # people can always --disable-shared, the test was removed, and we
+  # assume the GNU/Linux dynamic linker is in use.
+  dynamic_linker='GNU/Linux ld.so'
+  ;;
+
+netbsd*)
+  need_lib_prefix=no
+  need_version=no
+  version_type=sunos
+  if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+    library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+    finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+    dynamic_linker='NetBSD (a.out) ld.so'
+  else
+    library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major ${libname}${release}.so ${libname}.so'
+    soname_spec='${libname}${release}.so$major'
+    dynamic_linker='NetBSD ld.elf_so'
+  fi
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  ;;
+
+newsos6)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  ;;
+
+openbsd*)
+  version_type=sunos
+  if test "$with_gnu_ld" = yes; then
+    need_lib_prefix=no
+    need_version=no
+  fi
+  library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+os2*)
+  libname_spec='$name'
+  need_lib_prefix=no
+  library_names_spec='$libname.dll $libname.a'
+  dynamic_linker='OS/2 ld.exe'
+  shlibpath_var=LIBPATH
+  ;;
+
+osf3* | osf4* | osf5*)
+  version_type=osf
+  need_version=no
+  soname_spec='${libname}${release}.so'
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
+  sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
+  ;;
+
+sco3.2v5*)
+  version_type=osf
+  soname_spec='${libname}${release}.so$major'
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+solaris*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  # ldd complains unless libraries are executable
+  postinstall_cmds='chmod +x $lib'
+  ;;
+
+sunos4*)
+  version_type=sunos
+  library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+  finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  if test "$with_gnu_ld" = yes; then
+    need_lib_prefix=no
+  fi
+  need_version=yes
+  ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_vendor in
+    motorola)
+      need_lib_prefix=no
+      need_version=no
+      shlibpath_overrides_runpath=no
+      sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
+      ;;
+  esac
+  ;;
+
+uts4*)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+dgux*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+sysv4*MP*)
+  if test -d /usr/nec ;then
+    version_type=linux
+    library_names_spec='$libname.so.$versuffix $libname.so.$major $libname.so'
+    soname_spec='$libname.so.$major'
+    shlibpath_var=LD_LIBRARY_PATH
+  fi
+  ;;
+
+*)
+  dynamic_linker=no
+  ;;
+esac
+echo "$ac_t$dynamic_linker" 1>&6
+test "$dynamic_linker" = no && can_build_shared=no
+
+# Check for command to grab the raw symbol name followed by C symbol from nm.
+echo $ac_n "checking command to parse $NM output... $ac_c" 1>&6
+
+# These are sane defaults that work on at least a few old systems.
+# [They come from Ultrix.  What could be older than Ultrix?!! ;)]
+
+# Character class describing NM global symbol codes.
+symcode='[BCDEGRST]'
+
+# Regexp to match symbols that can be accessed directly from C.
+sympat='\([_A-Za-z][_A-Za-z0-9]*\)'
+
+# Transform the above into a raw symbol and a C symbol.
+symxfrm='\1 \2\3 \3'
+
+# Transform an extracted symbol line into a proper C declaration
+global_symbol_to_cdecl="sed -n -e 's/^. .* \(.*\)$/extern char \1;/p'"
+
+# Define system-specific variables.
+case $host_os in
+aix*)
+  symcode='[BCDT]'
+  ;;
+cygwin* | mingw* | pw32*)
+  symcode='[ABCDGISTW]'
+  ;;
+hpux*) # Its linker distinguishes data from code symbols
+  global_symbol_to_cdecl="sed -n -e 's/^T .* \(.*\)$/extern char \1();/p' -e 's/^$symcode* .* \(.*\)$/extern char \1;/p'"
+  ;;
+irix*)
+  symcode='[BCDEGRST]'
+  ;;
+solaris* | sysv5*)
+  symcode='[BDT]'
+  ;;
+sysv4)
+  symcode='[DFNSTU]'
+  ;;
+esac
+
+# Handle CRLF in mingw tool chain
+opt_cr=
+case $host_os in
+mingw*)
+  opt_cr=`echo 'x\{0,1\}' | tr x '\015'` # option cr in regexp
+  ;;
+esac
+
+# If we're using GNU nm, then use its standard symbol codes.
+if $NM -V 2>&1 | egrep '(GNU|with BFD)' > /dev/null; then
+  symcode='[ABCDGISTW]'
+fi
+
+# Try without a prefix underscore, then with it.
+for ac_symprfx in "" "_"; do
+
+  # Write the raw and C identifiers.
+  global_symbol_pipe="sed -n -e 's/^.*[        ]\($symcode$symcode*\)[         ][      ]*\($ac_symprfx\)$sympat$opt_cr$/$symxfrm/p'"
+
+  # Check to see that the pipe works correctly.
+  pipe_works=no
+  $rm conftest*
+  cat > conftest.$ac_ext <<EOF
+#ifdef __cplusplus
+extern "C" {
+#endif
+char nm_test_var;
+void nm_test_func(){}
+#ifdef __cplusplus
+}
+#endif
+int main(){nm_test_var='a';nm_test_func();return(0);}
+EOF
+
+  echo "$progname:1430: checking if global_symbol_pipe works" >&5
+  if { (eval echo $progname:1431: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; } && test -s conftest.$objext; then
+    # Now try to grab the symbols.
+    nlist=conftest.nm
+    if { echo "$progname:1434: eval \"$NM conftest.$objext | $global_symbol_pipe > $nlist\"" >&5; eval "$NM conftest.$objext | $global_symbol_pipe > $nlist 2>&5"; } && test -s "$nlist"; then
+
+      # Try sorting and uniquifying the output.
+      if sort "$nlist" | uniq > "$nlist"T; then
+       mv -f "$nlist"T "$nlist"
+      else
+       rm -f "$nlist"T
+      fi
+
+      # Make sure that we snagged all the symbols we need.
+      if egrep ' nm_test_var$' "$nlist" >/dev/null; then
+       if egrep ' nm_test_func$' "$nlist" >/dev/null; then
+         cat <<EOF > conftest.$ac_ext
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+EOF
+         # Now generate the symbol file.
+         eval "$global_symbol_to_cdecl"' < "$nlist" >> conftest.$ac_ext'
+
+         cat <<EOF >> conftest.$ac_ext
+#if defined (__STDC__) && __STDC__
+# define lt_ptr_t void *
+#else
+# define lt_ptr_t char *
+# define const
+#endif
+
+/* The mapping between symbol names and symbols. */
+const struct {
+  const char *name;
+  lt_ptr_t address;
+}
+lt_preloaded_symbols[] =
+{
+EOF
+         sed "s/^$symcode$symcode* \(.*\) \(.*\)$/  {\"\2\", (lt_ptr_t) \&\2},/" < "$nlist" >> conftest.$ac_ext
+         cat <<\EOF >> conftest.$ac_ext
+  {0, (lt_ptr_t) 0}
+};
+
+#ifdef __cplusplus
+}
+#endif
+EOF
+         # Now try linking the two files.
+         mv conftest.$objext conftstm.$objext
+         save_LIBS="$LIBS"
+         save_CFLAGS="$CFLAGS"
+         LIBS="conftstm.$objext"
+         CFLAGS="$CFLAGS$no_builtin_flag"
+         if { (eval echo $progname:1486: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+           pipe_works=yes
+         else
+           echo "$progname: failed program was:" >&5
+           cat conftest.$ac_ext >&5
+         fi
+         LIBS="$save_LIBS"
+       else
+         echo "cannot find nm_test_func in $nlist" >&5
+       fi
+      else
+       echo "cannot find nm_test_var in $nlist" >&5
+      fi
+    else
+      echo "cannot run $global_symbol_pipe" >&5
+    fi
+  else
+    echo "$progname: failed program was:" >&5
+    cat conftest.$ac_ext >&5
+  fi
+  $rm conftest* conftst*
+
+  # Do not use the global_symbol_pipe unless it works.
+  if test "$pipe_works" = yes; then
+    break
+  else
+    global_symbol_pipe=
+  fi
+done
+if test "$pipe_works" = yes; then
+  echo "${ac_t}ok" 1>&6
+else
+  echo "${ac_t}failed" 1>&6
+fi
+
+if test -z "$global_symbol_pipe"; then
+  global_symbol_to_cdecl=
+fi
+
+# Report the final consequences.
+echo "checking if libtool supports shared libraries... $can_build_shared" 1>&6
+
+# Only try to build win32 dlls if AC_LIBTOOL_WIN32_DLL was used in
+# configure.in, otherwise build static only libraries.
+case $host_os in
+cygwin* | mingw* | pw32* | os2*)
+  if test x$can_build_shared = xyes; then
+    test x$enable_win32_dll = xno && can_build_shared=no
+    echo "checking if package supports dlls... $can_build_shared" 1>&6
+  fi
+;;
+esac
+
+echo $ac_n "checking whether to build shared libraries... $ac_c" 1>&6
+test "$can_build_shared" = "no" && enable_shared=no
+
+# On AIX, shared libraries and static libraries use the same namespace, and
+# are all built from PIC.
+case $host_os in
+aix3*)
+  test "$enable_shared" = yes && enable_static=no
+  if test -n "$RANLIB"; then
+    archive_cmds="$archive_cmds~\$RANLIB \$lib"
+    postinstall_cmds='$RANLIB $lib'
+  fi
+  ;;
+
+aix4*)
+  test "$enable_shared" = yes && enable_static=no
+  ;;
+esac
+
+echo "$ac_t$enable_shared" 1>&6
+
+# Make sure either enable_shared or enable_static is yes.
+test "$enable_shared" = yes || enable_static=yes
+
+echo "checking whether to build static libraries... $enable_static" 1>&6
+
+if test "$hardcode_action" = relink; then
+  # Fast installation is not supported
+  enable_fast_install=no
+elif test "$shlibpath_overrides_runpath" = yes ||
+     test "$enable_shared" = no; then
+  # Fast installation is not necessary
+  enable_fast_install=needless
+fi
+
+variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
+if test "$with_gcc" = yes; then
+  variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
+fi
+
+# Check whether we must set pic_mode to default
+test -z "$pic_flag" && pic_mode=default
+
+if test "x$enable_dlopen" != xyes; then
+  enable_dlopen=unknown
+  enable_dlopen_self=unknown
+  enable_dlopen_self_static=unknown
+else
+if test "X${lt_cv_dlopen+set}" != Xset; then
+  lt_cv_dlopen=no lt_cv_dlopen_libs=
+echo $ac_n "checking for dlopen in -ldl""... $ac_c" 1>&6
+echo "$progname:1590: checking for dlopen in -ldl" >&5
+if test "X${ac_cv_lib_dl_dlopen+set}" = Xset; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  ac_save_LIBS="$LIBS"
+LIBS="-ldl  $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 1597 "ltconfig"
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen();
+
+int main() {
+dlopen()
+; return 0; }
+EOF
+if { (eval echo $progname:1610: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  ac_cv_lib_dl_dlopen=yes
+else
+  echo "$progname: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  ac_cv_lib_dl_dlopen=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dl_dlopen" = Xyes; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"
+else
+  echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dlopen""... $ac_c" 1>&6
+echo "$progname:1629: checking for dlopen" >&5
+if test "X${ac_cv_func_dlopen+set}" = Xset; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 1634 "ltconfig"
+/* System header to define __stub macros and hopefully few prototypes,
+    which can conflict with char dlopen(); below.  */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen();
+
+int main() {
+
+/* The GNU C library defines this for functions which it implements
+    to always fail with ENOSYS.  Some functions are actually named
+    something starting with __ and the normal name is an alias.  */
+#if defined (__stub_dlopen) || defined (__stub___dlopen)
+choke me
+#else
+dlopen();
+#endif
+
+; return 0; }
+EOF
+if { (eval echo $progname:1659: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  ac_cv_func_dlopen=yes
+else
+  echo "$progname: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  ac_cv_func_dlopen=no
+fi
+rm -f conftest*
+fi
+if test "X$ac_cv_func_dlopen" = Xyes; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="dlopen"
+else
+  echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dlopen in -lsvld""... $ac_c" 1>&6
+echo "$progname:1676: checking for dlopen in -lsvld" >&5
+if test "X${ac_cv_lib_svld_dlopen+set}" = Xset; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  ac_save_LIBS="$LIBS"
+LIBS="-lsvld  $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 1683 "ltconfig"
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen();
+
+int main() {
+dlopen()
+; return 0; }
+EOF
+if { (eval echo $progname:1696: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  ac_cv_lib_svld_dlopen=yes
+else
+  echo "$progname: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  ac_cv_lib_svld_dlopen=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_svld_dlopen" = Xyes; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-lsvld"
+else
+  echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dld_link in -ldld""... $ac_c" 1>&6
+echo "$progname:1715: checking for dld_link in -ldld" >&5
+if test "X${ac_cv_lib_dld_dld_link+set}" = Xset; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  ac_save_LIBS="$LIBS"
+LIBS="-ldld  $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 1722 "ltconfig"
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dld_link();
+
+int main() {
+dld_link()
+; return 0; }
+EOF
+if { (eval echo $progname:1735: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  ac_cv_lib_dld_dld_link=yes
+else
+  echo "$progname: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  ac_cv_lib_dld_dld_link=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dld_dld_link" = Xyes; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="dld_link" lt_cv_dlopen_libs="-ldld"
+else
+  echo "$ac_t""no" 1>&6
+echo $ac_n "checking for shl_load""... $ac_c" 1>&6
+echo "$progname:1754: checking for shl_load" >&5
+if test "X${ac_cv_func_shl_load+set}" = Xset; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 1759 "ltconfig"
+/* System header to define __stub macros and hopefully few prototypes,
+    which can conflict with char shl_load(); below.  */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char shl_load();
+
+int main() {
+
+/* The GNU C library defines this for functions which it implements
+    to always fail with ENOSYS.  Some functions are actually named
+    something starting with __ and the normal name is an alias.  */
+#if defined (__stub_shl_load) || defined (__stub___shl_load)
+choke me
+#else
+shl_load();
+#endif
+
+; return 0; }
+EOF
+if { (eval echo $progname:1784: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  ac_cv_func_shl_load=yes
+else
+  echo "$progname: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  ac_cv_func_shl_load=no
+fi
+rm -f conftest*
+fi
+
+if test "X$ac_cv_func_shl_load" = Xyes; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="shl_load"
+else
+  echo "$ac_t""no" 1>&6
+echo $ac_n "checking for shl_load in -ldld""... $ac_c" 1>&6
+echo "$progname:1802: checking for shl_load in -ldld" >&5
+if test "X${ac_cv_lib_dld_shl_load+set}" = Xset; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  ac_save_LIBS="$LIBS"
+LIBS="-ldld  $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 1809 "ltconfig"
+#include "confdefs.h"
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char shl_load();
+
+int main() {
+shl_load()
+; return 0; }
+EOF
+if { (eval echo $progname:1823: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  ac_cv_lib_dld_shl_load=yes
+else
+  echo "$progname: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  ac_cv_lib_dld_shl_load=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dld_shl_load" = Xyes; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="shl_load" lt_cv_dlopen_libs="-ldld"
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+
+fi
+
+
+fi
+
+
+fi
+
+
+fi
+
+fi
+
+fi
+
+  if test "x$lt_cv_dlopen" != xno; then
+    enable_dlopen=yes
+  else
+    enable_dlopen=no
+  fi
+
+  case $lt_cv_dlopen in
+  dlopen)
+for ac_hdr in dlfcn.h; do
+ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
+echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
+echo "$progname:1870: checking for $ac_hdr" >&5
+if eval "test \"`echo 'X$''{'ac_cv_header_$ac_safe'+set}'`\" = Xset"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 1875 "ltconfig"
+#include <$ac_hdr>
+int fnord = 0;
+int main () { return(0); }
+EOF
+ac_try="$ac_compile >/dev/null 2>conftest.out"
+{ (eval echo $progname:1881: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+  rm -rf conftest*
+  eval "ac_cv_header_$ac_safe=yes"
+else
+  echo "$ac_err" >&5
+  echo "$progname: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  eval "ac_cv_header_$ac_safe=no"
+fi
+rm -f conftest*
+fi
+if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
+  echo "$ac_t""yes" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+done
+
+    if test "x$ac_cv_header_dlfcn_h" = xyes; then
+      CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"
+    fi
+    eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\"
+    LIBS="$lt_cv_dlopen_libs $LIBS"
+
+  echo $ac_n "checking whether a program can dlopen itself""... $ac_c" 1>&6
+echo "$progname:1909: checking whether a program can dlopen itself" >&5
+if test "X${lt_cv_dlopen_self+set}" = Xset; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test "$cross_compiling" = yes; then
+    lt_cv_dlopen_self=cross
+  else
+    cat > conftest.$ac_ext <<EOF
+#line 1917 "ltconfig"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+# define LTDL_GLOBAL   RTLD_GLOBAL
+#else
+# ifdef DL_GLOBAL
+#  define LTDL_GLOBAL  DL_GLOBAL
+# else
+#  define LTDL_GLOBAL  0
+# endif
+#endif
+
+/* We may have to define LTDL_LAZY_OR_NOW in the command line if we
+   find out it does not work in some platform. */
+#ifndef LTDL_LAZY_OR_NOW
+# ifdef RTLD_LAZY
+#  define LTDL_LAZY_OR_NOW     RTLD_LAZY
+# else
+#  ifdef DL_LAZY
+#   define LTDL_LAZY_OR_NOW    DL_LAZY
+#  else
+#   ifdef RTLD_NOW
+#    define LTDL_LAZY_OR_NOW   RTLD_NOW
+#   else
+#    ifdef DL_NOW
+#     define LTDL_LAZY_OR_NOW  DL_NOW
+#    else
+#     define LTDL_LAZY_OR_NOW  0
+#    endif
+#   endif
+#  endif
+# endif
+#endif
+
+void fnord() { int i=42; }
+int main() {
+    void *self, *ptr1, *ptr2; self=dlopen(0,LTDL_GLOBAL|LTDL_LAZY_OR_NOW);
+    if(self) { ptr1=dlsym(self,"fnord"); ptr2=dlsym(self,"_fnord");
+               if(ptr1 || ptr2) { dlclose(self); exit(0); } } exit(1); }
+
+EOF
+if { (eval echo $progname:1964: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
+then
+  lt_cv_dlopen_self=yes
+else
+  echo "$progname: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -fr conftest*
+  lt_cv_dlopen_self=no
+fi
+rm -fr conftest*
+fi
+
+fi
+
+echo "$ac_t""$lt_cv_dlopen_self" 1>&6
+
+  if test "$lt_cv_dlopen_self" = yes; then
+    LDFLAGS="$LDFLAGS $link_static_flag"
+  echo $ac_n "checking whether a statically linked program can dlopen itself""... $ac_c" 1>&6
+echo "$progname:1983: checking whether a statically linked program can dlopen itself" >&5
+if test "X${lt_cv_dlopen_self_static+set}" = Xset; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test "$cross_compiling" = yes; then
+    lt_cv_dlopen_self_static=cross
+  else
+    cat > conftest.$ac_ext <<EOF
+#line 1991 "ltconfig"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+# define LTDL_GLOBAL   RTLD_GLOBAL
+#else
+# ifdef DL_GLOBAL
+#  define LTDL_GLOBAL  DL_GLOBAL
+# else
+#  define LTDL_GLOBAL  0
+# endif
+#endif
+
+/* We may have to define LTDL_LAZY_OR_NOW in the command line if we
+   find out it does not work in some platform. */
+#ifndef LTDL_LAZY_OR_NOW
+# ifdef RTLD_LAZY
+#  define LTDL_LAZY_OR_NOW     RTLD_LAZY
+# else
+#  ifdef DL_LAZY
+#   define LTDL_LAZY_OR_NOW    DL_LAZY
+#  else
+#   ifdef RTLD_NOW
+#    define LTDL_LAZY_OR_NOW   RTLD_NOW
+#   else
+#    ifdef DL_NOW
+#     define LTDL_LAZY_OR_NOW  DL_NOW
+#    else
+#     define LTDL_LAZY_OR_NOW  0
+#    endif
+#   endif
+#  endif
+# endif
+#endif
+
+void fnord() { int i=42; }
+int main() {
+    void *self, *ptr1, *ptr2; self=dlopen(0,LTDL_GLOBAL|LTDL_LAZY_OR_NOW);
+    if(self) { ptr1=dlsym(self,"fnord"); ptr2=dlsym(self,"_fnord");
+    if(ptr1 || ptr2) { dlclose(self); exit(0); } } exit(1); }
+
+EOF
+if { (eval echo $progname:2038: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
+then
+  lt_cv_dlopen_self_static=yes
+else
+  echo "$progname: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -fr conftest*
+  lt_cv_dlopen_self_static=no
+fi
+rm -fr conftest*
+fi
+
+fi
+
+echo "$ac_t""$lt_cv_dlopen_self_static" 1>&6
+fi
+    ;;
+  esac
+
+  case $lt_cv_dlopen_self in
+  yes|no) enable_dlopen_self=$lt_cv_dlopen_self ;;
+  *) enable_dlopen_self=unknown ;;
+  esac
+
+  case $lt_cv_dlopen_self_static in
+  yes|no) enable_dlopen_self_static=$lt_cv_dlopen_self_static ;;
+  *) enable_dlopen_self_static=unknown ;;
+  esac
+fi
+
+# Copy echo and quote the copy, instead of the original, because it is
+# used later.
+ltecho="$echo"
+if test "X$ltecho" = "X$CONFIG_SHELL $0 --fallback-echo"; then
+   ltecho="$CONFIG_SHELL \$0 --fallback-echo"
+fi
+LTSHELL="$SHELL"
+
+LTCONFIG_VERSION="$VERSION"
+
+# Only quote variables if we're using ltmain.sh.
+case $ltmain in
+*.sh)
+  # Now quote all the things that may contain metacharacters.
+  for var in ltecho old_AR old_AR_FLAGS old_CC old_LTCC old_CFLAGS old_CPPFLAGS \
+    old_MAGIC_CMD old_LD old_LDFLAGS old_LIBS \
+    old_LN_S old_NM old_RANLIB old_STRIP \
+    old_AS old_DLLTOOL old_OBJDUMP \
+    old_OBJEXT old_EXEEXT old_reload_flag \
+    old_deplibs_check_method old_file_magic_cmd \
+    AR AR_FLAGS CC LTCC LD LN_S NM LTSHELL LTCONFIG_VERSION \
+    reload_flag reload_cmds wl \
+    pic_flag link_static_flag no_builtin_flag export_dynamic_flag_spec \
+    thread_safe_flag_spec whole_archive_flag_spec libname_spec \
+    library_names_spec soname_spec \
+    RANLIB old_archive_cmds old_archive_from_new_cmds old_postinstall_cmds \
+    old_postuninstall_cmds archive_cmds archive_expsym_cmds postinstall_cmds \
+    postuninstall_cmds extract_expsyms_cmds old_archive_from_expsyms_cmds \
+    predep_objects postdep_objects predeps postdeps compiler_lib_search_path \
+    old_striplib striplib file_magic_cmd export_symbols_cmds \
+    deplibs_check_method allow_undefined_flag no_undefined_flag \
+    finish_cmds finish_eval global_symbol_pipe global_symbol_to_cdecl \
+    hardcode_libdir_flag_spec hardcode_libdir_separator  \
+    sys_lib_search_path_spec sys_lib_dlsearch_path_spec \
+    compiler_c_o need_locks exclude_expsyms include_expsyms; do
+
+    case $var in
+    reload_cmds | old_archive_cmds | old_archive_from_new_cmds | \
+    old_postinstall_cmds | old_postuninstall_cmds | \
+    export_symbols_cmds | archive_cmds | archive_expsym_cmds | \
+    extract_expsyms_cmds | old_archive_from_expsyms_cmds | \
+    postinstall_cmds | postuninstall_cmds | \
+    finish_cmds | sys_lib_search_path_spec | sys_lib_dlsearch_path_spec)
+      # Double-quote double-evaled strings.
+      eval "$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$double_quote_subst\" -e \"\$sed_quote_subst\" -e \"\$delay_variable_subst\"\`\\\"" ### testsuite: skip nested quoting test
+      ;;
+    *)
+      eval "$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$sed_quote_subst\"\`\\\"" ### testsuite: skip nested quoting test
+      ;;
+    esac
+  done
+
+  case $ltecho in
+  *'\$0 --fallback-echo"')
+    ltecho=`$echo "X$ltecho" | $Xsed -e 's/\\\\\\\$0 --fallback-echo"$/$0 --fallback-echo"/'`
+    ;;
+  esac
+
+  if test -z "$tagname"; then
+    trap "$rm \"$ofile\"; exit 1" 1 2 15
+    echo "creating $ofile"
+    $rm "$ofile"
+    cat <<EOF > "$ofile"
+#! $SHELL
+
+# `$echo "$ofile" | sed 's%^.*/%%'` - Provide generalized library-building support services.
+# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP)
+# NOTE: Changes made to this file will be lost: look at ltconfig or ltmain.sh.
+#
+# Copyright (C) 1996-2000 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Sed that helps us avoid accidentally triggering echo(1) options like -n.
+Xsed="sed -e s/^X//"
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X\${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+# The names of the tagged configurations supported by this script.
+available_tags=
+
+### BEGIN LIBTOOL CONFIG
+EOF
+  else
+    echo "appending configuration tag \"$tagname\" to $ofile"
+    echo "### BEGIN LIBTOOL TAG CONFIG: $tagname" >> "$ofile"
+  fi
+  cfgfile="$ofile"
+  ;;
+
+*)
+  # Wrap the value of each saved old_* variable in literal double quotes (for aesthetics).
+  for var in old_AR old_AR_FLAGS old_CC old_LTCC old_CFLAGS old_CPPFLAGS \
+    old_MAGIC_CMD old_LD old_LDFLAGS old_LIBS \
+    old_LN_S old_NM old_RANLIB old_STRIP \
+    old_AS old_DLLTOOL old_OBJDUMP \
+    old_OBJEXT old_EXEEXT old_reload_flag \
+    old_deplibs_check_method old_file_magic_cmd; do
+    eval "$var=\\\"\$$var\\\""   # \$$var yields the variable's value; \$var would store its own name
+  done
+
+  # $ltmain is not a .sh script: just create a config file, $ofile.cfg.
+  cfgfile="$ofile.cfg"
+  if test -z "$tagname"; then
+    trap "$rm \"$cfgfile\"; exit 1" 1 2 15
+    echo "creating $cfgfile"
+    $rm "$cfgfile"
+    cat <<EOF > "$cfgfile"
+# `$echo "$cfgfile" | sed 's%^.*/%%'` - Libtool configuration file.
+# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP)
+
+### BEGIN LIBTOOL CONFIG
+EOF
+  else
+    echo "appending to $cfgfile"
+    echo "### BEGIN LIBTOOL TAG CONFIG: $tagname" >> "$cfgfile"   # same file the message names
+  fi
+  ;;
+esac
+
+cat <<EOF >> "$cfgfile"
+# Libtool was configured as follows, on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
+#
+# AR=$old_AR AR_FLAGS=$old_AR_FLAGS LTCC=$old_LTCC CC=$old_CC \\
+# CFLAGS=$old_CFLAGS CPPFLAGS=$old_CPPFLAGS \\
+# MAGIC_CMD=$old_MAGIC_CMD LD=$old_LD LDFLAGS=$old_LDFLAGS LIBS=$old_LIBS \\
+# LN_S=$old_LN_S NM=$old_NM RANLIB=$old_RANLIB STRIP=$old_STRIP \\
+# AS=$old_AS DLLTOOL=$old_DLLTOOL OBJDUMP=$old_OBJDUMP \\
+# objext=$old_OBJEXT exeext=$old_EXEEXT reload_flag=$old_reload_flag \\
+# deplibs_check_method=$old_deplibs_check_method \\
+# file_magic_cmd=$old_file_magic_cmd \\
+#   $0$ltconfig_args
+#
+# Compiler and other test output produced by $progname, useful for
+# debugging $progname, is in ./config.log if it exists.
+
+# The version of $progname that generated this script.
+LTCONFIG_VERSION=$LTCONFIG_VERSION
+
+# Shell to use when invoking shell scripts.
+SHELL=$LTSHELL
+
+# Whether or not to build shared libraries.
+build_libtool_libs=$enable_shared
+
+# Whether or not to add -lc for building shared libraries.
+build_libtool_need_lc=$need_lc
+
+# Whether or not to build static libraries.
+build_old_libs=$enable_static
+
+# Whether or not to optimize for fast installation.
+fast_install=$enable_fast_install
+
+# The host system.
+host_alias=$host_alias
+host=$host
+
+# An echo program that does not interpret backslashes.
+echo=$ltecho
+
+# The archiver.
+AR=$AR
+AR_FLAGS=$AR_FLAGS
+
+# A C compiler.
+LTCC=$LTCC
+
+# A language-specific compiler.
+CC=$CC
+
+# Is the compiler the GNU C compiler?
+with_gcc=$with_gcc
+
+# The linker used to build libraries.
+LD=$LD
+
+# Whether we need hard or soft links.
+LN_S=$LN_S
+
+# A BSD-compatible nm program.
+NM=$NM
+
+# A symbol stripping program
+STRIP=$STRIP
+
+# Used to examine libraries when file_magic_cmd begins "file"
+MAGIC_CMD=$MAGIC_CMD
+
+# Used on cygwin: DLL creation program.
+DLLTOOL="$DLLTOOL"
+
+# Used on cygwin: object dumper.
+OBJDUMP="$OBJDUMP"
+
+# Used on cygwin: assembler.
+AS="$AS"
+
+# The name of the directory that contains temporary libtool files.
+objdir=$objdir
+
+# How to create reloadable object files.
+reload_flag=$reload_flag
+reload_cmds=$reload_cmds
+
+# How to pass a linker flag through the compiler.
+wl=$wl
+
+# Object file suffix (normally "o").
+objext="$objext"
+
+# Old archive suffix (normally "a").
+libext="$libext"
+
+# Executable file suffix (normally "").
+exeext="$exeext"
+
+# Additional compiler flags for building library objects.
+pic_flag=$pic_flag
+pic_mode=$pic_mode
+
+# What is the maximum length of a command?
+max_cmd_len=$max_cmd_len
+
+# Does compiler simultaneously support -c and -o options?
+compiler_c_o=$compiler_c_o
+
+# Must we lock files when doing compilation ?
+need_locks=$need_locks
+
+# Do we need the lib prefix for modules?
+need_lib_prefix=$need_lib_prefix
+
+# Do we need a version for libraries?
+need_version=$need_version
+
+# Whether dlopen is supported.
+dlopen_support=$enable_dlopen
+
+# Whether dlopen of programs is supported.
+dlopen_self=$enable_dlopen_self
+
+# Whether dlopen of statically linked programs is supported.
+dlopen_self_static=$enable_dlopen_self_static
+
+# Compiler flag to prevent dynamic linking.
+link_static_flag=$link_static_flag
+
+# Compiler flag to turn off builtin functions.
+no_builtin_flag=$no_builtin_flag
+
+# Compiler flag to allow reflexive dlopens.
+export_dynamic_flag_spec=$export_dynamic_flag_spec
+
+# Compiler flag to generate shared objects directly from archives.
+whole_archive_flag_spec=$whole_archive_flag_spec
+
+# Compiler flag to generate thread-safe objects.
+thread_safe_flag_spec=$thread_safe_flag_spec
+
+# Library versioning type.
+version_type=$version_type
+
+# Format of library name prefix.
+libname_spec=$libname_spec
+
+# List of archive names.  First name is the real one, the rest are links.
+# The last name is the one that the linker finds with -lNAME.
+library_names_spec=$library_names_spec
+
+# The coded name of the library, if different from the real name.
+soname_spec=$soname_spec
+
+# Commands used to build and install an old-style archive.
+RANLIB=$RANLIB
+old_archive_cmds=$old_archive_cmds
+old_postinstall_cmds=$old_postinstall_cmds
+old_postuninstall_cmds=$old_postuninstall_cmds
+
+# Create an old-style archive from a shared archive.
+old_archive_from_new_cmds=$old_archive_from_new_cmds
+
+# Create a temporary old-style archive to link instead of a shared archive.
+old_archive_from_expsyms_cmds=$old_archive_from_expsyms_cmds
+
+# Commands used to build and install a shared archive.
+archive_cmds=$archive_cmds
+archive_expsym_cmds=$archive_expsym_cmds
+postinstall_cmds=$postinstall_cmds
+postuninstall_cmds=$postuninstall_cmds
+
+# Commands to strip libraries.
+old_striplib=$old_striplib
+striplib=$striplib
+
+# Dependencies to place before the objects being linked to create a
+# shared library.
+predep_objects=$predep_objects
+
+# Dependencies to place after the objects being linked to create a
+# shared library.
+postdep_objects=$postdep_objects
+
+# Dependencies to place before the objects being linked to create a
+# shared library.
+predeps=$predeps
+
+# Dependencies to place after the objects being linked to create a
+# shared library.
+postdeps=$postdeps
+
+# The library search path used internally by the compiler when linking
+# a shared library.
+compiler_lib_search_path=$compiler_lib_search_path
+
+# Method to check whether dependent libraries are shared objects.
+deplibs_check_method=$deplibs_check_method
+
+# Command to use when deplibs_check_method == file_magic.
+file_magic_cmd=$file_magic_cmd
+
+# Flag that allows shared libraries with undefined symbols to be built.
+allow_undefined_flag=$allow_undefined_flag
+
+# Flag that forces no undefined symbols.
+no_undefined_flag=$no_undefined_flag
+
+# Commands used to finish a libtool library installation in a directory.
+finish_cmds=$finish_cmds
+
+# Same as above, but a single script fragment to be evaled but not shown.
+finish_eval=$finish_eval
+
+# Take the output of nm and produce a listing of raw symbols and C names.
+global_symbol_pipe=$global_symbol_pipe
+
+# Transform the output of nm in a proper C declaration
+global_symbol_to_cdecl=$global_symbol_to_cdecl
+
+# This is the shared library runtime path variable.
+runpath_var=$runpath_var
+
+# This is the shared library path variable.
+shlibpath_var=$shlibpath_var
+
+# Is shlibpath searched before the hard-coded library search path?
+shlibpath_overrides_runpath=$shlibpath_overrides_runpath
+
+# How to hardcode a shared library path into an executable.
+hardcode_action=$hardcode_action
+
+# Whether we should hardcode library paths into libraries.
+hardcode_into_libs=$hardcode_into_libs
+
+# Flag to hardcode \$libdir into a binary during linking.
+# This must work even if \$libdir does not exist.
+hardcode_libdir_flag_spec=$hardcode_libdir_flag_spec
+
+# Whether we need a single -rpath flag with a separated argument.
+hardcode_libdir_separator=$hardcode_libdir_separator
+
+# Set to yes if using DIR/libNAME.so during linking hardcodes DIR into the
+# resulting binary.
+hardcode_direct=$hardcode_direct
+
+# Set to yes if using the -LDIR flag during linking hardcodes DIR into the
+# resulting binary.
+hardcode_minus_L=$hardcode_minus_L
+
+# Set to yes if using SHLIBPATH_VAR=DIR during linking hardcodes DIR into
+# the resulting binary.
+hardcode_shlibpath_var=$hardcode_shlibpath_var
+
+# Variables whose values should be saved in libtool wrapper scripts and
+# restored at relink time.
+variables_saved_for_relink="$variables_saved_for_relink"
+
+# Whether libtool must link a program against all its dependency libraries.
+link_all_deplibs=$link_all_deplibs
+
+# Compile-time system search path for libraries
+sys_lib_search_path_spec=$sys_lib_search_path_spec
+
+# Run-time system search path for libraries
+sys_lib_dlsearch_path_spec=$sys_lib_dlsearch_path_spec
+
+# Fix the shell variable \$srcfile for the compiler.
+fix_srcfile_path="$fix_srcfile_path"
+
+# Set to yes if exported symbols are required.
+always_export_symbols=$always_export_symbols
+
+# The commands to list exported symbols.
+export_symbols_cmds=$export_symbols_cmds
+
+# The commands to extract the exported symbol list from a shared archive.
+extract_expsyms_cmds=$extract_expsyms_cmds
+
+# Symbols that should not be listed in the preloaded symbols.
+exclude_expsyms=$exclude_expsyms
+
+# Symbols that must always be exported.
+include_expsyms=$include_expsyms
+
+EOF
+
+if test -z "$tagname"; then   # close the section in $cfgfile, the file the settings were appended to
+  echo '### END LIBTOOL CONFIG' >> "$cfgfile"
+else
+  echo "### END LIBTOOL TAG CONFIG: $tagname" >> "$cfgfile"
+fi
+
+case $ltmain in
+*.sh)
+  echo >> "$ofile"
+  if test -z "$tagname"; then
+    case $host_os in
+    aix3*)
+      cat <<\EOF >> "$ofile"
+
+# AIX sometimes has problems with the GCC collect2 program.  For some
+# reason, if we set the COLLECT_NAMES environment variable, the problems
+# vanish in a puff of smoke.
+if test "X${COLLECT_NAMES+set}" != Xset; then
+  COLLECT_NAMES=
+  export COLLECT_NAMES
+fi
+EOF
+      ;;
+    esac
+    case $host in
+    *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+      cat <<'EOF' >> "$ofile"
+      # This is a source program that is used to create dlls on Windows
+      # Don't remove nor modify the starting and closing comments
+# /* ltdll.c starts here */
+# #define WIN32_LEAN_AND_MEAN
+# #include <windows.h>
+# #undef WIN32_LEAN_AND_MEAN
+# #include <stdio.h>
+#
+# #ifndef __CYGWIN__
+# #  ifdef __CYGWIN32__
+# #    define __CYGWIN__ __CYGWIN32__
+# #  endif
+# #endif
+#
+# #ifdef __cplusplus
+# extern "C" {
+# #endif
+# BOOL APIENTRY DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved);
+# #ifdef __cplusplus
+# }
+# #endif
+#
+# #ifdef __CYGWIN__
+# #include <cygwin/cygwin_dll.h>
+# DECLARE_CYGWIN_DLL( DllMain );
+# #endif
+# HINSTANCE __hDllInstance_base;
+#
+# BOOL APIENTRY
+# DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved)
+# {
+#   __hDllInstance_base = hInst;
+#   return TRUE;
+# }
+# /* ltdll.c ends here */
+      # This is a source program that is used to create import libraries
+      # on Windows for dlls which lack them. Don't remove nor modify the
+      # starting and closing comments
+# /* impgen.c starts here */
+# /*   Copyright (C) 1999-2000 Free Software Foundation, Inc.
+#
+#  This file is part of GNU libtool.
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#  */
+#
+#  #include <stdio.h>          /* for printf() */
+#  #include <unistd.h>         /* for open(), lseek(), read() */
+#  #include <fcntl.h>          /* for O_RDONLY, O_BINARY */
+#  #include <string.h>         /* for strdup() */
+#
+#  /* O_BINARY isn't required (or even defined sometimes) under Unix */
+#  #ifndef O_BINARY
+#  #define O_BINARY 0
+#  #endif
+#
+#  static unsigned int
+#  pe_get16 (fd, offset)
+#       int fd;
+#       int offset;
+#  {
+#    unsigned char b[2];
+#    lseek (fd, offset, SEEK_SET);
+#    read (fd, b, 2);
+#    return b[0] + (b[1]<<8);
+#  }
+#
+#  static unsigned int
+#  pe_get32 (fd, offset)
+#      int fd;
+#      int offset;
+#  {
+#    unsigned char b[4];
+#    lseek (fd, offset, SEEK_SET);
+#    read (fd, b, 4);
+#    return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+#  }
+#
+#  static unsigned int
+#  pe_as32 (ptr)
+#       void *ptr;
+#  {
+#    unsigned char *b = ptr;
+#    return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+#  }
+#
+#  int
+#  main (argc, argv)
+#      int argc;
+#      char *argv[];
+#  {
+#      int dll;
+#      unsigned long pe_header_offset, opthdr_ofs, num_entries, i;
+#      unsigned long export_rva, export_size, nsections, secptr, expptr;
+#      unsigned long name_rvas, nexp;
+#      unsigned char *expdata, *erva;
+#      char *filename, *dll_name;
+#
+#      filename = argv[1];
+#
+#      dll = open(filename, O_RDONLY|O_BINARY);
+#      if (dll < 1)
+#      return 1;
+#
+#      dll_name = filename;
+#
+#      for (i=0; filename[i]; i++)
+#      if (filename[i] == '/' || filename[i] == '\\'  || filename[i] == ':')
+#          dll_name = filename + i +1;
+#
+#      pe_header_offset = pe_get32 (dll, 0x3c);
+#      opthdr_ofs = pe_header_offset + 4 + 20;
+#      num_entries = pe_get32 (dll, opthdr_ofs + 92);
+#
+#      if (num_entries < 1) /* no exports */
+#      return 1;
+#
+#      export_rva = pe_get32 (dll, opthdr_ofs + 96);
+#      export_size = pe_get32 (dll, opthdr_ofs + 100);
+#      nsections = pe_get16 (dll, pe_header_offset + 4 +2);
+#      secptr = (pe_header_offset + 4 + 20 +
+#            pe_get16 (dll, pe_header_offset + 4 + 16));
+#
+#      expptr = 0;
+#      for (i = 0; i < nsections; i++)
+#      {
+#      char sname[8];
+#      unsigned long secptr1 = secptr + 40 * i;
+#      unsigned long vaddr = pe_get32 (dll, secptr1 + 12);
+#      unsigned long vsize = pe_get32 (dll, secptr1 + 16);
+#      unsigned long fptr = pe_get32 (dll, secptr1 + 20);
+#      lseek(dll, secptr1, SEEK_SET);
+#      read(dll, sname, 8);
+#      if (vaddr <= export_rva && vaddr+vsize > export_rva)
+#      {
+#          expptr = fptr + (export_rva - vaddr);
+#          if (export_rva + export_size > vaddr + vsize)
+#              export_size = vsize - (export_rva - vaddr);
+#          break;
+#      }
+#      }
+#
+#      expdata = (unsigned char*)malloc(export_size);
+#      lseek (dll, expptr, SEEK_SET);
+#      read (dll, expdata, export_size);
+#      erva = expdata - export_rva;
+#
+#      nexp = pe_as32 (expdata+24);
+#      name_rvas = pe_as32 (expdata+32);
+#
+#      printf ("EXPORTS\n");
+#      for (i = 0; i<nexp; i++)
+#      {
+#      unsigned long name_rva = pe_as32 (erva+name_rvas+i*4);
+#      printf ("\t%s @ %ld ;\n", erva+name_rva, 1+ i);
+#      }
+#
+#      return 0;
+#  }
+# /* impgen.c ends here */
+
+EOF
+    ;;
+  esac
+
+
+    # Append the ltmain.sh script; on failure remove $ofile and abort the whole script.
+    sed '$q' "$ltmain" >> "$ofile" || { rm -f "$ofile"; exit 1; }
+    # We use sed instead of cat because bash on DJGPP gets confused if
+    # it finds mixed CR/LF and LF-only lines.  Since sed operates in
+    # text mode, it properly converts lines to CR/LF.  This bash problem
+    # is reportedly fixed, but why not run on old versions too?
+
+    chmod +x "$ofile"
+  fi
+  ;;
+
+*)
+  # Compile the libtool program.
+  echo "FIXME: would compile $ltmain"
+  ;;
+esac
+
+# Update the list of available tags (only when a tagged configuration was appended).
+if test -n "$tagname"; then
+
+  # Extract list of available tagged configurations in $ofile.
+  # Note that this assumes the entire list is on one line.
+  available_tags=`grep "^available_tags=" "$ofile" | sed -e 's/available_tags=\(.*$\)/\1/' -e 's/\"//g'`
+
+  # Append the new tag name to the list of available tags.
+  available_tags="$available_tags $tagname"
+
+  # Now substitute the updated list of available tags ($ofile is quoted so odd paths survive).
+  if sed -e "s/^available_tags=.*\$/available_tags=\"$available_tags\"/" "$ofile" > "$ofile.new"; then
+    mv "$ofile.new" "$ofile"
+    chmod +x "$ofile"
+  else
+    rm -f "$ofile.new"
+    echo "$progname: unable to update list of available tagged configurations."
+    exit 1
+  fi
+fi
+
+# Don't cache tagged configuration!
+test -n "$cache_file" && test -z "$tagname" || exit 0
+
+# AC_CACHE_SAVE
+trap '' 1 2 15
+cat > confcache <<\EOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs.  It is not useful on other systems.
+# If it contains results you don't want to keep, you may remove or edit it.
+#
+# By default, configure uses ./config.cache as the cache file,
+# creating it if it does not exist already.  You can give configure
+# the --cache-file=FILE option to use a different cache file; that is
+# what configure does when it calls configure scripts in
+# subdirectories, so they share the cache.
+# Giving --cache-file=/dev/null disables caching, for debugging configure.
+# config.status only pays attention to the cache file if you give it the
+# --recheck option to rerun configure.
+#
+EOF
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(set) 2>&1 |
+  case `(ac_space=' '; set | grep ac_space) 2>&1` in
+  *ac_space=\ *)
+    # `set' does not quote correctly, so add quotes (double-quote substitution
+    # turns \\\\ into \\, and sed turns \\ into \).
+    sed -n \
+      -e "s/'/'\\\\''/g" \
+      -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
+    ;;
+  *)
+    # `set' quotes correctly as required by POSIX, so do not add quotes.
+    sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
+    ;;
+  esac >> confcache
+if cmp -s $cache_file confcache; then
+  :
+else
+  if test -w $cache_file; then
+    echo "updating cache $cache_file"
+    cat confcache > $cache_file
+  else
+    echo "not updating unwritable cache $cache_file"
+  fi
+fi
+rm -f confcache
+
+exit 0
+
+# Local Variables:
+# mode:shell-script
+# sh-indentation:2
+# End:
diff --git a/config/ltmain.sh b/config/ltmain.sh

new file mode 100644 (file)

index 0000000..25e0cf9
--- /dev/null
+++ b/config/ltmain.sh
@@ -0,0 +1,5463 @@
+# ltmain.sh - Provide generalized library-building support services.
+# NOTE: Changing this file will not affect anything until you rerun ltconfig.
+#
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001
+# Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Check that we have a working $echo.
+if test "X$1" = X--no-reexec; then
+  # Discard the --no-reexec flag, and continue.
+  shift
+elif test "X$1" = X--fallback-echo; then
+  # Avoid inline document here, it may be left over
+  :
+elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then
+  # Yippee, $echo works!
+  :
+else
+  # Restart under the correct shell, and then maybe $echo will work.
+  exec $SHELL "$0" --no-reexec ${1+"$@"}
+fi
+
+if test "X$1" = X--fallback-echo; then
+  # used as fallback echo
+  shift
+  cat <<EOF
+$*
+EOF
+  exit 0
+fi
+
+# The name of this program.
+progname=`$echo "$0" | sed 's%^.*/%%'`
+modename="$progname"
+
+# Constants.
+PROGRAM=ltmain.sh
+PACKAGE=libtool
+VERSION=1.4a
+TIMESTAMP=" (1.641.2.255 2001/05/22 10:39:30)"
+
+default_mode=
+help="Try \`$progname --help' for more information."
+magic="%%%MAGIC variable%%%"
+mkdir="mkdir"
+mv="mv -f"
+rm="rm -f"
+
+# Sed substitution that helps us do robust quoting.  It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e 1s/^X//'
+sed_quote_subst='s/\([\\`\\"$\\\\]\)/\\\1/g'
+SP2NL='tr \040 \012'
+NL2SP='tr \015\012 \040\040'
+
+# NLS nuisances.
+# Only set LANG and LC_ALL to C if already set.
+# These must not be set unconditionally because not all systems understand
+# e.g. LANG=C (notably SCO).
+# We save the old values to restore during execute mode.
+if test "${LC_ALL+set}" = set; then
+  save_LC_ALL="$LC_ALL"; LC_ALL=C; export LC_ALL
+fi
+if test "${LANG+set}" = set; then
+  save_LANG="$LANG"; LANG=C; export LANG
+fi
+
+if test "$LTCONFIG_VERSION" != "$VERSION"; then
+  echo "$modename: ltconfig version \`$LTCONFIG_VERSION' does not match $PROGRAM version \`$VERSION'" 1>&2
+  echo "Fatal configuration error.  See the $PACKAGE docs for more information." 1>&2
+  exit 1
+fi
+
+if test "$build_libtool_libs" != yes && test "$build_old_libs" != yes; then
+  echo "$modename: not configured to build any kind of library" 1>&2
+  echo "Fatal configuration error.  See the $PACKAGE docs for more information." 1>&2
+  exit 1
+fi
+
+# Global variables.
+mode=$default_mode
+nonopt=
+prev=
+prevopt=
+run=
+show="$echo"
+show_help=
+execute_dlfiles=
+lo2o="s/\\.lo\$/.${objext}/"
+o2lo="s/\\.${objext}\$/.lo/"
+
+# Parse our command line options once, thoroughly.
+while test $# -gt 0
+do
+  arg="$1"
+  shift
+
+  case $arg in
+  -*=*) optarg=`$echo "X$arg" | $Xsed -e 's/[-_a-zA-Z0-9]*=//'` ;;
+  *) optarg= ;;
+  esac
+
+  # If the previous option needs an argument, assign it.
+  if test -n "$prev"; then
+    case $prev in
+    execute_dlfiles)
+      execute_dlfiles="$execute_dlfiles $arg"
+      ;;
+    tag)
+      tagname="$arg"
+
+      # Reject tag names containing anything other than letters, digits, '-', '_', ',' or '/'.
+      case $tagname in
+      *[!-_A-Za-z0-9,/]*)
+       echo "$progname: invalid tag name: $tagname" 1>&2
+       exit 1
+        ;;
+      esac
+
+      case $tagname in
+      CC)
+       # Don't test for the "default" C tag: we know it's there, but it is
+       # not specially marked.
+       ;;
+      *)
+        if grep "^### BEGIN LIBTOOL TAG CONFIG: $tagname$" < "$0" > /dev/null; then
+          taglist="$taglist $tagname"
+         # Evaluate the tag's configuration: the lines of this script between its BEGIN and END markers.
+         eval "`sed -n -e '/^### BEGIN LIBTOOL TAG CONFIG: '$tagname'$/,/^### END LIBTOOL TAG CONFIG: '$tagname'$/p' < $0`"
+        else
+         echo "$progname: ignoring unknown tag $tagname" 1>&2
+        fi
+        ;;
+      esac
+      ;;
+    *)
+      eval "$prev=\$arg"
+      ;;
+    esac
+
+    prev=
+    prevopt=
+    continue
+  fi
+
+  # Have we seen a non-optional argument yet?
+  case $arg in
+  --help)
+    show_help=yes
+    ;;
+
+  --version)
+    echo "$PROGRAM (GNU $PACKAGE) $VERSION$TIMESTAMP"
+    exit 0
+    ;;
+
+  --config)
+    sed -n -e '/^### BEGIN LIBTOOL CONFIG/,/^### END LIBTOOL CONFIG/p' < "$0"
+    # Now print the configurations for the tags.
+    for tagname in $taglist; do
+      sed -n -e "/^### BEGIN LIBTOOL TAG CONFIG: $tagname$/,/^### END LIBTOOL TAG CONFIG: $tagname$/p" < "$0"
+    done
+    exit 0
+    ;;
+
+  --debug)
+    echo "$progname: enabling shell trace mode"
+    set -x
+    ;;
+
+  --dry-run | -n)
+    run=:
+    ;;
+
+  --features)
+    echo "host: $host"
+    if test "$build_libtool_libs" = yes; then
+      echo "enable shared libraries"
+    else
+      echo "disable shared libraries"
+    fi
+    if test "$build_old_libs" = yes; then
+      echo "enable static libraries"
+    else
+      echo "disable static libraries"
+    fi
+    exit 0
+    ;;
+
+  --finish) mode="finish" ;;
+
+  --mode) prevopt="--mode" prev=mode ;;
+  --mode=*) mode="$optarg" ;;
+
+  --quiet | --silent)
+    show=:
+    ;;
+
+  --tag) prevopt="--tag" prev=tag ;;
+  --tag=*)
+    set tag "$optarg" ${1+"$@"}
+    shift
+    prev=tag
+    ;;
+
+  -dlopen)
+    prevopt="-dlopen"
+    prev=execute_dlfiles
+    ;;
+
+  -*)
+    $echo "$modename: unrecognized option \`$arg'" 1>&2
+    $echo "$help" 1>&2
+    exit 1
+    ;;
+
+  *)
+    nonopt="$arg"
+    break
+    ;;
+  esac
+done
+
+if test -n "$prevopt"; then
+  $echo "$modename: option \`$prevopt' requires an argument" 1>&2
+  $echo "$help" 1>&2
+  exit 1
+fi
+
+# If this variable is set in any of the actions, the command in it
+# will be execed at the end.  This prevents here-documents from being
+# left over by shells.
+exec_cmd=
+
+if test -z "$show_help"; then
+
+  # Infer the operation mode.
+  if test -z "$mode"; then
+    case $nonopt in
+    *cc | *++ | gcc* | *-gcc*)
+      mode=link
+      for arg
+      do
+       case $arg in
+       -c)
+          mode=compile
+          break
+          ;;
+       esac
+      done
+      ;;
+    *db | *dbx | *strace | *truss)
+      mode=execute
+      ;;
+    *install*|cp|mv)
+      mode=install
+      ;;
+    *rm)
+      mode=uninstall
+      ;;
+    *)
+      # If we have no mode, but dlfiles were specified, then do execute mode.
+      test -n "$execute_dlfiles" && mode=execute
+
+      # Just use the default operation mode.
+      if test -z "$mode"; then
+       if test -n "$nonopt"; then
+         $echo "$modename: warning: cannot infer operation mode from \`$nonopt'" 1>&2
+       else
+         $echo "$modename: warning: cannot infer operation mode without MODE-ARGS" 1>&2
+       fi
+      fi
+      ;;
+    esac
+  fi
+
+  # Only execute mode is allowed to have -dlopen flags.
+  if test -n "$execute_dlfiles" && test "$mode" != execute; then
+    $echo "$modename: unrecognized option \`-dlopen'" 1>&2
+    $echo "$help" 1>&2
+    exit 1
+  fi
+
+  # Change the help message to a mode-specific one.
+  generic_help="$help"
+  help="Try \`$modename --help --mode=$mode' for more information."
+
+  # These modes are in order of execution frequency so that they run quickly.
+  case $mode in
+  # libtool compile mode
+  compile)
+    modename="$modename: compile"
+    # Get the compilation command and the source file.
+    base_compile=
+    prev=
+    lastarg=
+    srcfile="$nonopt"
+    suppress_output=
+
+    user_target=no
+    for arg
+    do
+      case $prev in
+      "") ;;
+      xcompiler)
+       # Aesthetically quote the previous argument.
+       prev=
+       lastarg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+
+       case $lastarg in
+       # Double-quote the escaped copy if it contains other shell
+       # metacharacters.  Many Bourne shells cannot handle close brackets
+       # correctly in scan sets, so we specify it separately.
+       *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \   ]*|*]*|"")
+         lastarg="\"$lastarg\""
+         ;;
+       esac
+
+       # Add the previous argument to base_compile.
+       if test -z "$base_compile"; then
+         base_compile="$lastarg"
+       else
+         base_compile="$base_compile $lastarg"
+       fi
+       continue
+       ;;
+      esac
+
+      # Accept any command-line options.
+      case $arg in
+      -o)
+       if test "$user_target" != "no"; then
+         $echo "$modename: you cannot specify \`-o' more than once" 1>&2
+         exit 1
+       fi
+       user_target=next
+       ;;
+
+      -static)
+       build_old_libs=yes
+       continue
+       ;;
+
+      -prefer-pic)
+       pic_mode=yes
+       continue
+       ;;
+
+      -prefer-non-pic)
+       pic_mode=no
+       continue
+       ;;
+
+      -Xcompiler)
+       prev=xcompiler
+       continue
+       ;;
+
+      -Wc,*)
+       args=`$echo "X$arg" | $Xsed -e "s/^-Wc,//"`
+       lastarg=
+       IFS="${IFS=     }"; save_ifs="$IFS"; IFS=','
+       for arg in $args; do
+         IFS="$save_ifs"
+
+         # Double-quote args containing other shell metacharacters.
+         # Many Bourne shells cannot handle close brackets correctly
+         # in scan sets, so we specify it separately.
+         case $arg in
+           *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \       ]*|*]*|"")
+           arg="\"$arg\""
+           ;;
+         esac
+         lastarg="$lastarg $arg"
+       done
+       IFS="$save_ifs"
+       lastarg=`$echo "X$lastarg" | $Xsed -e "s/^ //"`
+
+       # Add the arguments to base_compile.
+       if test -z "$base_compile"; then
+         base_compile="$lastarg"
+       else
+         base_compile="$base_compile $lastarg"
+       fi
+       continue
+       ;;
+      esac
+
+      case $user_target in
+      next)
+       # The next one is the -o target name
+       user_target=yes
+       continue
+       ;;
+      yes)
+       # We got the output file
+       user_target=set
+       libobj="$arg"
+       continue
+       ;;
+      esac
+
+      # Accept the current argument as the source file.
+      lastarg="$srcfile"
+      srcfile="$arg"
+
+      # Aesthetically quote the previous argument.
+
+      # Backslashify any backslashes, double quotes, and dollar signs.
+      # These are the only characters that are still specially
+      # interpreted inside of double-quoted strings.
+      lastarg=`$echo "X$lastarg" | $Xsed -e "$sed_quote_subst"`
+
+      # Double-quote args containing other shell metacharacters.
+      # Many Bourne shells cannot handle close brackets correctly
+      # in scan sets, so we specify it separately.
+      case $lastarg in
+      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \    ]*|*]*|"")
+       lastarg="\"$lastarg\""
+       ;;
+      esac
+
+      # Add the previous argument to base_compile.
+      if test -z "$base_compile"; then
+       base_compile="$lastarg"
+      else
+       base_compile="$base_compile $lastarg"
+      fi
+    done
+
+    case $user_target in
+    set)
+      ;;
+    no)
+      # Get the name of the library object.
+      libobj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%'`
+      ;;
+    *)
+      $echo "$modename: you must specify a target with \`-o'" 1>&2
+      exit 1
+      ;;
+    esac
+
+    # Recognize several different file suffixes.
+    # If the user specifies -o file.o, it is replaced with file.lo
+    xform='[cCFSfmso]'
+    case $libobj in
+    *.ada) xform=ada ;;
+    *.adb) xform=adb ;;
+    *.ads) xform=ads ;;
+    *.asm) xform=asm ;;
+    *.c++) xform=c++ ;;
+    *.cc) xform=cc ;;
+    *.class) xform=class ;;
+    *.cpp) xform=cpp ;;
+    *.cxx) xform=cxx ;;
+    *.f90) xform=f90 ;;
+    *.for) xform=for ;;
+    *.java) xform=java ;;
+    esac
+
+    libobj=`$echo "X$libobj" | $Xsed -e "s/\.$xform$/.lo/"`
+
+    case $libobj in
+    *.lo) obj=`$echo "X$libobj" | $Xsed -e "$lo2o"` ;;
+    *)
+      $echo "$modename: cannot determine name of library object from \`$libobj'" 1>&2
+      exit 1
+      ;;
+    esac
+
+    # Infer tagged configuration to use if any are available and
+    # if one wasn't chosen via the "--tag" command line option.
+    # Only attempt this if the compiler in the base compile
+    # command doesn't match the default compiler.
+    if test -n "$available_tags" && test -z "$tagname"; then
+      case $base_compile in
+      "$CC "*) ;;
+      # Blanks in the command may have been stripped by the calling shell,
+      # but not from the CC environment variable when ltconfig was run.
+      "`$echo $CC` "*) ;;
+      *)
+        for z in $available_tags; do
+          if grep "^### BEGIN LIBTOOL TAG CONFIG: $z$" < "$0" > /dev/null; then
+           # Evaluate the configuration.
+           eval "`sed -n -e '/^### BEGIN LIBTOOL TAG CONFIG: '$z'$/,/^### END LIBTOOL TAG CONFIG: '$z'$/p' < $0`"
+            case $base_compile in
+           "$CC "*)
+              # The compiler in the base compile command matches
+              # the one in the tagged configuration.
+              # Assume this is the tagged configuration we want.
+              tagname=$z
+              break
+              ;;
+           "`$echo $CC` "*)
+             tagname=$z
+             break
+             ;;
+           esac
+          fi
+        done
+        # If $tagname still isn't set, then no tagged configuration
+        # was found; tell the user (on stderr, like the other
+        # diagnostics here) that the "--tag" option must be used.
+        if test -z "$tagname"; then
+          $echo "$modename: unable to infer tagged configuration" 1>&2
+          $echo "$modename: specify a tag with \`--tag'" 1>&2
+         exit 1
+#        else
+#          echo "$modename: using $tagname tagged configuration"
+        fi
+       ;;
+      esac
+    fi
+
+    objname=`$echo "X$obj" | $Xsed -e 's%^.*/%%'`
+    xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'`
+    if test "X$xdir" = "X$obj"; then
+      xdir=
+    else
+      xdir=$xdir/
+    fi
+    lobj=${xdir}$objdir/$objname
+
+    if test -z "$base_compile"; then
+      $echo "$modename: you must specify a compilation command" 1>&2
+      $echo "$help" 1>&2
+      exit 1
+    fi
+
+    # Delete any leftover library objects.
+    if test "$build_old_libs" = yes; then
+      removelist="$obj $lobj $libobj ${libobj}T"
+    else
+      removelist="$lobj $libobj ${libobj}T"
+    fi
+
+    $run $rm $removelist
+    trap "$run $rm $removelist; exit 1" 1 2 15
+
+    # On Cygwin there's no "real" PIC flag so we must build both object types
+    case $host_os in
+    cygwin* | mingw* | pw32* | os2*)
+      pic_mode=default
+      ;;
+    esac
+    if test "$pic_mode" = no && test "$deplibs_check_method" != pass_all; then
+      # non-PIC code in shared libraries is not supported
+      pic_mode=default
+    fi
+
+    # Calculate the filename of the output object if compiler does
+    # not support -o with -c
+    if test "$compiler_c_o" = no; then
+      output_obj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.${objext}
+      lockfile="$output_obj.lock"
+      removelist="$removelist $output_obj $lockfile"
+      trap "$run $rm $removelist; exit 1" 1 2 15
+    else
+      output_obj=
+      need_locks=no
+      lockfile=
+    fi
+
+    # Lock this critical section if it is needed
+    # We use this script file to make the link, it avoids creating a new file
+    if test "$need_locks" = yes; then
+      until $run ln "$0" "$lockfile" 2>/dev/null; do
+       $show "Waiting for $lockfile to be removed"
+       sleep 2
+      done
+    elif test "$need_locks" = warn; then
+      if test -f "$lockfile"; then
+       echo "\
+*** ERROR, $lockfile exists and contains:
+`cat $lockfile 2>/dev/null`
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together.  If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+       $run $rm $removelist
+       exit 1
+      fi
+      $echo "$srcfile" > "$lockfile"
+    fi
+
+    if test -n "$fix_srcfile_path"; then
+      eval srcfile=\"$fix_srcfile_path\"
+    fi
+
+    $run $rm "$libobj" "${libobj}T"
+
+    # Create a libtool object file (analogous to a ".la" file),
+    # but don't create it if we're doing a dry run.
+    test -z "$run" && cat > ${libobj}T <<EOF
+# $libobj - a libtool object file
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# Name of the PIC object.
+EOF
+
+    # Only build a PIC object if we are building libtool libraries.
+    if test "$build_libtool_libs" = yes; then
+      # Without this assignment, base_compile gets emptied.
+      fbsd_hideous_sh_bug=$base_compile
+
+      if test "$pic_mode" != no; then
+       command="$base_compile $srcfile $pic_flag"
+      else
+       # Don't build PIC code
+       command="$base_compile $srcfile"
+      fi
+
+      if test ! -d ${xdir}$objdir; then
+       $show "$mkdir ${xdir}$objdir"
+       $run $mkdir ${xdir}$objdir
+       status=$?
+       if test $status -ne 0 && test ! -d ${xdir}$objdir; then
+         exit $status
+        fi
+      fi 
+
+      if test -z "$output_obj"; then
+        # Place PIC objects in $objdir
+        command="$command -o $lobj"
+      fi
+
+      $run $rm "$lobj" "$output_obj"
+
+      $show "$command"
+      if $run eval "$command"; then :
+      else
+       test -n "$output_obj" && $run $rm $removelist
+       exit 1
+      fi
+
+      if test "$need_locks" = warn &&
+        test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then
+       echo "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together.  If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+       $run $rm $removelist
+       exit 1
+      fi
+
+      # Just move the object if needed, then go on to compile the next one
+      if test -n "$output_obj" && test "x$output_obj" != "x$lobj"; then
+       $show "$mv $output_obj $lobj"
+       if $run $mv $output_obj $lobj; then :
+       else
+         error=$?
+         $run $rm $removelist
+         exit $error
+       fi
+      fi
+
+      # Append the name of the PIC object to the libtool object file.
+      test -z "$run" && cat >> ${libobj}T <<EOF
+pic_object='$objdir/$objname'
+
+EOF
+
+      # Allow error messages only from the first compilation.
+      suppress_output=' >/dev/null 2>&1'
+    else
+      # No PIC object so indicate it doesn't exist in the libtool
+      # object file.
+      test -z "$run" && cat >> ${libobj}T <<EOF
+pic_object=none
+
+EOF
+    fi
+
+    # Only build a position-dependent object if we build old libraries.
+    if test "$build_old_libs" = yes; then
+      if test "$pic_mode" != yes; then
+       # Don't build PIC code
+       command="$base_compile $srcfile"
+      else
+       command="$base_compile $srcfile $pic_flag"
+      fi
+      if test "$compiler_c_o" = yes; then
+       command="$command -o $obj"
+      fi
+
+      # Suppress compiler output if we already did a PIC compilation.
+      command="$command$suppress_output"
+      $run $rm "$obj" "$output_obj"
+      $show "$command"
+      if $run eval "$command"; then :
+      else
+       $run $rm $removelist
+       exit 1
+      fi
+
+      if test "$need_locks" = warn &&
+        test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then
+       echo "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together.  If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+       $run $rm $removelist
+       exit 1
+      fi
+
+      # Just move the object if needed
+      if test -n "$output_obj" && test "x$output_obj" != "x$obj"; then
+       $show "$mv $output_obj $obj"
+       if $run $mv $output_obj $obj; then :
+       else
+         error=$?
+         $run $rm $removelist
+         exit $error
+       fi
+      fi
+
+      # Append the name of the non-PIC object to the libtool object file.
+      # Only append if the libtool object file exists.
+      test -z "$run" && cat >> ${libobj}T <<EOF
+# Name of the non-PIC object.
+non_pic_object='$objname'
+
+EOF
+    else
+      # Append the name of the non-PIC object to the libtool object file.
+      # Only append if the libtool object file exists.
+      test -z "$run" && cat >> ${libobj}T <<EOF
+# Name of the non-PIC object.
+non_pic_object=none
+
+EOF
+    fi
+
+    $run $mv "${libobj}T" "${libobj}"
+
+    # Unlock the critical section if it was locked
+    if test "$need_locks" != no; then
+      $run $rm "$lockfile"
+    fi
+
+    exit 0
+    ;;
+
+  # libtool link mode
+  link | relink)
+    modename="$modename: link"
+    case $host in
+    *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+      # It is impossible to link a dll without this setting, and
+      # we shouldn't force the makefile maintainer to figure out
+      # which system we are compiling for in order to pass an extra
+      # flag for every libtool invocation.
+      # allow_undefined=no
+
+      # FIXME: Unfortunately, there are problems with the above when trying
+      # to make a dll which has undefined symbols, in which case not
+      # even a static library is built.  For now, we need to specify
+      # -no-undefined on the libtool link line when we can be certain
+      # that all symbols are satisfied, otherwise we get a static library.
+      allow_undefined=yes
+      ;;
+    *)
+      allow_undefined=yes
+      ;;
+    esac
+    libtool_args="$nonopt"
+    base_compile="$nonopt"
+    compile_command="$nonopt"
+    finalize_command="$nonopt"
+
+    compile_rpath=
+    finalize_rpath=
+    compile_shlibpath=
+    finalize_shlibpath=
+    convenience=
+    old_convenience=
+    deplibs=
+    old_deplibs=
+    compiler_flags=
+    linker_flags=
+    dllsearchpath=
+    lib_search_path=`pwd`
+
+    avoid_version=no
+    dlfiles=
+    dlprefiles=
+    dlself=no
+    export_dynamic=no
+    export_symbols=
+    export_symbols_regex=
+    generated=
+    libobjs=
+    ltlibs=
+    module=no
+    no_install=no
+    objs=
+    non_pic_objects=
+    prefer_static_libs=no
+    preload=no
+    prev=
+    prevarg=
+    release=
+    rpath=
+    xrpath=
+    perm_rpath=
+    temp_rpath=
+    thread_safe=no
+    vinfo=
+
+    # We need to know -static, to get the right output filenames.
+    for arg
+    do
+      case $arg in
+      -all-static | -static)
+       if test "X$arg" = "X-all-static"; then
+         if test "$build_libtool_libs" = yes && test -z "$link_static_flag"; then
+           $echo "$modename: warning: complete static linking is impossible in this configuration" 1>&2
+         fi
+         if test -n "$link_static_flag"; then
+           dlopen_self=$dlopen_self_static
+         fi
+       else
+         if test -z "$pic_flag" && test -n "$link_static_flag"; then
+           dlopen_self=$dlopen_self_static
+         fi
+       fi
+       build_libtool_libs=no
+       build_old_libs=yes
+       prefer_static_libs=yes
+       break
+       ;;
+      esac
+    done
+
+    # See if our shared archives depend on static archives.
+    test -n "$old_archive_from_new_cmds" && build_old_libs=yes
+
+    # Go through the arguments, transforming them on the way.
+    while test $# -gt 0; do
+      arg="$1"
+      base_compile="$base_compile $arg"
+      shift
+      case $arg in
+      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \    ]*|*]*|"")
+       qarg=\"`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`\" ### testsuite: skip nested quoting test
+       ;;
+      *) qarg=$arg ;;
+      esac
+      libtool_args="$libtool_args $qarg"
+
+      # If the previous option needs an argument, assign it.
+      if test -n "$prev"; then
+       case $prev in
+       output)
+         compile_command="$compile_command @OUTPUT@"
+         finalize_command="$finalize_command @OUTPUT@"
+         ;;
+       esac
+
+       case $prev in
+       dlfiles|dlprefiles)
+         if test "$preload" = no; then
+           # Add the symbol object into the linking commands.
+           compile_command="$compile_command @SYMFILE@"
+           finalize_command="$finalize_command @SYMFILE@"
+           preload=yes
+         fi
+         case $arg in
+         *.la | *.lo) ;;  # We handle these cases below.
+         force)
+           if test "$dlself" = no; then
+             dlself=needless
+             export_dynamic=yes
+           fi
+           prev=
+           continue
+           ;;
+         self)
+           if test "$prev" = dlprefiles; then
+             dlself=yes
+           elif test "$prev" = dlfiles && test "$dlopen_self" != yes; then
+             dlself=yes
+           else
+             dlself=needless
+             export_dynamic=yes
+           fi
+           prev=
+           continue
+           ;;
+         *)
+           if test "$prev" = dlfiles; then
+             dlfiles="$dlfiles $arg"
+           else
+             dlprefiles="$dlprefiles $arg"
+           fi
+           prev=
+           continue
+           ;;
+         esac
+         ;;
+       expsyms)
+         export_symbols="$arg"
+         if test ! -f "$arg"; then
+           $echo "$modename: symbol file \`$arg' does not exist" 1>&2
+           exit 1
+         fi
+         prev=
+         continue
+         ;;
+       expsyms_regex)
+         export_symbols_regex="$arg"
+         prev=
+         continue
+         ;;
+       release)
+         release="-$arg"
+         prev=
+         continue
+         ;;
+       objectlist)
+         if test -f "$arg"; then
+            save_arg=$arg
+           moreargs=
+           for fil in `cat $save_arg`
+           do
+#            moreargs="$moreargs $fil"
+              arg=$fil
+              # A libtool-controlled object.
+
+             # Check to see that this really is a libtool object.
+             if (sed -e '2q' $arg | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+                pic_object=
+                non_pic_object=
+
+                # Read the .lo file
+                # If there is no directory component, then add one.
+                case $arg in
+                */* | *\\*) . $arg ;;
+                *) . ./$arg ;;
+                esac
+
+                # A valid .lo defines both names and at least one is not "none".
+                # Braces are needed: sh gives || and && equal precedence.
+                if test -z "$pic_object" || test -z "$non_pic_object" ||
+                   { test "$pic_object" = none && test "$non_pic_object" = none; }; then
+                  $echo "$modename: cannot find name of object for \`$arg'" 1>&2
+                  exit 1
+                fi
+
+               # Extract subdirectory from the argument.
+               xdir=`$echo "X$arg" | $Xsed -e 's%/[^/]*$%%'`
+               if test "X$xdir" = "X$arg"; then
+                 xdir=
+               else
+                 xdir="$xdir/"
+               fi
+
+                if test "$pic_object" != none; then
+                  # Prepend the subdirectory the object is found in.
+                 pic_object="$xdir$pic_object"
+
+                 if test "$prev" = dlfiles; then
+                   if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then
+                     dlfiles="$dlfiles $pic_object"
+                     prev=
+                     continue
+                   else
+                     # If libtool objects are unsupported, then we need to preload.
+                     prev=dlprefiles
+                   fi
+                 fi
+
+                 # CHECK ME:  I think I busted this.  -Ossama
+                  if test "$prev" = dlprefiles; then
+                   # Preload the old-style object.
+                   dlprefiles="$dlprefiles $pic_object"
+                   prev=
+                  fi
+
+                  # A PIC object.
+                 libobjs="$libobjs $pic_object"
+                 arg="$pic_object"
+                fi
+
+                # Non-PIC object.
+                if test "$non_pic_object" != none; then
+                  # Prepend the subdirectory the object is found in.
+                 non_pic_object="$xdir$non_pic_object"
+
+                  # A standard non-PIC object
+                  non_pic_objects="$non_pic_objects $non_pic_object"
+                  if test -z "$pic_object" || test "$pic_object" = none ; then
+                    arg="$non_pic_object"
+                  fi
+                fi
+              else
+                # Only an error if not doing a dry-run.
+                if test -z "$run"; then
+                  $echo "$modename: \`$arg' is not a valid libtool object" 1>&2
+                  exit 1
+                else
+                  # Dry-run case.
+
+                 # Extract subdirectory from the argument.
+                 xdir=`$echo "X$arg" | $Xsed -e 's%/[^/]*$%%'`
+                 if test "X$xdir" = "X$arg"; then
+                   xdir=
+                 else
+                   xdir="$xdir/"
+                 fi
+
+                  pic_object=`$echo "X${xdir}${objdir}/${arg}" | $Xsed -e "$lo2o"`
+                  non_pic_object=`$echo "X${xdir}${arg}" | $Xsed -e "$lo2o"`
+                 libobjs="$libobjs $pic_object"
+                  non_pic_objects="$non_pic_objects $non_pic_object"
+                fi
+             fi
+           done
+         else
+           $echo "$modename: link input file \`$arg' does not exist" 1>&2
+           exit 1
+         fi
+          arg=$save_arg
+         prev=
+         continue
+         ;;
+       rpath | xrpath)
+         # We need an absolute path.
+         case $arg in
+         [\\/]* | [A-Za-z]:[\\/]*) ;;
+         *)
+           $echo "$modename: only absolute run-paths are allowed" 1>&2
+           exit 1
+           ;;
+         esac
+         if test "$prev" = rpath; then
+           case "$rpath " in
+           *" $arg "*) ;;
+           *) rpath="$rpath $arg" ;;
+           esac
+         else
+           case "$xrpath " in
+           *" $arg "*) ;;
+           *) xrpath="$xrpath $arg" ;;
+           esac
+         fi
+         prev=
+         continue
+         ;;
+       xcompiler)
+         compiler_flags="$compiler_flags $qarg"
+         prev=
+         compile_command="$compile_command $qarg"
+         finalize_command="$finalize_command $qarg"
+         continue
+         ;;
+       xlinker)
+         linker_flags="$linker_flags $qarg"
+         compiler_flags="$compiler_flags $wl$qarg"
+         prev=
+         compile_command="$compile_command $wl$qarg"
+         finalize_command="$finalize_command $wl$qarg"
+         continue
+         ;;
+       *)
+         eval "$prev=\"\$arg\""
+         prev=
+         continue
+         ;;
+       esac
+      fi # test -n $prev
+
+      prevarg="$arg"
+
+      case $arg in
+      -all-static)
+       if test -n "$link_static_flag"; then
+         compile_command="$compile_command $link_static_flag"
+         finalize_command="$finalize_command $link_static_flag"
+       fi
+       continue
+       ;;
+
+      -allow-undefined)
+       # FIXME: remove this flag sometime in the future.
+       $echo "$modename: \`-allow-undefined' is deprecated because it is the default" 1>&2
+       continue
+       ;;
+
+      -avoid-version)
+       avoid_version=yes
+       continue
+       ;;
+
+      -dlopen)
+       prev=dlfiles
+       continue
+       ;;
+
+      -dlpreopen)
+       prev=dlprefiles
+       continue
+       ;;
+
+      -export-dynamic)
+       export_dynamic=yes
+       continue
+       ;;
+
+      -export-symbols | -export-symbols-regex)
+       if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
+         $echo "$modename: more than one -exported-symbols argument is not allowed" 1>&2
+         exit 1
+       fi
+       if test "X$arg" = "X-export-symbols"; then
+         prev=expsyms
+       else
+         prev=expsyms_regex
+       fi
+       continue
+       ;;
+
+      # The native IRIX linker understands -LANG:*, -LIST:* and -LNO:*
+      # so, if we see these flags be careful not to treat them like -L
+      -L[A-Z][A-Z]*:*)
+       case $with_gcc/$host in
+       no/*-*-irix*)
+         compile_command="$compile_command $arg"
+         finalize_command="$finalize_command $arg"
+         ;;
+       esac
+       continue
+       ;;
+
+      -L*)
+       dir=`$echo "X$arg" | $Xsed -e 's/^-L//'`
+       # We need an absolute path.
+       case $dir in
+       [\\/]* | [A-Za-z]:[\\/]*) ;;
+       *)
+         absdir=`cd "$dir" && pwd`
+         if test -z "$absdir"; then
+           $echo "$modename: cannot determine absolute directory name of \`$dir'" 1>&2
+           exit 1
+         fi
+         dir="$absdir"
+         ;;
+       esac
+       case "$deplibs " in
+       *" -L$dir "*) ;;
+       *)
+         deplibs="$deplibs -L$dir"
+         lib_search_path="$lib_search_path $dir"
+         ;;
+       esac
+       case $host in
+       *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+         case :$dllsearchpath: in
+         *":$dir:"*) ;;
+         *) dllsearchpath="$dllsearchpath:$dir";;
+         esac
+         ;;
+       esac
+       continue
+       ;;
+
+      -l*)
+       if test "X$arg" = "X-lc" || test "X$arg" = "X-lm"; then
+         case $host in
+         *-*-cygwin* | *-*-pw32* | *-*-beos*)
+           # These systems don't actually have a C or math library (as such)
+           continue
+           ;;
+         *-*-mingw* | *-*-os2*)
+           # These systems don't actually have a C library (as such)
+           test "X$arg" = "X-lc" && continue
+           ;;
+         esac
+       fi
+       deplibs="$deplibs $arg"
+       continue
+       ;;
+
+      -module)
+       module=yes
+       continue
+       ;;
+
+      -no-fast-install)
+       fast_install=no
+       continue
+       ;;
+
+      -no-install)
+       case $host in
+       *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+         # The PATH hackery in wrapper scripts is required on Windows
+         # in order for the loader to find any dlls it needs.
+         $echo "$modename: warning: \`-no-install' is ignored for $host" 1>&2
+         $echo "$modename: warning: assuming \`-no-fast-install' instead" 1>&2
+         fast_install=no
+         ;;
+       *) no_install=yes ;;
+       esac
+       continue
+       ;;
+
+      -no-undefined)
+       allow_undefined=no
+       continue
+       ;;
+
+      -objectlist)
+       prev=objectlist
+       continue
+       ;;
+
+      -o) prev=output ;;
+
+      -release)
+       prev=release
+       continue
+       ;;
+
+      -rpath)
+       prev=rpath
+       continue
+       ;;
+
+      -R)
+       prev=xrpath
+       continue
+       ;;
+
+      -R*)
+       dir=`$echo "X$arg" | $Xsed -e 's/^-R//'`
+       # We need an absolute path.
+       case $dir in
+       [\\/]* | [A-Za-z]:[\\/]*) ;;
+       *)
+         $echo "$modename: only absolute run-paths are allowed" 1>&2
+         exit 1
+         ;;
+       esac
+       case "$xrpath " in
+       *" $dir "*) ;;
+       *) xrpath="$xrpath $dir" ;;
+       esac
+       continue
+       ;;
+
+      -static)
+       # The effects of -static are defined in a previous loop.
+       # We used to do the same as -all-static on platforms that
+       # didn't have a PIC flag, but the assumption that the effects
+       # would be equivalent was wrong.  It would break on at least
+       # Digital Unix and AIX.
+       continue
+       ;;
+
+      -thread-safe)
+       thread_safe=yes
+       continue
+       ;;
+
+      -version-info)
+       prev=vinfo
+       continue
+       ;;
+
+      -Wc,*)
+       args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wc,//'`
+       arg=
+       IFS="${IFS=     }"; save_ifs="$IFS"; IFS=','
+       for flag in $args; do
+         IFS="$save_ifs"
+         case $flag in
+           *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \       ]*|*]*|"")
+           flag="\"$flag\""
+           ;;
+         esac
+         arg="$arg $flag"   # no $wl here: -Wc flags are for the compiler, not the linker
+         compiler_flags="$compiler_flags $flag"
+       done
+       IFS="$save_ifs"
+       arg=`$echo "X$arg" | $Xsed -e "s/^ //"`
+       ;;
+
+      -Wl,*)
+       args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wl,//'`
+       arg=
+       IFS="${IFS=     }"; save_ifs="$IFS"; IFS=','
+       for flag in $args; do
+         IFS="$save_ifs"
+         case $flag in
+           *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \       ]*|*]*|"")
+           flag="\"$flag\""
+           ;;
+         esac
+         arg="$arg $wl$flag"
+         compiler_flags="$compiler_flags $wl$flag"
+         linker_flags="$linker_flags $flag"
+       done
+       IFS="$save_ifs"
+       arg=`$echo "X$arg" | $Xsed -e "s/^ //"`
+       ;;
+
+      -Xcompiler)
+       prev=xcompiler
+       continue
+       ;;
+
+      -Xlinker)
+       prev=xlinker
+       continue
+       ;;
+
+      # Some other compiler flag.
+      -* | +*)
+       # Unknown arguments in both finalize_command and compile_command need
+       # to be aesthetically quoted because they are evaled later.
+       arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+       case $arg in
+       *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \   ]*|*]*|"")
+         arg="\"$arg\""
+         ;;
+       esac
+       ;;
+
+      *.$objext)
+       # A standard object.
+       objs="$objs $arg"
+       ;;
+
+      *.lo)
+       # A libtool-controlled object.
+
+       # Check to see that this really is a libtool object.
+       if (sed -e '2q' $arg | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+          pic_object=
+          non_pic_object=
+
+          # Read the .lo file
+          # If there is no directory component, then add one.
+          case $arg in
+          */* | *\\*) . $arg ;;
+          *) . ./$arg ;;
+          esac
+
+          # A valid .lo defines both names and at least one is not "none".
+          # Braces are needed: sh gives || and && equal precedence.
+          if test -z "$pic_object" || test -z "$non_pic_object" ||
+             { test "$pic_object" = none && test "$non_pic_object" = none; }; then
+            $echo "$modename: cannot find name of object for \`$arg'" 1>&2
+            exit 1
+          fi
+
+         # Extract subdirectory from the argument.
+         xdir=`$echo "X$arg" | $Xsed -e 's%/[^/]*$%%'`
+         if test "X$xdir" = "X$arg"; then
+           xdir=
+         else
+           xdir="$xdir/"
+         fi
+
+          if test "$pic_object" != none; then
+            # Prepend the subdirectory the object is found in.
+           pic_object="$xdir$pic_object"
+
+           if test "$prev" = dlfiles; then
+             if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then
+               dlfiles="$dlfiles $pic_object"
+               prev=
+               continue
+             else
+               # If libtool objects are unsupported, then we need to preload.
+               prev=dlprefiles
+             fi
+           fi
+
+           # CHECK ME:  I think I busted this.  -Ossama
+            if test "$prev" = dlprefiles; then
+             # Preload the old-style object.
+             dlprefiles="$dlprefiles $pic_object"
+             prev=
+            fi
+
+            # A PIC object.
+           libobjs="$libobjs $pic_object"
+           arg="$pic_object"
+          fi
+
+          # Non-PIC object.
+          if test "$non_pic_object" != none; then
+            # Prepend the subdirectory the object is found in.
+           non_pic_object="$xdir$non_pic_object"
+
+            # A standard non-PIC object
+            non_pic_objects="$non_pic_objects $non_pic_object"
+            if test -z "$pic_object" || test "$pic_object" = none ; then
+              arg="$non_pic_object"
+            fi
+          fi
+        else
+          # Only an error if not doing a dry-run.
+          if test -z "$run"; then
+            $echo "$modename: \`$arg' is not a valid libtool object" 1>&2
+            exit 1
+          else
+            # Dry-run case.
+
+           # Extract subdirectory from the argument.
+           xdir=`$echo "X$arg" | $Xsed -e 's%/[^/]*$%%'`
+           if test "X$xdir" = "X$arg"; then
+             xdir=
+           else
+             xdir="$xdir/"
+           fi
+
+            pic_object=`$echo "X${xdir}${objdir}/${arg}" | $Xsed -e "$lo2o"`
+            non_pic_object=`$echo "X${xdir}${arg}" | $Xsed -e "$lo2o"`
+           libobjs="$libobjs $pic_object"
+            non_pic_objects="$non_pic_objects $non_pic_object"
+          fi
+       fi
+       ;;
+
+      *.$libext)
+       # An archive.
+       deplibs="$deplibs $arg"
+       old_deplibs="$old_deplibs $arg"
+       continue
+       ;;
+
+      *.la)
+       # A libtool-controlled library.
+
+       if test "$prev" = dlfiles; then
+         # This library was specified with -dlopen.
+         dlfiles="$dlfiles $arg"
+         prev=
+       elif test "$prev" = dlprefiles; then
+         # The library was specified with -dlpreopen.
+         dlprefiles="$dlprefiles $arg"
+         prev=
+       else
+         deplibs="$deplibs $arg"
+       fi
+       continue
+       ;;
+
+      # Some other compiler argument.
+      *)
+       # Unknown arguments in both finalize_command and compile_command need
+       # to be aesthetically quoted because they are evaled later.
+       arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+       case $arg in
+       *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \   ]*|*]*|"")
+         arg="\"$arg\""
+         ;;
+       esac
+       ;;
+      esac # arg
+
+      # Now actually substitute the argument into the commands.
+      if test -n "$arg"; then
+       compile_command="$compile_command $arg"
+       finalize_command="$finalize_command $arg"
+      fi
+    done # argument parsing loop
+
+    if test -n "$prev"; then
+      $echo "$modename: the \`$prevarg' option requires an argument" 1>&2
+      $echo "$help" 1>&2
+      exit 1
+    fi
+
+    # Infer tagged configuration to use if any are available and
+    # if one wasn't chosen via the "--tag" command line option.
+    # Only attempt this if the compiler in the base link
+    # command doesn't match the default compiler.
+    if test -n "$available_tags" && test -z "$tagname"; then
+      case $base_compile in
+      "$CC "*) ;;
+      # Blanks in the command may have been stripped by the calling shell,
+      # but not from the CC environment variable when ltconfig was run.
+      "`$echo $CC` "*) ;;
+      *)
+        for z in $available_tags; do
+          if grep "^### BEGIN LIBTOOL TAG CONFIG: $z$" < "$0" > /dev/null; then
+           # Evaluate the configuration.
+           eval "`sed -n -e '/^### BEGIN LIBTOOL TAG CONFIG: '$z'$/,/^### END LIBTOOL TAG CONFIG: '$z'$/p' < $0`"
+            case $base_compile in
+           "$CC "*)
+              # The compiler in $compile_command matches
+              # the one in the tagged configuration.
+              # Assume this is the tagged configuration we want.
+              tagname=$z
+              break
+             ;;
+           "`$echo $CC` "*)
+             tagname=$z
+             break
+             ;;
+           esac
+          fi
+        done
+        # If $tagname still isn't set, then no tagged configuration
+        # was found and let the user know that the "--tag" command
+        # line option must be used.
+        if test -z "$tagname"; then
+          echo "$modename: unable to infer tagged configuration"
+          echo "$modename: specify a tag with \`--tag'" 1>&2
+         exit 1
+#       else
+#         echo "$modename: using $tagname tagged configuration"
+        fi
+       ;;
+      esac
+    fi
+
+    if test "$export_dynamic" = yes && test -n "$export_dynamic_flag_spec"; then
+      eval arg=\"$export_dynamic_flag_spec\"
+      compile_command="$compile_command $arg"
+      finalize_command="$finalize_command $arg"
+    fi
+
+    # calculate the name of the file, without its directory
+    outputname=`$echo "X$output" | $Xsed -e 's%^.*/%%'`
+    libobjs_save="$libobjs"
+
+    if test -n "$shlibpath_var"; then
+      # get the directories listed in $shlibpath_var
+      eval shlib_search_path=\`\$echo \"X\${$shlibpath_var}\" \| \$Xsed -e \'s/:/ /g\'\`
+    else
+      shlib_search_path=
+    fi
+    eval sys_lib_search_path=\"$sys_lib_search_path_spec\"
+    eval sys_lib_dlsearch_path=\"$sys_lib_dlsearch_path_spec\"
+
+    output_objdir=`$echo "X$output" | $Xsed -e 's%/[^/]*$%%'`
+    if test "X$output_objdir" = "X$output"; then
+      output_objdir="$objdir"
+    else
+      output_objdir="$output_objdir/$objdir"
+    fi
+    # Create the object directory.
+    if test ! -d $output_objdir; then
+      $show "$mkdir $output_objdir"
+      $run $mkdir $output_objdir
+      status=$?
+      if test $status -ne 0 && test ! -d $output_objdir; then
+       exit $status
+      fi
+    fi
+
+    # Determine the type of output from the suffix of $output:
+    # an archive gives oldlib, an object (.lo or .$objext) gives obj,
+    # a .la file gives lib, and everything else gives prog.
+    # An empty output name is a usage error.
+    case $output in
+    "")
+      $echo "$modename: you must specify an output file" 1>&2
+      $echo "$help" 1>&2
+      exit 1
+      ;;
+    *.$libext) linkmode=oldlib ;;
+    *.lo | *.$objext) linkmode=obj ;;
+    *.la) linkmode=lib ;;
+    *) linkmode=prog ;; # Anything else should be a program.
+    esac
+
+    specialdeplibs=
+    libs=
+    # Find all interdependent deplibs by searching for libraries
+    # that are linked more than once (e.g. -la -lb -la).
+    # Every repeated entry is recorded in $specialdeplibs; $libs ends up
+    # holding all of $deplibs in their original order.
+    for deplib in $deplibs; do
+      case "$libs " in
+      *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+      esac
+      libs="$libs $deplib"
+    done
+
+    # For libtool libraries, surround the user's libraries with the
+    # configured $predeps/$postdeps and the compiler's library search path.
+    if test $linkmode = lib; then
+      libs="$predeps $libs $compiler_lib_search_path $postdeps"
+
+      # Compute libraries that are listed more than once in $predeps
+      # $postdeps and mark them as special (i.e., whose duplicates are
+      # not to be eliminated).
+      # NOTE(review): when a repeat is found, the whole list accumulated so
+      # far ($pre_post_deps) is appended to $specialdeplibs, not only the
+      # repeated entry ($pre_post_dep) -- confirm that this is intended.
+      pre_post_deps=
+      for pre_post_dep in $predeps $postdeps; do
+        case "$pre_post_deps " in
+       *" $pre_post_dep "*) specialdeplibs="$specialdeplibs $pre_post_deps" ;;
+       esac
+       pre_post_deps="$pre_post_deps $pre_post_dep"
+      done
+      pre_post_deps=
+    fi
+
+    # Reset the accumulators that the passes over the libraries fill in.
+    deplibs=
+    newdependency_libs=
+    newlib_search_path=
+    need_relink=no # whether we're linking any uninstalled libtool libraries
+    notinst_deplibs= # not-installed libtool libraries
+    notinst_path= # paths that contain not-installed libtool libraries
+    # Choose the list of passes to run for this link mode.
+    case $linkmode in
+    lib)
+       passes="conv link"
+       # A library may only -dlopen/-dlpreopen libtool libraries (.la).
+       for file in $dlfiles $dlprefiles; do
+         case $file in
+         *.la) ;;
+         *)
+           $echo "$modename: libraries can \`-dlopen' only libtool libraries: $file" 1>&2
+           exit 1
+           ;;
+         esac
+       done
+       ;;
+    prog)
+       # Programs keep separate dependency lists for the compile and the
+       # finalize command, and run the full set of passes.
+       compile_deplibs=
+       finalize_deplibs=
+       alldeplibs=no
+       newdlfiles=
+       newdlprefiles=
+       passes="conv scan dlopen dlpreopen link"
+       ;;
+    # Any other link mode only runs the conv pass.
+    *)  passes="conv"
+       ;;
+    esac
+    for pass in $passes; do
+      if test $linkmode = prog; then
+       # Determine which files to process
+       case $pass in
+       dlopen)
+         libs="$dlfiles"
+         save_deplibs="$deplibs" # Collect dlpreopened libraries
+         deplibs=
+         ;;
+       dlpreopen) libs="$dlprefiles" ;;
+       link) libs="$deplibs %DEPLIBS% $dependency_libs" ;;
+       esac
+      fi
+      for deplib in $libs; do
+       lib=
+       found=no
+       case $deplib in
+       -l*)
+         if test $linkmode = oldlib && test $linkmode = obj; then
+           $echo "$modename: warning: \`-l' is ignored for archives/objects: $deplib" 1>&2
+           continue
+         fi
+         if test $pass = conv; then
+           deplibs="$deplib $deplibs"
+           continue
+         fi
+         name=`$echo "X$deplib" | $Xsed -e 's/^-l//'`
+         for searchdir in $newlib_search_path $lib_search_path $sys_lib_search_path $shlib_search_path; do
+           # Search the libtool library
+           lib="$searchdir/lib${name}.la"
+           if test -f "$lib"; then
+             found=yes
+             break
+           fi
+         done
+         if test "$found" != yes; then
+           # deplib doesn't seem to be a libtool library
+           if test "$linkmode,$pass" = "prog,link"; then
+             compile_deplibs="$deplib $compile_deplibs"
+             finalize_deplibs="$deplib $finalize_deplibs"
+           else
+             deplibs="$deplib $deplibs"
+             test $linkmode = lib && newdependency_libs="$deplib $newdependency_libs"
+           fi
+           continue
+         fi
+         ;; # -l
+       -L*)
+         case $linkmode in
+         lib)
+           deplibs="$deplib $deplibs"
+           test $pass = conv && continue
+           newdependency_libs="$deplib $newdependency_libs"
+           newlib_search_path="$newlib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`
+           ;;
+         prog)
+           if test $pass = conv; then
+             deplibs="$deplib $deplibs"
+             continue
+           fi
+           if test $pass = scan; then
+             deplibs="$deplib $deplibs"
+             newlib_search_path="$newlib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`
+           else
+             compile_deplibs="$deplib $compile_deplibs"
+             finalize_deplibs="$deplib $finalize_deplibs"
+           fi
+           ;;
+         *)
+           $echo "$modename: warning: \`-L' is ignored for archives/objects: $deplib" 1>&2
+           ;;
+         esac # linkmode
+         continue
+         ;; # -L
+       -R*)
+         if test $pass = link; then
+           dir=`$echo "X$deplib" | $Xsed -e 's/^-R//'`
+           # Make sure the xrpath contains only unique directories.
+           case "$xrpath " in
+           *" $dir "*) ;;
+           *) xrpath="$xrpath $dir" ;;
+           esac
+         fi
+         deplibs="$deplib $deplibs"
+         continue
+         ;;
+       *.la) lib="$deplib" ;;
+       *.$libext)
+         if test $pass = conv; then
+           deplibs="$deplib $deplibs"
+           continue
+         fi
+         case $linkmode in
+         lib)
+           if test "$deplibs_check_method" != pass_all; then
+             echo
+             echo "*** Warning: This library needs some functionality provided by $deplib."
+             echo "*** I have the capability to make that library automatically link in when"
+             echo "*** you link to this library.  But I can only do this if you have a"
+             echo "*** shared version of the library, which you do not appear to have."
+           else
+             echo
+             echo "*** Warning: Linking the shared library $output against the"
+             echo "*** static library $deplib is not portable!"
+             deplibs="$deplib $deplibs"
+           fi
+           continue
+           ;;
+         prog)
+           if test $pass != link; then
+             deplibs="$deplib $deplibs"
+           else
+             compile_deplibs="$deplib $compile_deplibs"
+             finalize_deplibs="$deplib $finalize_deplibs"
+           fi
+           continue
+           ;;
+         esac # linkmode
+         ;; # *.$libext
+       *.lo | *.$objext)
+         if test $pass = dlpreopen || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+           # If there is no dlopen support or we're linking statically,
+           # we need to preload.
+           newdlprefiles="$newdlprefiles $deplib"
+           compile_deplibs="$deplib $compile_deplibs"
+           finalize_deplibs="$deplib $finalize_deplibs"
+         else
+           newdlfiles="$newdlfiles $deplib"
+         fi
+         continue
+         ;;
+       %DEPLIBS%)
+         alldeplibs=yes
+         continue
+         ;;
+       esac # case $deplib
+       if test $found = yes || test -f "$lib"; then :
+       else
+         $echo "$modename: cannot find the library \`$lib'" 1>&2
+         exit 1
+       fi
+
+       # Check to see that this really is a libtool archive.
+       if (sed -e '2q' $lib | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+       else
+         $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+         exit 1
+       fi
+
+       ladir=`$echo "X$lib" | $Xsed -e 's%/[^/]*$%%'`
+       test "X$ladir" = "X$lib" && ladir="."
+
+       dlname=
+       dlopen=
+       dlpreopen=
+       libdir=
+       library_names=
+       old_library=
+       # If the library was installed with an old release of libtool,
+       # it will not redefine variable installed.
+       installed=yes
+
+       # Read the .la file
+       case $lib in
+       */* | *\\*) . $lib ;;
+       *) . ./$lib ;;
+       esac
+
+       if test "$linkmode,$pass" = "lib,link" ||
+          test "$linkmode,$pass" = "prog,scan" ||
+          { test $linkmode = oldlib && test $linkmode = obj; }; then
+          # Add dl[pre]opened files of deplib
+         test -n "$dlopen" && dlfiles="$dlfiles $dlopen"
+         test -n "$dlpreopen" && dlprefiles="$dlprefiles $dlpreopen"
+       fi
+
+       if test $pass = conv; then
+         # Only check for convenience libraries
+         deplibs="$lib $deplibs"
+         if test -z "$libdir"; then
+           if test -z "$old_library"; then
+             $echo "$modename: cannot find name of link library for \`$lib'" 1>&2
+             exit 1
+           fi
+           # It is a libtool convenience library, so add in its objects.
+           convenience="$convenience $ladir/$objdir/$old_library"
+           old_convenience="$old_convenience $ladir/$objdir/$old_library"
+           tmp_libs=
+           for deplib in $dependency_libs; do
+             deplibs="$deplib $deplibs"
+             case "$tmp_libs " in
+             *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+             esac
+             tmp_libs="$tmp_libs $deplib"
+           done
+         elif test $linkmode != prog && test $linkmode != lib; then
+           $echo "$modename: \`$lib' is not a convenience library" 1>&2
+           exit 1
+         fi
+         continue
+       fi # $pass = conv
+
+       # Get the name of the library we link against.
+       linklib=
+       for l in $old_library $library_names; do
+         linklib="$l"
+       done
+       if test -z "$linklib"; then
+         $echo "$modename: cannot find name of link library for \`$lib'" 1>&2
+         exit 1
+       fi
+
+       # This library was specified with -dlopen.
+       if test $pass = dlopen; then
+         if test -z "$libdir"; then
+           $echo "$modename: cannot -dlopen a convenience library: \`$lib'" 1>&2
+           exit 1
+         fi
+         if test -z "$dlname" || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+           # If there is no dlname, no dlopen support or we're linking
+           # statically, we need to preload.
+           dlprefiles="$dlprefiles $lib"
+         else
+           newdlfiles="$newdlfiles $lib"
+         fi
+         continue
+       fi # $pass = dlopen
+
+       # We need an absolute path.
+       case $ladir in
+       [\\/]* | [A-Za-z]:[\\/]*) abs_ladir="$ladir" ;;
+       *)
+         abs_ladir=`cd "$ladir" && pwd`
+         if test -z "$abs_ladir"; then
+           $echo "$modename: warning: cannot determine absolute directory name of \`$ladir'" 1>&2
+           $echo "$modename: passing it literally to the linker, although it might fail" 1>&2
+           abs_ladir="$ladir"
+         fi
+         ;;
+       esac
+       laname=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+
+       # Find the relevant object directory and library name.
+       if test "X$installed" = Xyes; then
+         if test ! -f "$libdir/$linklib" && test -f "$abs_ladir/$linklib"; then
+           $echo "$modename: warning: library \`$lib' was moved." 1>&2
+           dir="$ladir"
+           absdir="$abs_ladir"
+           libdir="$abs_ladir"
+         else
+           dir="$libdir"
+           absdir="$libdir"
+         fi
+       else
+         dir="$ladir/$objdir"
+         absdir="$abs_ladir/$objdir"
+         # Remove this search path later
+         notinst_path="$notinst_path $abs_ladir"
+       fi # $installed = yes
+       name=`$echo "X$laname" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
+
+       # This library was specified with -dlpreopen.
+       if test $pass = dlpreopen; then
+         if test -z "$libdir"; then
+           $echo "$modename: cannot -dlpreopen a convenience library: \`$lib'" 1>&2
+           exit 1
+         fi
+         # Prefer using a static library (so that no silly _DYNAMIC symbols
+         # are required to link).
+         if test -n "$old_library"; then
+           newdlprefiles="$newdlprefiles $dir/$old_library"
+         # Otherwise, use the dlname, so that lt_dlopen finds it.
+         elif test -n "$dlname"; then
+           newdlprefiles="$newdlprefiles $dir/$dlname"
+         else
+           newdlprefiles="$newdlprefiles $dir/$linklib"
+         fi
+       fi # $pass = dlpreopen
+
+       if test -z "$libdir"; then
+         # Link the convenience library
+         if test $linkmode = lib; then
+           deplibs="$dir/$old_library $deplibs"
+         elif test "$linkmode,$pass" = "prog,link"; then
+           compile_deplibs="$dir/$old_library $compile_deplibs"
+           finalize_deplibs="$dir/$old_library $finalize_deplibs"
+         else
+           deplibs="$lib $deplibs"
+         fi
+         continue
+       fi
+
+       if test $linkmode = prog && test $pass != link; then
+         newlib_search_path="$newlib_search_path $ladir"
+         deplibs="$lib $deplibs"
+
+         linkalldeplibs=no
+         if test "$link_all_deplibs" != no || test -z "$library_names" ||
+            test "$build_libtool_libs" = no; then
+           linkalldeplibs=yes
+         fi
+
+         tmp_libs=
+         for deplib in $dependency_libs; do
+           case $deplib in
+           -L*) newlib_search_path="$newlib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`;; ### testsuite: skip nested quoting test
+           esac
+           # Need to link against all dependency_libs?
+           if test $linkalldeplibs = yes; then
+             deplibs="$deplib $deplibs"
+           else
+             # Need to hardcode shared library paths
+             # or/and link against static libraries
+             newdependency_libs="$deplib $newdependency_libs"
+           fi
+           case "$tmp_libs " in
+           *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+           esac
+           tmp_libs="$tmp_libs $deplib"
+         done # for deplib
+         continue
+       fi # $linkmode = prog...
+
+       link_static=no # Whether the deplib will be linked statically
+       if test -n "$library_names" &&
+          { test "$prefer_static_libs" = no || test -z "$old_library"; }; then
+         # Link against this shared library
+
+         if test "$linkmode,$pass" = "prog,link" ||
+          { test $linkmode = lib && test $hardcode_into_libs = yes; }; then
+           # Hardcode the library path.
+           # Skip directories that are in the system default run-time
+           # search path.
+           case " $sys_lib_dlsearch_path " in
+           *" $absdir "*) ;;
+           *)
+             case "$compile_rpath " in
+             *" $absdir "*) ;;
+             *) compile_rpath="$compile_rpath $absdir"
+             esac
+             ;;
+           esac
+           case " $sys_lib_dlsearch_path " in
+           *" $libdir "*) ;;
+           *)
+             case "$finalize_rpath " in
+             *" $libdir "*) ;;
+             *) finalize_rpath="$finalize_rpath $libdir"
+             esac
+             ;;
+           esac
+           if test $linkmode = prog; then
+             # We need to hardcode the library path
+             if test -n "$shlibpath_var"; then
+               # Make sure the rpath contains only unique directories.
+               case "$temp_rpath " in
+               *" $dir "*) ;;
+               *" $absdir "*) ;;
+               *) temp_rpath="$temp_rpath $dir" ;;
+               esac
+             fi
+           fi
+         fi # $linkmode,$pass = prog,link...
+
+         if test "$alldeplibs" = yes &&
+            { test "$deplibs_check_method" = pass_all ||
+              { test "$build_libtool_libs" = yes &&
+                test -n "$library_names"; }; }; then
+           # We only need to search for static libraries
+           continue
+         fi
+
+         if test "$installed" = no; then
+           notinst_deplibs="$notinst_deplibs $lib"
+           need_relink=yes
+         fi
+
+         if test -n "$old_archive_from_expsyms_cmds"; then
+           # figure out the soname
+           set dummy $library_names
+           realname="$2"
+           shift; shift
+           libname=`eval \\$echo \"$libname_spec\"`
+           # use dlname if we got it. it's perfectly good, no?
+           if test -n "$dlname"; then
+             soname="$dlname"
+           elif test -n "$soname_spec"; then
+             # bleh windows
+             case $host in
+             *cygwin*)
+               major=`expr $current - $age`
+               versuffix="-$major"
+               ;;
+             esac
+             eval soname=\"$soname_spec\"
+           else
+             soname="$realname"
+           fi
+
+           # Make a new name for the extract_expsyms_cmds to use
+           soroot="$soname"
+           soname=`echo $soroot | sed -e 's/^.*\///'`
+           newlib="libimp-`echo $soname | sed 's/^lib//;s/\.dll$//'`.a"
+
+           # If the library has no export list, then create one now
+           if test -f "$output_objdir/$soname-def"; then :
+           else
+             $show "extracting exported symbol list from \`$soname'"
+             IFS="${IFS=       }"; save_ifs="$IFS"; IFS='~'
+             eval cmds=\"$extract_expsyms_cmds\"
+             for cmd in $cmds; do
+               IFS="$save_ifs"
+               $show "$cmd"
+               $run eval "$cmd" || exit $?
+             done
+             IFS="$save_ifs"
+           fi
+
+           # Create $newlib
+           if test -f "$output_objdir/$newlib"; then :; else
+             $show "generating import library for \`$soname'"
+             IFS="${IFS=       }"; save_ifs="$IFS"; IFS='~'
+             eval cmds=\"$old_archive_from_expsyms_cmds\"
+             for cmd in $cmds; do
+               IFS="$save_ifs"
+               $show "$cmd"
+               $run eval "$cmd" || exit $?
+             done
+             IFS="$save_ifs"
+           fi
+           # make sure the library variables are pointing to the new library
+           dir=$output_objdir
+           linklib=$newlib
+         fi # test -n $old_archive_from_expsyms_cmds
+
+         if test $linkmode = prog || test "$mode" != relink; then
+           add_shlibpath=
+           add_dir=
+           add=
+           lib_linked=yes
+           case $hardcode_action in
+           immediate | unsupported)
+             if test "$hardcode_direct" = no; then
+               add="$dir/$linklib"
+             elif test "$hardcode_minus_L" = no; then
+               case $host in
+               *-*-sunos*) add_shlibpath="$dir" ;;
+               esac
+               add_dir="-L$dir"
+               add="-l$name"
+             elif test "$hardcode_shlibpath_var" = no; then
+               add_shlibpath="$dir"
+               add="-l$name"
+             else
+               lib_linked=no
+             fi
+             ;;
+           relink)
+             if test "$hardcode_direct" = yes; then
+               add="$dir/$linklib"
+             elif test "$hardcode_minus_L" = yes; then
+               add_dir="-L$dir"
+               add="-l$name"
+             elif test "$hardcode_shlibpath_var" = yes; then
+               add_shlibpath="$dir"
+               add="-l$name"
+             else
+               lib_linked=no
+             fi
+             ;;
+           *) lib_linked=no ;;
+           esac
+
+           if test "$lib_linked" != yes; then
+             $echo "$modename: configuration error: unsupported hardcode properties"
+             exit 1
+           fi
+
+           if test -n "$add_shlibpath"; then
+             case :$compile_shlibpath: in
+             *":$add_shlibpath:"*) ;;
+             *) compile_shlibpath="$compile_shlibpath$add_shlibpath:" ;;
+             esac
+           fi
+           if test $linkmode = prog; then
+             test -n "$add_dir" && compile_deplibs="$add_dir $compile_deplibs"
+             test -n "$add" && compile_deplibs="$add $compile_deplibs"
+           else
+             test -n "$add_dir" && deplibs="$add_dir $deplibs"
+             test -n "$add" && deplibs="$add $deplibs"
+             if test "$hardcode_direct" != yes && \
+                test "$hardcode_minus_L" != yes && \
+                test "$hardcode_shlibpath_var" = yes; then
+               case :$finalize_shlibpath: in
+               *":$libdir:"*) ;;
+               *) finalize_shlibpath="$finalize_shlibpath$libdir:" ;;
+               esac
+             fi
+           fi
+         fi
+
+         if test $linkmode = prog || test "$mode" = relink; then
+           add_shlibpath=
+           add_dir=
+           add=
+           # Finalize command for both is simple: just hardcode it.
+           if test "$hardcode_direct" = yes; then
+             add="$libdir/$linklib"
+           elif test "$hardcode_minus_L" = yes; then
+             add_dir="-L$libdir"
+             add="-l$name"
+           elif test "$hardcode_shlibpath_var" = yes; then
+             case :$finalize_shlibpath: in
+             *":$libdir:"*) ;;
+             *) finalize_shlibpath="$finalize_shlibpath$libdir:" ;;
+             esac
+             add="-l$name"
+           else
+             # We cannot seem to hardcode it, guess we'll fake it.
+             add_dir="-L$libdir"
+             add="-l$name"
+           fi
+
+           if test $linkmode = prog; then
+             test -n "$add_dir" && finalize_deplibs="$add_dir $finalize_deplibs"
+             test -n "$add" && finalize_deplibs="$add $finalize_deplibs"
+           else
+             test -n "$add_dir" && deplibs="$add_dir $deplibs"
+             test -n "$add" && deplibs="$add $deplibs"
+           fi
+         fi
+       elif test $linkmode = prog; then
+         if test "$alldeplibs" = yes &&
+            { test "$deplibs_check_method" = pass_all ||
+              { test "$build_libtool_libs" = yes &&
+                test -n "$library_names"; }; }; then
+           # We only need to search for static libraries
+           continue
+         fi
+
+         # Try to link the static library
+         # Here we assume that one of hardcode_direct or hardcode_minus_L
+         # is not unsupported.  This is valid on all known static and
+         # shared platforms.
+         if test "$hardcode_direct" != unsupported; then
+           test -n "$old_library" && linklib="$old_library"
+           compile_deplibs="$dir/$linklib $compile_deplibs"
+           finalize_deplibs="$dir/$linklib $finalize_deplibs"
+         else
+           compile_deplibs="-l$name -L$dir $compile_deplibs"
+           finalize_deplibs="-l$name -L$dir $finalize_deplibs"
+         fi
+       elif test "$build_libtool_libs" = yes; then
+         # Not a shared library
+         if test "$deplibs_check_method" != pass_all; then
+           # We're trying to link a shared library against a static one
+           # but the system doesn't support it.
+
+           # Just print a warning and add the library to dependency_libs so
+           # that the program can be linked against the static library.
+           echo
+           echo "*** Warning: This library needs some functionality provided by $lib."
+           echo "*** I have the capability to make that library automatically link in when"
+           echo "*** you link to this library.  But I can only do this if you have a"
+           echo "*** shared version of the library, which you do not appear to have."
+           if test "$module" = yes; then
+             echo "*** Therefore, libtool will create a static module, that should work "
+             echo "*** as long as the dlopening application is linked with the -dlopen flag."
+             if test -z "$global_symbol_pipe"; then
+               echo
+               echo "*** However, this would only work if libtool was able to extract symbol"
+               echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+               echo "*** not find such a program.  So, this module is probably useless."
+               echo "*** \`nm' from GNU binutils and a full rebuild may help."
+             fi
+             if test "$build_old_libs" = no; then
+               build_libtool_libs=module
+               build_old_libs=yes
+             else
+               build_libtool_libs=no
+             fi
+           fi
+         else
+           convenience="$convenience $dir/$old_library"
+           old_convenience="$old_convenience $dir/$old_library"
+           deplibs="$dir/$old_library $deplibs"
+           link_static=yes
+         fi
+       fi # link shared/static library?
+
+       if test $linkmode = lib; then
+         if test -n "$dependency_libs" &&
+            { test $hardcode_into_libs != yes || test $build_old_libs = yes ||
+              test $link_static = yes; }; then
+           # Extract -R from dependency_libs
+           temp_deplibs=
+           for libdir in $dependency_libs; do
+             case $libdir in
+             -R*) temp_xrpath=`$echo "X$libdir" | $Xsed -e 's/^-R//'`
+                  case " $xrpath " in
+                  *" $temp_xrpath "*) ;;
+                  *) xrpath="$xrpath $temp_xrpath";;
+                  esac;;
+             *) temp_deplibs="$temp_deplibs $libdir";;
+             esac
+           done
+           dependency_libs="$temp_deplibs"
+         fi
+
+         newlib_search_path="$newlib_search_path $absdir"
+         # Link against this library
+         test "$link_static" = no && newdependency_libs="$abs_ladir/$laname $newdependency_libs"
+         # ... and its dependency_libs
+         tmp_libs=
+         for deplib in $dependency_libs; do
+           newdependency_libs="$deplib $newdependency_libs"
+           case "$tmp_libs " in
+           *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+           esac
+           tmp_libs="$tmp_libs $deplib"
+         done
+
+         if test $link_all_deplibs != no; then
+           # Add the search paths of all dependency libraries
+           for deplib in $dependency_libs; do
+             case $deplib in
+             -L*) path="$deplib" ;;
+             *.la)
+               dir=`$echo "X$deplib" | $Xsed -e 's%/[^/]*$%%'`
+               test "X$dir" = "X$deplib" && dir="."
+               # We need an absolute path.
+               case $dir in
+               [\\/]* | [A-Za-z]:[\\/]*) absdir="$dir" ;;
+               *)
+                 absdir=`cd "$dir" && pwd`
+                 if test -z "$absdir"; then
+                   $echo "$modename: warning: cannot determine absolute directory name of \`$dir'" 1>&2
+                   absdir="$dir"
+                 fi
+                 ;;
+               esac
+               if grep "^installed=no" $deplib > /dev/null; then
+                 path="-L$absdir/$objdir"
+               else
+                 eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+                 if test -z "$libdir"; then
+                   $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2
+                   exit 1
+                 fi
+                 if test "$absdir" != "$libdir"; then
+                   $echo "$modename: warning: \`$deplib' seems to be moved" 1>&2
+                 fi
+                 path="-L$absdir"
+               fi
+               ;;
+             *) continue ;;
+             esac
+             case " $deplibs " in
+             *" $path "*) ;;
+             *) deplibs="$path $deplibs" ;;
+             esac
+           done
+         fi # link_all_deplibs != no
+       fi # linkmode = lib
+      done # for deplib in $libs
+      if test $pass = dlpreopen; then
+       # Link the dlpreopened libraries before other libraries
+       for deplib in $save_deplibs; do
+         deplibs="$deplib $deplibs"
+       done
+      fi
+      if test $pass != dlopen; then
+       test $pass != scan && dependency_libs="$newdependency_libs"
+       if test $pass != conv; then
+         # Make sure lib_search_path contains only unique directories.
+         lib_search_path=
+         for dir in $newlib_search_path; do
+           case "$lib_search_path " in
+           *" $dir "*) ;;
+           *) lib_search_path="$lib_search_path $dir" ;;
+           esac
+         done
+         newlib_search_path=
+       fi
+
+       if test "$linkmode,$pass" != "prog,link"; then
+         vars="deplibs"
+       else
+         vars="compile_deplibs finalize_deplibs"
+       fi
+       for var in $vars dependency_libs; do
+         # Add libraries to $var in reverse order
+         eval tmp_libs=\"\$$var\"
+         new_libs=
+         for deplib in $tmp_libs; do
+           case $deplib in
+           -L*) new_libs="$deplib $new_libs" ;;
+           *)
+             case " $specialdeplibs " in
+             *" $deplib "*) new_libs="$deplib $new_libs" ;;
+             *)
+               case " $new_libs " in
+               *" $deplib "*) ;;
+               *) new_libs="$deplib $new_libs" ;;
+               esac
+               ;;
+             esac
+             ;;
+           esac
+         done
+         tmp_libs=
+         for deplib in $new_libs; do
+           case $deplib in
+           -L*)
+             case " $tmp_libs " in
+             *" $deplib "*) ;;
+             *) tmp_libs="$tmp_libs $deplib" ;;
+             esac
+             ;;
+           *) tmp_libs="$tmp_libs $deplib" ;;
+           esac
+         done
+         eval $var=\"$tmp_libs\"
+       done # for var
+      fi
+      if test "$pass" = "conv" &&
+       { test "$linkmode" = "lib" || test "$linkmode" = "prog"; }; then
+       libs="$deplibs" # reset libs
+       deplibs=
+      fi
+    done # for pass
+    if test $linkmode = prog; then
+      dlfiles="$newdlfiles"
+      dlprefiles="$newdlprefiles"
+    fi
+
+    case $linkmode in
+    oldlib)
+      if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+       $echo "$modename: warning: \`-dlopen' is ignored for archives" 1>&2
+      fi
+
+      if test -n "$rpath"; then
+       $echo "$modename: warning: \`-rpath' is ignored for archives" 1>&2
+      fi
+
+      if test -n "$xrpath"; then
+       $echo "$modename: warning: \`-R' is ignored for archives" 1>&2
+      fi
+
+      if test -n "$vinfo"; then
+       $echo "$modename: warning: \`-version-info' is ignored for archives" 1>&2
+      fi
+
+      if test -n "$release"; then
+       $echo "$modename: warning: \`-release' is ignored for archives" 1>&2
+      fi
+
+      if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
+       $echo "$modename: warning: \`-export-symbols' is ignored for archives" 1>&2
+      fi
+
+      # Now set the variables for building old libraries.
+      build_libtool_libs=no
+      oldlibs="$output"
+      objs="$objs$old_deplibs"
+      ;;
+
+    lib)
+      # Make sure we only generate libraries of the form `libNAME.la'.
+      case $outputname in
+      lib*)
+       name=`$echo "X$outputname" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
+       eval libname=\"$libname_spec\"
+       ;;
+      *)
+       if test "$module" = no; then
+         $echo "$modename: libtool library \`$output' must begin with \`lib'" 1>&2
+         $echo "$help" 1>&2
+         exit 1
+       fi
+       if test "$need_lib_prefix" != no; then
+         # Add the "lib" prefix for modules if required
+         name=`$echo "X$outputname" | $Xsed -e 's/\.la$//'`
+         eval libname=\"$libname_spec\"
+       else
+         libname=`$echo "X$outputname" | $Xsed -e 's/\.la$//'`
+       fi
+       ;;
+      esac
+
+      if test -n "$objs"; then
+       if test "$deplibs_check_method" != pass_all; then
+         $echo "$modename: cannot build libtool library \`$output' from non-libtool objects on this host:$objs" 2>&1
+         exit 1
+       else
+         echo
+         echo "*** Warning: Linking the shared library $output against the non-libtool"
+         echo "*** objects $objs is not portable!"
+         libobjs="$libobjs $objs"
+       fi
+      fi
+
+      if test "$dlself" != no; then
+       $echo "$modename: warning: \`-dlopen self' is ignored for libtool libraries" 1>&2
+      fi
+
+      set dummy $rpath
+      if test $# -gt 2; then
+       $echo "$modename: warning: ignoring multiple \`-rpath's for a libtool library" 1>&2
+      fi
+      install_libdir="$2"
+
+      oldlibs=
+      if test -z "$rpath"; then
+       if test "$build_libtool_libs" = yes; then
+         # Building a libtool convenience library.
+         # Some compilers have problems with a `.al' extension so
+          # convenience libraries should have the same extension an
+          # archive normally would.
+         oldlibs="$output_objdir/$libname.$libext $oldlibs"
+         build_libtool_libs=convenience
+         build_old_libs=yes
+       fi
+
+       if test -n "$vinfo"; then
+         $echo "$modename: warning: \`-version-info' is ignored for convenience libraries" 1>&2
+       fi
+
+       if test -n "$release"; then
+         $echo "$modename: warning: \`-release' is ignored for convenience libraries" 1>&2
+       fi
+      else
+
+       # Parse the version information argument.
+       IFS="${IFS=     }"; save_ifs="$IFS"; IFS=':'
+       set dummy $vinfo 0 0 0
+       IFS="$save_ifs"
+
+       if test -n "$8"; then
+         $echo "$modename: too many parameters to \`-version-info'" 1>&2
+         $echo "$help" 1>&2
+         exit 1
+       fi
+
+       current="$2"
+       revision="$3"
+       age="$4"
+
+       # Check that each of the things are valid numbers.
+       # Each pattern accepts 0 or a one- to three-digit number without a
+       # leading zero, so values above 999 are rejected as well.
+       case $current in
+       0 | [1-9] | [1-9][0-9] | [1-9][0-9][0-9]) ;;
+       *)
+         $echo "$modename: CURRENT \`$current' is not a nonnegative integer" 1>&2
+         $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+         exit 1
+         ;;
+       esac
+
+       case $revision in
+       0 | [1-9] | [1-9][0-9] | [1-9][0-9][0-9]) ;;
+       *)
+         $echo "$modename: REVISION \`$revision' is not a nonnegative integer" 1>&2
+         $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+         exit 1
+         ;;
+       esac
+
+       case $age in
+       0 | [1-9] | [1-9][0-9] | [1-9][0-9][0-9]) ;;
+       *)
+         $echo "$modename: AGE \`$age' is not a nonnegative integer" 1>&2
+         $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+         exit 1
+         ;;
+       esac
+
+       # AGE may not exceed CURRENT; both are known to be plain integers
+       # here, so the numeric comparison is safe.
+       if test $age -gt $current; then
+         $echo "$modename: AGE \`$age' is greater than the current interface number \`$current'" 1>&2
+         $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+         exit 1
+       fi
+
+       # Calculate the version variables.
+       major=
+       versuffix=
+       verstring=
+       case $version_type in
+       none) ;;
+
+       darwin)
+         # Like Linux, but with the current version available in
+         # verstring for coding it into the library header
+         major=.`expr $current - $age`
+         versuffix="$major.$age.$revision"
+         # Darwin ld doesn't like 0 for these options...
+         minor_current=`expr $current + 1`
+         verstring="-compatibility_version $minor_current -current_version $minor_current.$revision"
+         ;;
+
+       freebsd-aout)
+         major=".$current"
+         versuffix=".$current.$revision";
+         ;;
+
+       freebsd-elf)
+         major=".$current"
+         versuffix=".$current";
+         ;;
+
+       irix)
+         major=`expr $current - $age + 1`
+         verstring="sgi$major.$revision"
+
+         # Add in all the interfaces that we are compatible with.
+         loop=$revision
+         while test $loop != 0; do
+           iface=`expr $revision - $loop`
+           loop=`expr $loop - 1`
+           verstring="sgi$major.$iface:$verstring"
+         done
+
+         # Before this point, $major must not contain `.'.
+         major=.$major
+         versuffix="$major.$revision"
+         ;;
+
+       linux)
+         major=.`expr $current - $age`
+         versuffix="$major.$age.$revision"
+         ;;
+
+       osf)
+         major=`expr $current - $age`
+         versuffix=".$current.$age.$revision"
+         verstring="$current.$age.$revision"
+
+         # Add in all the interfaces that we are compatible with.
+         loop=$age
+         while test $loop != 0; do
+           iface=`expr $current - $loop`
+           loop=`expr $loop - 1`
+           verstring="$verstring:${iface}.0"
+         done
+
+         # Make executables depend on our current version.
+         verstring="$verstring:${current}.0"
+         ;;
+
+       sunos)
+         major=".$current"
+         versuffix=".$current.$revision"
+         ;;
+
+       windows)
+         # Use '-' rather than '.', since we only want one
+         # extension on DOS 8.3 filesystems.
+         major=`expr $current - $age`
+         versuffix="-$major"
+         ;;
+
+       *)
+         $echo "$modename: unknown library version type \`$version_type'" 1>&2
+         echo "Fatal configuration error.  See the $PACKAGE docs for more information." 1>&2
+         exit 1
+         ;;
+       esac
+
+       # Clear the version info if we defaulted, and they specified a release.
+       if test -z "$vinfo" && test -n "$release"; then
+         major=
+         verstring="0.0"
+         if test "$need_version" = no; then
+           versuffix=
+         else
+           versuffix=".0.0"
+         fi
+       fi
+
+       # Remove version info from name if versioning should be avoided
+       if test "$avoid_version" = yes && test "$need_version" = no; then
+         major=
+         versuffix=
+         verstring=""
+       fi
+
+       # Check to see if the archive will have undefined symbols.
+       if test "$allow_undefined" = yes; then
+         if test "$allow_undefined_flag" = unsupported; then
+           $echo "$modename: warning: undefined symbols not allowed in $host shared libraries" 1>&2
+           build_libtool_libs=no
+           build_old_libs=yes
+         fi
+       else
+         # Don't allow undefined symbols.
+         allow_undefined_flag="$no_undefined_flag"
+       fi
+      fi
+
+      if test "$mode" != relink; then
+       # Remove our outputs, but don't remove object files since they
+        # may have been created when compiling PIC objects.
+        removelist=
+        tempremovelist=`echo "$output_objdir/*"`
+       for p in $tempremovelist; do
+          case $p in
+            *.$objext)
+               ;;
+            $output_objdir/$outputname | $output_objdir/$libname.* | $output_objdir/${libname}${release}.*)
+               removelist="$removelist $p"
+               ;;
+            *) ;;
+          esac
+        done
+        if test -n "$removelist"; then
+         $show "${rm}r $removelist"
+         $run ${rm}r $removelist
+        fi
+      fi
+
+      # Now set the variables for building old libraries.
+      if test "$build_old_libs" = yes && test "$build_libtool_libs" != convenience ; then
+       oldlibs="$oldlibs $output_objdir/$libname.$libext"
+
+       # Transform .lo files to .o files.
+       oldobjs="$objs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}'$/d' -e "$lo2o" | $NL2SP`
+      fi
+
+      # Eliminate all temporary directories.
+      for path in $notinst_path; do
+       lib_search_path=`echo "$lib_search_path " | sed -e 's% $path % %g'`
+       deplibs=`echo "$deplibs " | sed -e 's% -L$path % %g'`
+       dependency_libs=`echo "$dependency_libs " | sed -e 's% -L$path % %g'`
+      done
+
+      if test -n "$xrpath"; then
+       # If the user specified any rpath flags, then add them.
+       temp_xrpath=
+       for libdir in $xrpath; do
+         temp_xrpath="$temp_xrpath -R$libdir"
+         case "$finalize_rpath " in
+         *" $libdir "*) ;;
+         *) finalize_rpath="$finalize_rpath $libdir" ;;
+         esac
+       done
+       if test $hardcode_into_libs != yes || test $build_old_libs = yes; then
+         dependency_libs="$temp_xrpath $dependency_libs"
+       fi
+      fi
+
+      # Make sure dlfiles contains only unique files that won't be dlpreopened
+      old_dlfiles="$dlfiles"
+      dlfiles=
+      for lib in $old_dlfiles; do
+       case " $dlprefiles $dlfiles " in
+       *" $lib "*) ;;
+       *) dlfiles="$dlfiles $lib" ;;
+       esac
+      done
+
+      # Make sure dlprefiles contains only unique files
+      old_dlprefiles="$dlprefiles"
+      dlprefiles=
+      for lib in $old_dlprefiles; do
+       case "$dlprefiles " in
+       *" $lib "*) ;;
+       *) dlprefiles="$dlprefiles $lib" ;;
+       esac
+      done
+
+      if test "$build_libtool_libs" = yes; then
+       if test -n "$rpath"; then
+         case $host in
+         *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-beos*)
+           # these systems don't actually have a c library (as such)!
+           ;;
+         *-*-rhapsody* | *-*-darwin1.[012])
+           # Rhapsody C library is in the System framework
+           deplibs="$deplibs -framework System"
+           ;;
+         *-*-netbsd*)
+           # Don't link with libc until the a.out ld.so is fixed.
+           ;;
+         *)
+           # Add libc to deplibs on all other systems if necessary.
+           if test $build_libtool_need_lc = "yes"; then
+             deplibs="$deplibs -lc"
+           fi
+           ;;
+         esac
+       fi
+
+       # Transform deplibs into only deplibs that can be linked in shared.
+       name_save=$name
+       libname_save=$libname
+       release_save=$release
+       versuffix_save=$versuffix
+       major_save=$major
+       # I'm not sure if I'm treating the release correctly.  I think
+       # release should show up in the -l (ie -lgmp5) so we don't want to
+       # add it in twice.  Is that correct?
+       release=""
+       versuffix=""
+       major=""
+       newdeplibs=
+       droppeddeps=no
+       case $deplibs_check_method in
+       pass_all)
+         # Don't check for shared/static.  Everything works.
+         # This might be a little naive.  We might want to check
+         # whether the library exists or not.  But this is on
+         # osf3 & osf4 and I'm not really sure... Just
+         # implementing what was already the behaviour.
+         newdeplibs=$deplibs
+         ;;
+       test_compile)
+         # This code stresses the "libraries are programs" paradigm to its
+         # limits. Maybe even breaks it.  We compile a program, linking it
+         # against the deplibs as a proxy for the library.  Then we can check
+         # whether they linked in statically or dynamically with ldd.
+         $rm conftest.c
+         cat > conftest.c <<EOF
+         int main() { return 0; }
+EOF
+         $rm conftest
+         $LTCC -o conftest conftest.c $deplibs
+         if test $? -eq 0 ; then
+           ldd_output=`ldd conftest`
+           for i in $deplibs; do
+             name="`expr $i : '-l\(.*\)'`"
+             # If $name is empty we are operating on a -L argument.
+             if test -n "$name" && test "$name" != "0"; then
+               libname=`eval \\$echo \"$libname_spec\"`
+               deplib_matches=`eval \\$echo \"$library_names_spec\"`
+               set dummy $deplib_matches
+               deplib_match=$2
+               if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+                 newdeplibs="$newdeplibs $i"
+               else
+                 droppeddeps=yes
+                 echo
+                 echo "*** Warning: This library needs some functionality provided by $i."
+                 echo "*** I have the capability to make that library automatically link in when"
+                 echo "*** you link to this library.  But I can only do this if you have a"
+                 echo "*** shared version of the library, which you do not appear to have."
+               fi
+             else
+               newdeplibs="$newdeplibs $i"
+             fi
+           done
+         else
+           # Error occurred in the first compile.  Let's try to salvage the situation:
+           # Compile a separate program for each library.
+           for i in $deplibs; do
+             name="`expr $i : '-l\(.*\)'`"
+            # If $name is empty we are operating on a -L argument.
+             if test -n "$name" && test "$name" != "0"; then
+               $rm conftest
+               $LTCC -o conftest conftest.c $i
+               # Did it work?
+               if test $? -eq 0 ; then
+                 ldd_output=`ldd conftest`
+                 libname=`eval \\$echo \"$libname_spec\"`
+                 deplib_matches=`eval \\$echo \"$library_names_spec\"`
+                 set dummy $deplib_matches
+                 deplib_match=$2
+                 if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+                   newdeplibs="$newdeplibs $i"
+                 else
+                   droppeddeps=yes
+                   echo
+                   echo "*** Warning: This library needs some functionality provided by $i."
+                   echo "*** I have the capability to make that library automatically link in when"
+                   echo "*** you link to this library.  But I can only do this if you have a"
+                   echo "*** shared version of the library, which you do not appear to have."
+                 fi
+               else
+                 droppeddeps=yes
+                 echo
+                 echo "*** Warning!  Library $i is needed by this library but I was not able to"
+                 echo "***  make it link in!  You will probably need to install it or some"
+                 echo "*** library that it depends on before this library will be fully"
+                 echo "*** functional.  Installing it before continuing would be even better."
+               fi
+             else
+               newdeplibs="$newdeplibs $i"
+             fi
+           done
+         fi
+         ;;
+       file_magic*)
+         set dummy $deplibs_check_method
+         file_magic_regex=`expr "$deplibs_check_method" : "$2 \(.*\)"`
+         for a_deplib in $deplibs; do
+           name="`expr $a_deplib : '-l\(.*\)'`"
+           # If $name is empty we are operating on a -L argument.
+           if test -n "$name" && test "$name" != "0"; then
+             libname=`eval \\$echo \"$libname_spec\"`
+             for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
+                   potential_libs=`ls $i/$libname[.-]* 2>/dev/null`
+                   for potent_lib in $potential_libs; do
+                     # Follow soft links.
+                     if ls -lLd "$potent_lib" 2>/dev/null \
+                        | grep " -> " >/dev/null; then
+                       continue
+                     fi
+                     # The statement above tries to avoid entering an
+                     # endless loop below, in case of cyclic links.
+                     # We might still enter an endless loop, since a link
+                     # loop can be closed while we follow links,
+                     # but so what?
+                     potlib="$potent_lib"
+                     while test -h "$potlib" 2>/dev/null; do
+                       potliblink=`ls -ld $potlib | sed 's/.* -> //'`
+                       case $potliblink in
+                       [\\/]* | [A-Za-z]:[\\/]*) potlib="$potliblink";;
+                       *) potlib=`$echo "X$potlib" | $Xsed -e 's,[^/]*$,,'`"$potliblink";;
+                       esac
+                     done
+                     if eval $file_magic_cmd \"\$potlib\" 2>/dev/null \
+                        | sed 10q \
+                        | egrep "$file_magic_regex" > /dev/null; then
+                       newdeplibs="$newdeplibs $a_deplib"
+                       a_deplib=""
+                       break 2
+                     fi
+                   done
+             done
+             if test -n "$a_deplib" ; then
+               droppeddeps=yes
+               echo
+               echo "*** Warning: This library needs some functionality provided by $a_deplib."
+               echo "*** I have the capability to make that library automatically link in when"
+               echo "*** you link to this library.  But I can only do this if you have a"
+               echo "*** shared version of the library, which you do not appear to have."
+             fi
+           else
+             # Add a -L argument.
+             newdeplibs="$newdeplibs $a_deplib"
+           fi
+         done # Gone through all deplibs.
+         ;;
+       match_pattern*)
+         set dummy $deplibs_check_method
+         match_pattern_regex=`expr "$deplibs_check_method" : "$2 \(.*\)"`
+         for a_deplib in $deplibs; do
+           name="`expr $a_deplib : '-l\(.*\)'`"
+           # If $name is empty we are operating on a -L argument.
+           if test -n "$name" && test "$name" != "0"; then
+             libname=`eval \\$echo \"$libname_spec\"`
+             for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
+               potential_libs=`ls $i/$libname[.-]* 2>/dev/null`
+               for potent_lib in $potential_libs; do
+                 if eval echo \"$potent_lib\" 2>/dev/null \
+                     | sed 10q \
+                     | egrep "$match_pattern_regex" > /dev/null; then
+                   newdeplibs="$newdeplibs $a_deplib"
+                   a_deplib=""
+                   break 2
+                 fi
+               done
+             done
+             if test -n "$a_deplib" ; then
+               droppeddeps=yes
+               echo
+               echo "*** Warning: This library needs some functionality provided by $a_deplib."
+               echo "*** I have the capability to make that library automatically link in when"
+               echo "*** you link to this library.  But I can only do this if you have a"
+               echo "*** shared version of the library, which you do not appear to have."
+             fi
+           else
+             # Add a -L argument.
+             newdeplibs="$newdeplibs $a_deplib"
+           fi
+         done # Gone through all deplibs.
+         ;;
+       none | unknown | *)
+         newdeplibs=""
+         if $echo "X $deplibs" | $Xsed -e 's/ -lc$//' \
+              -e 's/ -[LR][^ ]*//g' -e 's/[    ]//g' |
+            grep . >/dev/null; then
+           echo
+           if test "X$deplibs_check_method" = "Xnone"; then
+             echo "*** Warning: inter-library dependencies are not supported in this platform."
+           else
+             echo "*** Warning: inter-library dependencies are not known to be supported."
+           fi
+           echo "*** All declared inter-library dependencies are being dropped."
+           droppeddeps=yes
+         fi
+         ;;
+       esac
+       versuffix=$versuffix_save
+       major=$major_save
+       release=$release_save
+       libname=$libname_save
+       name=$name_save
+
+       case $host in
+       *-*-rhapsody* | *-*-darwin1.[012])
+         # On Rhapsody replace the C library with the System framework
+         newdeplibs=`$echo "X $newdeplibs" | $Xsed -e 's/ -lc / -framework System /'`
+         ;;
+       esac
+
+       if test "$droppeddeps" = yes; then
+         if test "$module" = yes; then
+           echo
+           echo "*** Warning: libtool could not satisfy all declared inter-library"
+           echo "*** dependencies of module $libname.  Therefore, libtool will create"
+           echo "*** a static module, that should work as long as the dlopening"
+           echo "*** application is linked with the -dlopen flag."
+           if test -z "$global_symbol_pipe"; then
+             echo
+             echo "*** However, this would only work if libtool was able to extract symbol"
+             echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+             echo "*** not find such a program.  So, this module is probably useless."
+             echo "*** \`nm' from GNU binutils and a full rebuild may help."
+           fi
+           if test "$build_old_libs" = no; then
+             oldlibs="$output_objdir/$libname.$libext"
+             build_libtool_libs=module
+             build_old_libs=yes
+           else
+             build_libtool_libs=no
+           fi
+         else
+           echo "*** The inter-library dependencies that have been dropped here will be"
+           echo "*** automatically added whenever a program is linked with this library"
+           echo "*** or is declared to -dlopen it."
+
+           if test $allow_undefined = no; then
+             echo
+             echo "*** Since this library must not contain undefined symbols,"
+             echo "*** because either the platform does not support them or"
+             echo "*** it was explicitly requested with -no-undefined,"
+             echo "*** libtool will only create a static version of it."
+             if test "$build_old_libs" = no; then
+               oldlibs="$output_objdir/$libname.$libext"
+               build_libtool_libs=module
+               build_old_libs=yes
+             else
+               build_libtool_libs=no
+             fi
+           fi
+         fi
+       fi
+       # Done checking deplibs!
+       deplibs=$newdeplibs
+      fi
+
+      # All the library-specific variables (install_libdir is set above).
+      library_names=
+      old_library=
+      dlname=
+
+      # Test again, we may have decided not to build it any more
+      if test "$build_libtool_libs" = yes; then
+       if test $hardcode_into_libs = yes; then
+         # Hardcode the library paths
+         hardcode_libdirs=
+         dep_rpath=
+         rpath="$finalize_rpath"
+         test "$mode" != relink && rpath="$compile_rpath$rpath"
+         for libdir in $rpath; do
+           if test -n "$hardcode_libdir_flag_spec"; then
+             if test -n "$hardcode_libdir_separator"; then
+               if test -z "$hardcode_libdirs"; then
+                 hardcode_libdirs="$libdir"
+               else
+                 # Just accumulate the unique libdirs.
+                 case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+                 *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+                   ;;
+                 *)
+                   hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+                   ;;
+                 esac
+               fi
+             else
+               eval flag=\"$hardcode_libdir_flag_spec\"
+               dep_rpath="$dep_rpath $flag"
+             fi
+           elif test -n "$runpath_var"; then
+             case "$perm_rpath " in
+             *" $libdir "*) ;;
+             *) perm_rpath="$perm_rpath $libdir" ;;
+             esac
+           fi
+         done
+         # Substitute the hardcoded libdirs into the rpath.
+         if test -n "$hardcode_libdir_separator" &&
+            test -n "$hardcode_libdirs"; then
+           libdir="$hardcode_libdirs"
+           eval dep_rpath=\"$hardcode_libdir_flag_spec\"
+         fi
+         if test -n "$runpath_var" && test -n "$perm_rpath"; then
+           # We should set the runpath_var.
+           rpath=
+           for dir in $perm_rpath; do
+             rpath="$rpath$dir:"
+           done
+           eval "$runpath_var='$rpath\$$runpath_var'; export $runpath_var"
+         fi
+         test -n "$dep_rpath" && deplibs="$dep_rpath $deplibs"
+       fi
+
+       shlibpath="$finalize_shlibpath"
+       test "$mode" != relink && shlibpath="$compile_shlibpath$shlibpath"
+       if test -n "$shlibpath"; then
+         eval "$shlibpath_var='$shlibpath\$$shlibpath_var'; export $shlibpath_var"
+       fi
+
+       # Get the real and link names of the library.
+       eval library_names=\"$library_names_spec\"
+       set dummy $library_names
+       realname="$2"
+       shift; shift
+
+       if test -n "$soname_spec"; then
+         eval soname=\"$soname_spec\"
+       else
+         soname="$realname"
+       fi
+       test -z "$dlname" && dlname=$soname
+
+       lib="$output_objdir/$realname"
+       for link
+       do
+         linknames="$linknames $link"
+       done
+
+#      # Ensure that we have .o objects for linkers which dislike .lo
+#      # (e.g. aix) in case we are running --disable-static
+#      for obj in $libobjs; do
+#        xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'`
+#        if test "X$xdir" = "X$obj"; then
+#          xdir="."
+#        else
+#          xdir="$xdir"
+#        fi
+#        baseobj=`$echo "X$obj" | $Xsed -e 's%^.*/%%'`
+#        oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"`
+#        if test ! -f $xdir/$oldobj && test "$baseobj" != "$oldobj"; then
+#          $show "(cd $xdir && ${LN_S} $baseobj $oldobj)"
+#          $run eval '(cd $xdir && ${LN_S} $baseobj $oldobj)' || exit $?
+#        fi
+#      done
+
+       # Use standard objects if they are pic
+       test -z "$pic_flag" && libobjs=`$echo "X$libobjs" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+
+       # Prepare the list of exported symbols
+       if test -z "$export_symbols"; then
+         if test "$always_export_symbols" = yes || test -n "$export_symbols_regex"; then
+           $show "generating symbol list for \`$libname.la'"
+           export_symbols="$output_objdir/$libname.exp"
+           $run $rm $export_symbols
+           eval cmds=\"$export_symbols_cmds\"
+           IFS="${IFS=         }"; save_ifs="$IFS"; IFS='~'
+           for cmd in $cmds; do
+             IFS="$save_ifs"
+             $show "$cmd"
+             $run eval "$cmd" || exit $?
+           done
+           IFS="$save_ifs"
+           if test -n "$export_symbols_regex"; then
+             $show "egrep -e \"$export_symbols_regex\" \"$export_symbols\" > \"${export_symbols}T\""
+             $run eval 'egrep -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"'
+             $show "$mv \"${export_symbols}T\" \"$export_symbols\""
+             $run eval '$mv "${export_symbols}T" "$export_symbols"'
+           fi
+         fi
+       fi
+
+       if test -n "$export_symbols" && test -n "$include_expsyms"; then
+         $run eval '$echo "X$include_expsyms" | $SP2NL >> "$export_symbols"'
+       fi
+
+       if test -n "$convenience"; then
+         if test -n "$whole_archive_flag_spec"; then
+           save_libobjs=$libobjs
+           eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
+         else
+           gentop="$output_objdir/${outputname}x"
+           $show "${rm}r $gentop"
+           $run ${rm}r "$gentop"
+           $show "$mkdir $gentop"
+           $run $mkdir "$gentop"
+           status=$?
+           if test $status -ne 0 && test ! -d "$gentop"; then
+             exit $status
+           fi
+           generated="$generated $gentop"
+
+           for xlib in $convenience; do
+             # Extract the objects.
+             case $xlib in
+             [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+             *) xabs=`pwd`"/$xlib" ;;
+             esac
+             xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+             xdir="$gentop/$xlib"
+
+             $show "${rm}r $xdir"
+             $run ${rm}r "$xdir"
+             $show "$mkdir $xdir"
+             $run $mkdir "$xdir"
+             status=$?
+             if test $status -ne 0 && test ! -d "$xdir"; then
+               exit $status
+             fi
+             $show "(cd $xdir && $AR x $xabs)"
+             $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+             libobjs="$libobjs "`find $xdir -name \*.$objext -print -o -name \*.lo -print | $NL2SP`
+           done
+         fi
+       fi
+
+       if test "$thread_safe" = yes && test -n "$thread_safe_flag_spec"; then
+         eval flag=\"$thread_safe_flag_spec\"
+         linker_flags="$linker_flags $flag"
+       fi
+
+       # Make a backup of the uninstalled library when relinking
+       if test "$mode" = relink; then
+         $run eval '(cd $output_objdir && $rm ${realname}U && $mv $realname ${realname}U)' || exit $?
+       fi
+
+       # Do each of the archive commands.
+       if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then
+         eval cmds=\"$archive_expsym_cmds\"
+       else
+         eval cmds=\"$archive_cmds\"
+       fi
+        if len=`expr "X$cmds" : ".*"` &&
+           test $len -le $max_cmd_len; then
+          :
+        else
+         # The command line is too long to link in one step, link piecewise.
+          $echo "creating reloadable object files..."
+
+         # Save the value of $output and $libobjs because we want to
+         # use them later.  If we have whole_archive_flag_spec, we
+         # want to use save_libobjs as it was before
+         # whole_archive_flag_spec was expanded, because we can't
+         # assume the linker understands whole_archive_flag_spec.
+         # This may have to be revisited, in case too many
+         # convenience libraries get linked in and end up exceeding
+         # the spec.
+         if test -z "$convenience" || test -z "$whole_archive_flag_spec"; then
+           save_libobjs=$libobjs
+         fi
+          save_output=$output
+
+         # Clear the reloadable object creation command queue and
+         # initialize k to one.
+          test_cmds=
+          concat_cmds=
+          objlist=
+          delfiles=
+          last_robj=
+          k=1
+          output=$output_objdir/$save_output-${k}.$objext
+         # Loop over the list of objects to be linked.
+          for obj in $save_libobjs
+          do
+            eval test_cmds=\"$reload_cmds $objlist $last_robj\"
+            if test "X$objlist" = X ||
+              { len=`expr "X$test_cmds" : ".*"` &&
+                 test $len -le $max_cmd_len; }; then
+              objlist="$objlist $obj"
+            else
+             # The command $test_cmds is almost too long, add a
+             # command to the queue.
+              if test $k -eq 1 ; then
+               # The first file doesn't have a previous command to add.
+                eval concat_cmds=\"$reload_cmds $objlist $last_robj\"
+              else
+               # All subsequent reloadable object files will link in
+               # the last one created.
+                eval concat_cmds=\"\$concat_cmds~$reload_cmds $objlist $last_robj\"
+              fi
+              last_robj=$output_objdir/$save_output-${k}.$objext
+              k=`expr $k + 1`
+              output=$output_objdir/$save_output-${k}.$objext
+              objlist=$obj
+              len=1
+            fi
+          done
+         # Handle the remaining objects by creating one last
+         # reloadable object file.  All subsequent reloadable object
+         # files will link in the last one created.
+         test -z "$concat_cmds" || concat_cmds=$concat_cmds~
+          eval concat_cmds=\"\${concat_cmds}$reload_cmds $objlist $last_robj\"
+
+         # Set up a command to remove the reloadable object files
+         # after they are used.
+          i=0
+          while test $i -lt $k
+          do
+            i=`expr $i + 1`
+            delfiles="$delfiles $output_objdir/$save_output-${i}.$objext"
+          done
+
+          $echo "creating a temporary reloadable object file: $output"
+
+         # Loop through the commands generated above and execute them.
+          IFS="${IFS=  }"; save_ifs="$IFS"; IFS='~'
+          for cmd in $concat_cmds; do
+            IFS="$save_ifs"
+            $show "$cmd"
+            $run eval "$cmd" || exit $?
+          done
+          IFS="$save_ifs"
+
+          libobjs=$output
+         # Restore the value of output.
+          output=$save_output
+
+         if test -n "$convenience" && test -n "$whole_archive_flag_spec"; then
+           eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
+         fi
+         # Expand the library linking commands again to reset the
+         # value of $libobjs for piecewise linking.
+
+         # Do each of the archive commands.
+          if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then
+            eval cmds=\"$archive_expsym_cmds\"
+          else
+            eval cmds=\"$archive_cmds\"
+          fi
+
+         # Append the command to remove the reloadable object files
+         # to the just-reset $cmds.
+          eval cmds=\"\$cmds~$rm $delfiles\"
+        fi
+        IFS="${IFS=    }"; save_ifs="$IFS"; IFS='~'
+        for cmd in $cmds; do
+          IFS="$save_ifs"
+          $show "$cmd"
+          $run eval "$cmd" || exit $?
+        done
+        IFS="$save_ifs"
+
+       # Restore the uninstalled library and exit
+       if test "$mode" = relink; then
+         $run eval '(cd $output_objdir && $rm ${realname}T && $mv $realname ${realname}T && $mv "$realname"U $realname)' || exit $?
+         exit 0
+       fi
+
+       # Create links to the real library.
+       for linkname in $linknames; do
+         if test "$realname" != "$linkname"; then
+           $show "(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)"
+           $run eval '(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)' || exit $?
+         fi
+       done
+
+       # If -module or -export-dynamic was specified, set the dlname.
+       if test "$module" = yes || test "$export_dynamic" = yes; then
+         # On all known operating systems, these are identical.
+         dlname="$soname"
+       fi
+      fi
+      ;;
+
+    obj)
+      if test -n "$deplibs"; then
+       $echo "$modename: warning: \`-l' and \`-L' are ignored for objects" 1>&2
+      fi
+
+      if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+       $echo "$modename: warning: \`-dlopen' is ignored for objects" 1>&2
+      fi
+
+      if test -n "$rpath"; then
+       $echo "$modename: warning: \`-rpath' is ignored for objects" 1>&2
+      fi
+
+      if test -n "$xrpath"; then
+       $echo "$modename: warning: \`-R' is ignored for objects" 1>&2
+      fi
+
+      if test -n "$vinfo"; then
+       $echo "$modename: warning: \`-version-info' is ignored for objects" 1>&2
+      fi
+
+      if test -n "$release"; then
+       $echo "$modename: warning: \`-release' is ignored for objects" 1>&2
+      fi
+
+      case $output in
+      *.lo)
+       if test -n "$objs$old_deplibs"; then
+         $echo "$modename: cannot build library object \`$output' from non-libtool objects" 1>&2
+         exit 1
+       fi
+       libobj="$output"
+       obj=`$echo "X$output" | $Xsed -e "$lo2o"`
+       ;;
+      *)
+       libobj=
+       obj="$output"
+       ;;
+      esac
+
+      # Delete the old objects.
+      $run $rm $obj $libobj
+
+      # Objects from convenience libraries.  This assumes
+      # single-version convenience libraries.  Whenever we create
+      # different ones for PIC/non-PIC, we'll have to duplicate
+      # the extraction.
+      reload_conv_objs=
+      gentop=
+      # reload_cmds runs $LD directly, so let us get rid of
+      # -Wl from whole_archive_flag_spec
+      wl=
+
+      if test -n "$convenience"; then
+       if test -n "$whole_archive_flag_spec"; then
+         eval reload_conv_objs=\"\$reload_objs $whole_archive_flag_spec\"
+       else
+         gentop="$output_objdir/${obj}x"
+         $show "${rm}r $gentop"
+         $run ${rm}r "$gentop"
+         $show "$mkdir $gentop"
+         $run $mkdir "$gentop"
+         status=$?
+         if test $status -ne 0 && test ! -d "$gentop"; then
+           exit $status
+         fi
+         generated="$generated $gentop"
+
+         for xlib in $convenience; do
+           # Extract the objects.
+           case $xlib in
+           [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+           *) xabs=`pwd`"/$xlib" ;;
+           esac
+           xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+           xdir="$gentop/$xlib"
+
+           $show "${rm}r $xdir"
+           $run ${rm}r "$xdir"
+           $show "$mkdir $xdir"
+           $run $mkdir "$xdir"
+           status=$?
+           if test $status -ne 0 && test ! -d "$xdir"; then
+             exit $status
+           fi
+           $show "(cd $xdir && $AR x $xabs)"
+           $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+           reload_conv_objs="$reload_objs "`find $xdir -name \*.$objext -print -o -name \*.lo -print | $NL2SP`
+         done
+       fi
+      fi
+
+      # Create the old-style object.
+      reload_objs="$objs$old_deplibs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}$'/d' -e '/\.lib$/d' -e "$lo2o" | $NL2SP`" $reload_conv_objs" ### testsuite: skip nested quoting test
+
+      output="$obj"
+      eval cmds=\"$reload_cmds\"
+      IFS="${IFS=      }"; save_ifs="$IFS"; IFS='~'
+      for cmd in $cmds; do
+       IFS="$save_ifs"
+       $show "$cmd"
+       $run eval "$cmd" || exit $?
+      done
+      IFS="$save_ifs"
+
+      # Exit if we aren't doing a library object file.
+      if test -z "$libobj"; then
+       if test -n "$gentop"; then
+         $show "${rm}r $gentop"
+         $run ${rm}r $gentop
+       fi
+
+       exit 0
+      fi
+
+      if test "$build_libtool_libs" != yes; then
+       if test -n "$gentop"; then
+         $show "${rm}r $gentop"
+         $run ${rm}r $gentop
+       fi
+
+       # Create an invalid libtool object if no PIC, so that we don't
+       # accidentally link it into a program.
+       # $show "echo timestamp > $libobj"
+       # $run eval "echo timestamp > $libobj" || exit $?
+       exit 0
+      fi
+
+      if test -n "$pic_flag" || test "$pic_mode" != default; then
+       # Only do commands if we really have different PIC objects.
+       reload_objs="$libobjs $reload_conv_objs"
+       output="$libobj"
+       eval cmds=\"$reload_cmds\"
+       IFS="${IFS=     }"; save_ifs="$IFS"; IFS='~'
+       for cmd in $cmds; do
+         IFS="$save_ifs"
+         $show "$cmd"
+         $run eval "$cmd" || exit $?
+       done
+       IFS="$save_ifs"
+#     else
+#      # Just create a symlink.
+#      $show $rm $libobj
+#      $run $rm $libobj
+#      xdir=`$echo "X$libobj" | $Xsed -e 's%/[^/]*$%%'`
+#      if test "X$xdir" = "X$libobj"; then
+#        xdir="."
+#      else
+#        xdir="$xdir"
+#      fi
+#      baseobj=`$echo "X$libobj" | $Xsed -e 's%^.*/%%'`
+#      oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"`
+#      $show "(cd $xdir && $LN_S $oldobj $baseobj)"
+#      $run eval '(cd $xdir && $LN_S $oldobj $baseobj)' || exit $?
+      fi
+
+      if test -n "$gentop"; then
+       $show "${rm}r $gentop"
+       $run ${rm}r $gentop
+      fi
+
+      exit 0
+      ;;
+
+    prog)
+      # On cygwin make sure the output name ends in exactly one `.exe':
+      # strip an existing suffix, then append it.  The dot is escaped so
+      # that only a literal `.exe' is removed (e.g. `myexe' stays intact).
+      case $host in
+       *cygwin*) output=`echo $output | sed -e 's,\.exe$,,;s,$,.exe,'` ;;
+      esac
+      if test -n "$vinfo"; then
+       $echo "$modename: warning: \`-version-info' is ignored for programs" 1>&2
+      fi
+
+      if test -n "$release"; then
+       $echo "$modename: warning: \`-release' is ignored for programs" 1>&2
+      fi
+
+      if test "$preload" = yes; then
+       if test "$dlopen_support" = unknown && test "$dlopen_self" = unknown &&
+          test "$dlopen_self_static" = unknown; then
+         $echo "$modename: warning: \`AC_LIBTOOL_DLOPEN' not used. Assuming no dlopen support."
+       fi
+      fi
+
+      case $host in
+      *-*-rhapsody* | *-*-darwin1.[012])
+       # On Rhapsody replace the C library with the System framework
+       compile_deplibs=`$echo "X $compile_deplibs" | $Xsed -e 's/ -lc / -framework System /'`
+       finalize_deplibs=`$echo "X $finalize_deplibs" | $Xsed -e 's/ -lc / -framework System /'`
+       ;;
+      esac
+
+      compile_command="$compile_command $compile_deplibs"
+      finalize_command="$finalize_command $finalize_deplibs"
+
+      if test -n "$rpath$xrpath"; then
+       # If the user specified any rpath flags, then add them.
+       for libdir in $rpath $xrpath; do
+         # This is the magic to use -rpath.
+         case "$finalize_rpath " in
+         *" $libdir "*) ;;
+         *) finalize_rpath="$finalize_rpath $libdir" ;;
+         esac
+       done
+      fi
+
+      # Now hardcode the library paths
+      rpath=
+      hardcode_libdirs=
+      for libdir in $compile_rpath $finalize_rpath; do
+       if test -n "$hardcode_libdir_flag_spec"; then
+         if test -n "$hardcode_libdir_separator"; then
+           if test -z "$hardcode_libdirs"; then
+             hardcode_libdirs="$libdir"
+           else
+             # Just accumulate the unique libdirs.
+             case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+             *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+               ;;
+             *)
+               hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+               ;;
+             esac
+           fi
+         else
+           eval flag=\"$hardcode_libdir_flag_spec\"
+           rpath="$rpath $flag"
+         fi
+       elif test -n "$runpath_var"; then
+         case "$perm_rpath " in
+         *" $libdir "*) ;;
+         *) perm_rpath="$perm_rpath $libdir" ;;
+         esac
+       fi
+       case $host in
+       *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+         case :$dllsearchpath: in
+         *":$libdir:"*) ;;
+         *) dllsearchpath="$dllsearchpath:$libdir";;
+         esac
+         ;;
+       esac
+      done
+      # Substitute the hardcoded libdirs into the rpath.
+      if test -n "$hardcode_libdir_separator" &&
+        test -n "$hardcode_libdirs"; then
+       libdir="$hardcode_libdirs"
+       eval rpath=\" $hardcode_libdir_flag_spec\"
+      fi
+      compile_rpath="$rpath"
+
+      rpath=
+      hardcode_libdirs=
+      for libdir in $finalize_rpath; do
+       if test -n "$hardcode_libdir_flag_spec"; then
+         if test -n "$hardcode_libdir_separator"; then
+           if test -z "$hardcode_libdirs"; then
+             hardcode_libdirs="$libdir"
+           else
+             # Just accumulate the unique libdirs.
+             case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+             *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+               ;;
+             *)
+               hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+               ;;
+             esac
+           fi
+         else
+           eval flag=\"$hardcode_libdir_flag_spec\"
+           rpath="$rpath $flag"
+         fi
+       elif test -n "$runpath_var"; then
+         case "$finalize_perm_rpath " in
+         *" $libdir "*) ;;
+         *) finalize_perm_rpath="$finalize_perm_rpath $libdir" ;;
+         esac
+       fi
+      done
+      # Substitute the hardcoded libdirs into the rpath.
+      if test -n "$hardcode_libdir_separator" &&
+        test -n "$hardcode_libdirs"; then
+       libdir="$hardcode_libdirs"
+       eval rpath=\" $hardcode_libdir_flag_spec\"
+      fi
+      finalize_rpath="$rpath"
+
+      dlsyms=
+      if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+       if test -n "$NM" && test -n "$global_symbol_pipe"; then
+         dlsyms="${outputname}S.c"
+       else
+         $echo "$modename: not configured to extract global symbols from dlpreopened files" 1>&2
+       fi
+      fi
+
+      if test -n "$dlsyms"; then
+       case $dlsyms in
+       "") ;;
+       *.c)
+         # Discover the nlist of each of the dlfiles.
+         nlist="$output_objdir/${outputname}.nm"
+
+         $show "$rm $nlist ${nlist}S ${nlist}T"
+         $run $rm "$nlist" "${nlist}S" "${nlist}T"
+
+         # Parse the name list into a source file.
+         $show "creating $output_objdir/$dlsyms"
+
+         test -z "$run" && $echo > "$output_objdir/$dlsyms" "\
+/* $dlsyms - symbol resolution table for \`$outputname' dlsym emulation. */
+/* Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP */
+
+#ifdef __cplusplus
+extern \"C\" {
+#endif
+
+/* Prevent the only kind of declaration conflicts we can make. */
+#define lt_preloaded_symbols some_other_symbol
+
+/* External symbol declarations for the compiler. */\
+"
+
+         if test "$dlself" = yes; then
+           $show "generating symbol list for \`$output'"
+
+           test -z "$run" && $echo ': @PROGRAM@ ' > "$nlist"
+
+           # Add our own program objects to the symbol list.
+           progfiles="$objs$old_deplibs"
+           for arg in $progfiles; do
+             $show "extracting global C symbols from \`$arg'"
+             $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
+           done
+
+           if test -n "$exclude_expsyms"; then
+             $run eval 'egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T'
+             $run eval '$mv "$nlist"T "$nlist"'
+           fi
+
+           if test -n "$export_symbols_regex"; then
+             $run eval 'egrep -e "$export_symbols_regex" "$nlist" > "$nlist"T'
+             $run eval '$mv "$nlist"T "$nlist"'
+           fi
+
+           # Prepare the list of exported symbols
+           if test -z "$export_symbols"; then
+             export_symbols="$output_objdir/$output.exp"
+             $run $rm $export_symbols
+             $run eval "sed -n -e '/^: @PROGRAM@$/d' -e 's/^.* \(.*\)$/\1/p' "'< "$nlist" > "$export_symbols"'
+           else
+             $run eval "sed -e 's/\([][.*^$]\)/\\\1/g' -e 's/^/ /' -e 's/$/$/'"' < "$export_symbols" > "$output_objdir/$output.exp"'
+             $run eval 'grep -f "$output_objdir/$output.exp" < "$nlist" > "$nlist"T'
+             $run eval 'mv "$nlist"T "$nlist"'
+           fi
+         fi
+
+         for arg in $dlprefiles; do
+           $show "extracting global C symbols from \`$arg'"
+           name=`echo "$arg" | sed -e 's%^.*/%%'`
+           $run eval 'echo ": $name " >> "$nlist"'
+           $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
+         done
+
+         if test -z "$run"; then
+           # Make sure we have at least an empty file.
+           test -f "$nlist" || : > "$nlist"
+
+           if test -n "$exclude_expsyms"; then
+             egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T
+             $mv "$nlist"T "$nlist"
+           fi
+
+           # Try sorting and uniquifying the output.
+           if grep -v "^: " < "$nlist" | sort +2 | uniq > "$nlist"S; then
+             :
+           else
+             grep -v "^: " < "$nlist" > "$nlist"S
+           fi
+
+           if test -f "$nlist"S; then
+             eval "$global_symbol_to_cdecl"' < "$nlist"S >> "$output_objdir/$dlsyms"'
+           else
+             echo '/* NONE */' >> "$output_objdir/$dlsyms"
+           fi
+
+           $echo >> "$output_objdir/$dlsyms" "\
+
+#undef lt_preloaded_symbols
+
+#if defined (__STDC__) && __STDC__
+# define lt_ptr_t void *
+#else
+# define lt_ptr_t char *
+# define const
+#endif
+
+/* The mapping between symbol names and symbols. */
+const struct {
+  const char *name;
+  lt_ptr_t address;
+}
+lt_preloaded_symbols[] =
+{\
+"
+
+           sed -n -e 's/^: \([^ ]*\) $/  {\"\1\", (lt_ptr_t) 0},/p' \
+               -e 's/^. \([^ ]*\) \([^ ]*\)$/  {"\2", (lt_ptr_t) \&\2},/p' \
+                 < "$nlist" >> "$output_objdir/$dlsyms"
+
+           $echo >> "$output_objdir/$dlsyms" "\
+  {0, (lt_ptr_t) 0}
+};
+
+/* This works around a problem in FreeBSD linker */
+#ifdef FREEBSD_WORKAROUND
+static const void *lt_preloaded_setup() {
+  return lt_preloaded_symbols;
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif\
+"
+         fi
+
+         pic_flag_for_symtable=
+         case $host in
+         # compiling the symbol table file with pic_flag works around
+         # a FreeBSD bug that causes programs to crash when -lm is
+         # linked before any other PIC object.  But we must not use
+         # pic_flag when linking with -static.  The problem exists in
+         # FreeBSD 2.2.6 and is fixed in FreeBSD 3.1.
+         *-*-freebsd2*|*-*-freebsd3.0*|*-*-freebsdelf3.0*)
+           case "$compile_command " in
+           *" -static "*) ;;
+           *) pic_flag_for_symtable=" $pic_flag -DFREEBSD_WORKAROUND";;
+           esac;;
+         *-*-hpux*)
+           case "$compile_command " in
+           *" -static "*) ;;
+           *) pic_flag_for_symtable=" $pic_flag";;
+           esac
+         esac
+
+         # Now compile the dynamic symbol file.
+         $show "(cd $output_objdir && $LTCC -c$no_builtin_flag$pic_flag_for_symtable \"$dlsyms\")"
+         $run eval '(cd $output_objdir && $LTCC -c$no_builtin_flag$pic_flag_for_symtable "$dlsyms")' || exit $?
+
+         # Clean up the generated files.
+         $show "$rm $output_objdir/$dlsyms $nlist ${nlist}S ${nlist}T"
+         $run $rm "$output_objdir/$dlsyms" "$nlist" "${nlist}S" "${nlist}T"
+
+         # Transform the symbol file into the correct name.
+         compile_command=`$echo "X$compile_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"`
+         finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"`
+         ;;
+       *)
+         $echo "$modename: unknown suffix for \`$dlsyms'" 1>&2
+         exit 1
+         ;;
+       esac
+      else
+       # We keep going just in case the user didn't refer to
+       # lt_preloaded_symbols.  The linker will fail if global_symbol_pipe
+       # really was required.
+
+       # Nullify the symbol file.
+       compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
+       finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
+      fi
+
+      if test $need_relink = no || test "$build_libtool_libs" != yes; then
+       # Replace the output file specification.
+       compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
+       link_command="$compile_command$compile_rpath"
+
+       # We have no uninstalled library dependencies, so finalize right now.
+       $show "$link_command"
+       $run eval "$link_command"
+       status=$?
+
+       # Delete the generated files.
+       if test -n "$dlsyms"; then
+         $show "$rm $output_objdir/${outputname}S.${objext}"
+         $run $rm "$output_objdir/${outputname}S.${objext}"
+       fi
+
+       exit $status
+      fi
+
+      if test -n "$shlibpath_var"; then
+       # We should set the shlibpath_var
+       rpath=
+       for dir in $temp_rpath; do
+         case $dir in
+         [\\/]* | [A-Za-z]:[\\/]*)
+           # Absolute path.
+           rpath="$rpath$dir:"
+           ;;
+         *)
+           # Relative path: add a thisdir entry.
+           rpath="$rpath\$thisdir/$dir:"
+           ;;
+         esac
+       done
+       temp_rpath="$rpath"
+      fi
+
+      if test -n "$compile_shlibpath$finalize_shlibpath"; then
+       compile_command="$shlibpath_var=\"$compile_shlibpath$finalize_shlibpath\$$shlibpath_var\" $compile_command"
+      fi
+      if test -n "$finalize_shlibpath"; then
+       finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command"
+      fi
+
+      compile_var=
+      finalize_var=
+      if test -n "$runpath_var"; then
+       if test -n "$perm_rpath"; then
+         # We should set the runpath_var.
+         rpath=
+         for dir in $perm_rpath; do
+           rpath="$rpath$dir:"
+         done
+         compile_var="$runpath_var=\"$rpath\$$runpath_var\" "
+       fi
+       if test -n "$finalize_perm_rpath"; then
+         # We should set the runpath_var.
+         rpath=
+         for dir in $finalize_perm_rpath; do
+           rpath="$rpath$dir:"
+         done
+         finalize_var="$runpath_var=\"$rpath\$$runpath_var\" "
+       fi
+      fi
+
+      if test "$no_install" = yes; then
+       # We don't need to create a wrapper script.
+       link_command="$compile_var$compile_command$compile_rpath"
+       # Replace the output file specification.
+       link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
+       # Delete the old output file.
+       $run $rm $output
+       # Link the executable and exit
+       $show "$link_command"
+       $run eval "$link_command" || exit $?
+       exit 0
+      fi
+
+      if test "$hardcode_action" = relink; then
+       # Fast installation is not supported
+       link_command="$compile_var$compile_command$compile_rpath"
+       relink_command="$finalize_var$finalize_command$finalize_rpath"
+
+       $echo "$modename: warning: this platform does not like uninstalled shared libraries" 1>&2
+       $echo "$modename: \`$output' will be relinked during installation" 1>&2
+      else
+       if test "$fast_install" != no; then
+         link_command="$finalize_var$compile_command$finalize_rpath"
+         if test "$fast_install" = yes; then
+           relink_command=`$echo "X$compile_var$compile_command$compile_rpath" | $Xsed -e 's%@OUTPUT@%\$progdir/\$file%g'`
+         else
+           # fast_install is set to needless
+           relink_command=
+         fi
+       else
+         link_command="$compile_var$compile_command$compile_rpath"
+         relink_command="$finalize_var$finalize_command$finalize_rpath"
+       fi
+      fi
+
+      # Replace the output file specification.
+      link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output_objdir/$outputname"'%g'`
+
+      # Delete the old output files.
+      $run $rm $output $output_objdir/$outputname $output_objdir/lt-$outputname
+
+      $show "$link_command"
+      $run eval "$link_command" || exit $?
+
+      # Now create the wrapper script.
+      $show "creating $output"
+
+      # Quote the relink command for shipping.
+      if test -n "$relink_command"; then
+       # Preserve any variables that may affect compiler behavior
+       for var in $variables_saved_for_relink; do
+         if eval test -z \"\${$var+set}\"; then
+           relink_command="{ test -z \"\${$var+set}\" || unset $var || { $var=; export $var; }; }; $relink_command"
+         elif eval var_value=\$$var; test -z "$var_value"; then
+           relink_command="$var=; export $var; $relink_command"
+         else
+           var_value=`$echo "X$var_value" | $Xsed -e "$sed_quote_subst"`
+           relink_command="$var=\"$var_value\"; export $var; $relink_command"
+         fi
+       done
+       relink_command="cd `pwd`; $relink_command"
+       relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"`
+      fi
+
+      # Quote $echo for shipping.
+      if test "X$echo" = "X$SHELL $0 --fallback-echo"; then
+       case $0 in
+       [\\/]* | [A-Za-z]:[\\/]*) qecho="$SHELL $0 --fallback-echo";;
+       *) qecho="$SHELL `pwd`/$0 --fallback-echo";;
+       esac
+       qecho=`$echo "X$qecho" | $Xsed -e "$sed_quote_subst"`
+      else
+       qecho=`$echo "X$echo" | $Xsed -e "$sed_quote_subst"`
+      fi
+
+      # Only actually do things if our run command is non-null.
+      if test -z "$run"; then
+       # win32 will think the script is a binary if it has
+       # a .exe suffix, so we strip it off here.
+       case $output in
+         *.exe) output=`echo $output|sed 's,.exe$,,'` ;;
+       esac
+       # test for cygwin because mv fails w/o .exe extensions
+       case $host in
+         *cygwin*) exeext=.exe ;;
+         *) exeext= ;;
+       esac
+       $rm $output
+       trap "$rm $output; exit 1" 1 2 15
+
+       $echo > $output "\
+#! $SHELL
+
+# $output - temporary wrapper script for $objdir/$outputname
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
+#
+# The $output program cannot be directly executed until all the libtool
+# libraries that it depends on are installed.
+#
+# This wrapper script should never be moved out of the build directory.
+# If it is, it will not operate correctly.
+
+# Sed substitution that helps us do robust quoting.  It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e 1s/^X//'
+sed_quote_subst='$sed_quote_subst'
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test \"\${CDPATH+set}\" = set; then CDPATH=:; export CDPATH; fi
+
+relink_command=\"$relink_command\"
+
+# This environment variable determines our operation mode.
+if test \"\$libtool_install_magic\" = \"$magic\"; then
+  # install mode needs the following variable:
+  notinst_deplibs='$notinst_deplibs'
+else
+  # When we are sourced in execute mode, \$file and \$echo are already set.
+  if test \"\$libtool_execute_magic\" != \"$magic\"; then
+    echo=\"$qecho\"
+    file=\"\$0\"
+    # Make sure echo works.
+    if test \"X\$1\" = X--no-reexec; then
+      # Discard the --no-reexec flag, and continue.
+      shift
+    elif test \"X\`(\$echo '\t') 2>/dev/null\`\" = 'X\t'; then
+      # Yippee, \$echo works!
+      :
+    else
+      # Restart under the correct shell, and then maybe \$echo will work.
+      exec $SHELL \"\$0\" --no-reexec \${1+\"\$@\"}
+    fi
+  fi\
+"
+       $echo >> $output "\
+
+  # Find the directory that this script lives in.
+  thisdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*$%%'\`
+  test \"x\$thisdir\" = \"x\$file\" && thisdir=.
+
+  # Follow symbolic links until we get to the real thisdir.
+  file=\`ls -ld \"\$file\" | sed -n 's/.*-> //p'\`
+  while test -n \"\$file\"; do
+    destdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*\$%%'\`
+
+    # If there was a directory component, then change thisdir.
+    if test \"x\$destdir\" != \"x\$file\"; then
+      case \"\$destdir\" in
+      [\\\\/]* | [A-Za-z]:[\\\\/]*) thisdir=\"\$destdir\" ;;
+      *) thisdir=\"\$thisdir/\$destdir\" ;;
+      esac
+    fi
+
+    file=\`\$echo \"X\$file\" | \$Xsed -e 's%^.*/%%'\`
+    file=\`ls -ld \"\$thisdir/\$file\" | sed -n 's/.*-> //p'\`
+  done
+
+  # Try to get the absolute directory name.
+  absdir=\`cd \"\$thisdir\" && pwd\`
+  test -n \"\$absdir\" && thisdir=\"\$absdir\"
+"
+
+       if test "$fast_install" = yes; then
+         echo >> $output "\
+  program=lt-'$outputname'$exeext
+  progdir=\"\$thisdir/$objdir\"
+
+  if test ! -f \"\$progdir/\$program\" || \\
+     { file=\`ls -1dt \"\$progdir/\$program\" \"\$progdir/../\$program\" 2>/dev/null | sed 1q\`; \\
+       test \"X\$file\" != \"X\$progdir/\$program\"; }; then
+
+    file=\"\$\$-\$program\"
+
+    if test ! -d \"\$progdir\"; then
+      $mkdir \"\$progdir\"
+    else
+      $rm \"\$progdir/\$file\"
+    fi"
+
+         echo >> $output "\
+
+    # relink executable if necessary
+    if test -n \"\$relink_command\"; then
+      if relink_command_output=\`eval \$relink_command 2>&1\`; then :
+      else
+        $echo \"\$relink_command_output\" >&2
+       $rm \"\$progdir/\$file\"
+       exit 1
+      fi
+    fi
+
+    $mv \"\$progdir/\$file\" \"\$progdir/\$program\" 2>/dev/null ||
+    { $rm \"\$progdir/\$program\";
+      $mv \"\$progdir/\$file\" \"\$progdir/\$program\"; }
+    $rm \"\$progdir/\$file\"
+  fi"
+       else
+         echo >> $output "\
+  program='$outputname'
+  progdir=\"\$thisdir/$objdir\"
+"
+       fi
+
+       echo >> $output "\
+
+  if test -f \"\$progdir/\$program\"; then"
+
+       # Export our shlibpath_var if we have one.
+       if test "$shlibpath_overrides_runpath" = yes && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
+         $echo >> $output "\
+    # Add our own library path to $shlibpath_var
+    $shlibpath_var=\"$temp_rpath\$$shlibpath_var\"
+
+    # Some systems cannot cope with colon-terminated $shlibpath_var
+    # The second colon is a workaround for a bug in BeOS R4 sed
+    $shlibpath_var=\`\$echo \"X\$$shlibpath_var\" | \$Xsed -e 's/::*\$//'\`
+
+    export $shlibpath_var
+"
+       fi
+
+       # fixup the dll searchpath if we need to.
+       if test -n "$dllsearchpath"; then
+         $echo >> $output "\
+    # Add the dll search path components to the executable PATH
+    PATH=$dllsearchpath:\$PATH
+"
+       fi
+
+       $echo >> $output "\
+    if test \"\$libtool_execute_magic\" != \"$magic\"; then
+      # Run the actual program with our arguments.
+"
+       case $host in
+       # win32 systems need to use the prog path for dll
+       # lookup to work
+       *-*-cygwin* | *-*-pw32*)
+         $echo >> $output "\
+      exec \$progdir/\$program \${1+\"\$@\"}
+"
+         ;;
+
+       # Backslashes separate directories on plain windows
+       *-*-mingw | *-*-os2*)
+         $echo >> $output "\
+      exec \$progdir\\\\\$program \${1+\"\$@\"}
+"
+         ;;
+
+       *)
+         $echo >> $output "\
+      # Export the path to the program.
+      PATH=\"\$progdir:\$PATH\"
+      export PATH
+
+      exec \$program \${1+\"\$@\"}
+"
+         ;;
+       esac
+       $echo >> $output "\
+      \$echo \"\$0: cannot exec \$program \${1+\"\$@\"}\"
+      exit 1
+    fi
+  else
+    # The program doesn't exist.
+    \$echo \"\$0: error: \$progdir/\$program does not exist\" 1>&2
+    \$echo \"This script is just a wrapper for \$program.\" 1>&2
+    echo \"See the $PACKAGE documentation for more information.\" 1>&2
+    exit 1
+  fi
+fi\
+"
+       chmod +x $output
+      fi
+      exit 0
+      ;;
+    esac
+
+    # See if we need to build an old-fashioned archive.
+    for oldlib in $oldlibs; do
+
+      if test "$build_libtool_libs" = convenience; then
+       oldobjs="$libobjs_save"
+       addlibs="$convenience"
+       build_libtool_libs=no
+      else
+       if test "$build_libtool_libs" = module; then
+         oldobjs="$libobjs_save"
+         build_libtool_libs=no
+       else
+         oldobjs="$objs$old_deplibs $non_pic_objects"
+       fi
+       addlibs="$old_convenience"
+      fi
+
+      if test -n "$addlibs"; then
+       gentop="$output_objdir/${outputname}x"
+       $show "${rm}r $gentop"
+       $run ${rm}r "$gentop"
+       $show "$mkdir $gentop"
+       $run $mkdir "$gentop"
+       status=$?
+       if test $status -ne 0 && test ! -d "$gentop"; then
+         exit $status
+       fi
+       generated="$generated $gentop"
+
+       # Add in members from convenience archives.
+       for xlib in $addlibs; do
+         # Extract the objects.
+         case $xlib in
+         [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+         *) xabs=`pwd`"/$xlib" ;;
+         esac
+         xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+         xdir="$gentop/$xlib"
+
+         $show "${rm}r $xdir"
+         $run ${rm}r "$xdir"
+         $show "$mkdir $xdir"
+         $run $mkdir "$xdir"
+         status=$?
+         if test $status -ne 0 && test ! -d "$xdir"; then
+           exit $status
+         fi
+         $show "(cd $xdir && $AR x $xabs)"
+         $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+         oldobjs="$oldobjs "`find $xdir -name \*.${objext} -print | $NL2SP`
+       done
+      fi
+
+      # Do each command in the archive commands.
+      if test -n "$old_archive_from_new_cmds" && test "$build_libtool_libs" = yes; then
+       eval cmds=\"$old_archive_from_new_cmds\"
+      else
+#      # Ensure that we have .o objects in place in case we decided
+#      # not to build a shared library, and have fallen back to building
+#      # static libs even though --disable-static was passed!
+#      for oldobj in $oldobjs; do
+#        if test ! -f $oldobj; then
+#          xdir=`$echo "X$oldobj" | $Xsed -e 's%/[^/]*$%%'`
+#          if test "X$xdir" = "X$oldobj"; then
+#            xdir="."
+#          else
+#            xdir="$xdir"
+#          fi
+#          baseobj=`$echo "X$oldobj" | $Xsed -e 's%^.*/%%'`
+#          obj=`$echo "X$baseobj" | $Xsed -e "$o2lo"`
+#          $show "(cd $xdir && ${LN_S} $obj $baseobj)"
+#          $run eval '(cd $xdir && ${LN_S} $obj $baseobj)' || exit $?
+#        fi
+#      done
+
+        eval cmds=\"$old_archive_cmds\"
+
+        if len=`expr "X$cmds" : ".*"` &&
+             test $len -le $max_cmd_len; then
+          :
+        else
+          # the command line is too long to link in one step, link in parts
+          $echo "using piecewise archive linking..."
+         save_RANLIB=$RANLIB
+         RANLIB=:
+          objlist=
+          concat_cmds=
+          save_oldobjs=$oldobjs
+          for obj in $save_oldobjs
+          do
+            oldobjs="$objlist $obj"
+            objlist="$objlist $obj"
+            eval test_cmds=\"$old_archive_cmds\"
+            if len=`expr "X$test_cmds" : ".*"` &&
+               test $len -le $max_cmd_len; then
+              :
+            else
+              # the above command should be used before it gets too long
+              oldobjs=$objlist
+             test -z "$concat_cmds" || concat_cmds=$concat_cmds~
+              eval concat_cmds=\"\${concat_cmds}$old_archive_cmds\"
+              objlist=
+            fi
+          done
+         RANLIB=$save_RANLIB
+          oldobjs=$objlist
+          eval cmds=\"\$concat_cmds~$old_archive_cmds\"
+        fi
+      fi
+      IFS="${IFS=      }"; save_ifs="$IFS"; IFS='~'
+      for cmd in $cmds; do
+       IFS="$save_ifs"
+       $show "$cmd"
+       $run eval "$cmd" || exit $?
+      done
+      IFS="$save_ifs"
+    done
+
+    if test -n "$generated"; then
+      $show "${rm}r$generated"
+      $run ${rm}r$generated
+    fi
+
+    # Now create the libtool archive.
+    case $output in
+    *.la)
+      old_library=
+      test "$build_old_libs" = yes && old_library="$libname.$libext"
+      $show "creating $output"
+
+      # Preserve any variables that may affect compiler behavior
+      for var in $variables_saved_for_relink; do
+       if eval test -z \"\${$var+set}\"; then
+         relink_command="{ test -z \"\${$var+set}\" || unset $var || { $var=; export $var; }; }; $relink_command"
+       elif eval var_value=\$$var; test -z "$var_value"; then
+         relink_command="$var=; export $var; $relink_command"
+       else
+         var_value=`$echo "X$var_value" | $Xsed -e "$sed_quote_subst"`
+         relink_command="$var=\"$var_value\"; export $var; $relink_command"
+       fi
+      done
+      # Quote the link command for shipping.
+      relink_command="cd `pwd`; $SHELL $0 --mode=relink $libtool_args"
+      relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"`
+
+      # Only create the output if not a dry run.
+      if test -z "$run"; then
+       for installed in no yes; do
+         if test "$installed" = yes; then
+           if test -z "$install_libdir"; then
+             break
+           fi
+           output="$output_objdir/$outputname"i
+           # Replace all uninstalled libtool libraries with the installed ones
+           newdependency_libs=
+           for deplib in $dependency_libs; do
+             case $deplib in
+             *.la)
+               name=`$echo "X$deplib" | $Xsed -e 's%^.*/%%'`
+               eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+               if test -z "$libdir"; then
+                 $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2
+                 exit 1
+               fi
+               newdependency_libs="$newdependency_libs $libdir/$name"
+               ;;
+             *) newdependency_libs="$newdependency_libs $deplib" ;;
+             esac
+           done
+           dependency_libs="$newdependency_libs"
+           newdlfiles=
+           for lib in $dlfiles; do
+             name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+             eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+             if test -z "$libdir"; then
+               $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+               exit 1
+             fi
+             newdlfiles="$newdlfiles $libdir/$name"
+           done
+           dlfiles="$newdlfiles"
+           newdlprefiles=
+           for lib in $dlprefiles; do
+             name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+             eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+             if test -z "$libdir"; then
+               $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+               exit 1
+             fi
+             newdlprefiles="$newdlprefiles $libdir/$name"
+           done
+           dlprefiles="$newdlprefiles"
+         fi
+         $rm $output
+         # place dlname in correct position for cygwin
+         tdlname=$dlname
+         case $host,$output,$installed,$module,$dlname in
+           *cygwin*,*lai,yes,no,*.dll) tdlname=../bin/$dlname ;;
+         esac
+         $echo > $output "\
+# $outputname - a libtool library file
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# The name that we can dlopen(3).
+dlname='$tdlname'
+
+# Names of this library.
+library_names='$library_names'
+
+# The name of the static archive.
+old_library='$old_library'
+
+# Libraries that this one depends upon.
+dependency_libs='$dependency_libs'
+
+# Version information for $libname.
+current=$current
+age=$age
+revision=$revision
+
+# Is this an already installed library?
+installed=$installed
+
+# Files to dlopen/dlpreopen
+dlopen='$dlfiles'
+dlpreopen='$dlprefiles'
+
+# Directory that this library needs to be installed in:
+libdir='$install_libdir'"
+         if test "$installed" = no && test $need_relink = yes; then
+           $echo >> $output "\
+relink_command=\"$relink_command\""
+         fi
+       done
+      fi
+
+      # Do a symbolic link so that the libtool archive can be found in
+      # LD_LIBRARY_PATH before the program is installed.
+      $show "(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)"
+      $run eval '(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)' || exit $?
+      ;;
+    esac
+    exit 0
+    ;;
+
+  # libtool install mode
+  #
+  # This first part of the arm assembles $install_prog, the quoted command
+  # line of the installer, from $nonopt and the leading positional argument.
+  # $echo, $Xsed and $sed_quote_subst are expected to be set earlier in the
+  # script.
+  install)
+    modename="$modename: install"
+
+    # There may be an optional sh(1) argument at the beginning of
+    # install_prog (especially on Windows NT).
+    # When $nonopt is $SHELL, /bin/sh, or contains the word shtool, it is
+    # kept as a prefix of install_prog and the installer name is taken from
+    # the first positional parameter instead.
+    if test "$nonopt" = "$SHELL" || test "$nonopt" = /bin/sh ||
+       # Allow the use of GNU shtool's install command.
+       $echo "X$nonopt" | $Xsed | grep shtool > /dev/null; then
+      # Aesthetically quote it.
+      # The word is first run through sed_quote_subst, then wrapped in double
+      # quotes if it contains any of the characters listed in the pattern.
+      arg=`$echo "X$nonopt" | $Xsed -e "$sed_quote_subst"`
+      case $arg in
+      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \    ]*|*]*)
+       arg="\"$arg\""
+       ;;
+      esac
+      install_prog="$arg "
+      arg="$1"
+      shift
+    else
+      install_prog=
+      arg="$nonopt"
+    fi
+
+    # The real first argument should be the name of the installation program.
+    # Aesthetically quote it.
+    # Same two-step quoting as above; the result is appended to whatever
+    # prefix was stored in install_prog.
+    arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+    case $arg in
+    *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \      ]*|*]*)
+      arg="\"$arg\""
+      ;;
+    esac
+    install_prog="$install_prog$arg"
+
+    # We need to accept at least all the BSD install flags.
+    # Walk the remaining positional parameters.  Option words (and the value
+    # following -f, -g, -m or -o) are quoted and appended to install_prog.
+    # The last non-option word ends up in dest, earlier ones in files.
+    dest=
+    files=
+    opts=
+    prev=
+    install_type=
+    isdir=no
+    stripme=
+    for arg
+    do
+      # Once a candidate destination exists, every further word (whatever it
+      # looks like) demotes the previous candidate to a source file and
+      # becomes the new candidate.
+      if test -n "$dest"; then
+       files="$files $dest"
+       dest="$arg"
+       continue
+      fi
+
+      # prev remembers a flag whose value is expected in the next word.
+      # -s is recorded in stripme and, because of the continue, is not
+      # appended to install_prog here.
+      case $arg in
+      -d) isdir=yes ;;
+      -f) prev="-f" ;;
+      -g) prev="-g" ;;
+      -m) prev="-m" ;;
+      -o) prev="-o" ;;
+      -s)
+       stripme=" -s"
+       continue
+       ;;
+      -*) ;;
+
+      *)
+       # If the previous option needed an argument, then skip it.
+       # (The word then falls through to the quoting below and is passed on
+       # to the installer; otherwise it is the first destination candidate.)
+       if test -n "$prev"; then
+         prev=
+       else
+         dest="$arg"
+         continue
+       fi
+       ;;
+      esac
+
+      # Aesthetically quote the argument.
+      arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+      case $arg in
+      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \    ]*|*]*)
+       arg="\"$arg\""
+       ;;
+      esac
+      install_prog="$install_prog $arg"
+    done
+
+    if test -z "$install_prog"; then
+      $echo "$modename: you must specify an install program" 1>&2
+      $echo "$help" 1>&2
+      exit 1
+    fi
+
+    if test -n "$prev"; then
+      $echo "$modename: the \`$prev' option requires an argument" 1>&2
+      $echo "$help" 1>&2
+      exit 1
+    fi
+
+    if test -z "$files"; then
+      if test -z "$dest"; then
+       $echo "$modename: no file or destination specified" 1>&2
+      else
+       $echo "$modename: you must specify a destination" 1>&2
+      fi
+      $echo "$help" 1>&2
+      exit 1
+    fi
+
+    # Strip any trailing slash from the destination.
+    dest=`$echo "X$dest" | $Xsed -e 's%/$%%'`
+
+    # Check to see that the destination is a directory.
+    # isdir may already be yes because -d was given.
+    test -d "$dest" && isdir=yes
+    if test "$isdir" = yes; then
+      destdir="$dest"
+      destname=
+    else
+      # Split dest into its directory part and its final component; a dest
+      # without any slash is placed in the current directory.
+      destdir=`$echo "X$dest" | $Xsed -e 's%/[^/]*$%%'`
+      test "X$destdir" = "X$dest" && destdir=.
+      destname=`$echo "X$dest" | $Xsed -e 's%^.*/%%'`
+
+      # Not a directory, so check to see that there is only one file specified.
+      # (set puts "dummy" plus the files into the positional parameters, so a
+      # count above 2 means more than one file.)
+      set dummy $files
+      if test $# -gt 2; then
+       $echo "$modename: \`$dest' is not a directory" 1>&2
+       $echo "$help" 1>&2
+       exit 1
+      fi
+    fi
+    # A destination directory that is not absolute (POSIX or drive-letter
+    # form) is accepted only when every file to install is a .lo object.
+    case $destdir in
+    [\\/]* | [A-Za-z]:[\\/]*) ;;
+    *)
+      for file in $files; do
+       case $file in
+       *.lo) ;;
+       *)
+         $echo "$modename: \`$destdir' must be an absolute directory name" 1>&2
+         $echo "$help" 1>&2
+         exit 1
+         ;;
+       esac
+      done
+      ;;
+    esac
+
+    # This variable tells wrapper scripts just to set variables rather
+    # than running their programs.
+    libtool_install_magic="$magic"
+
+    # staticlibs collects archives that are installed after this loop.
+    # current_libdirs / future_libdirs collect the libdir of each .la file,
+    # depending on whether it equals the destination directory.
+    staticlibs=
+    future_libdirs=
+    current_libdirs=
+    for file in $files; do
+
+      # Do each installation.
+      case $file in
+      *.$libext)
+       # Do the static libraries later.
+       staticlibs="$staticlibs $file"
+       ;;
+
+      *.la)
+       # Check to see that this really is a libtool archive.
+       # (Its first two lines must contain a "# Generated by ...$PACKAGE" line.)
+       if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+       else
+         $echo "$modename: \`$file' is not a valid libtool archive" 1>&2
+         $echo "$help" 1>&2
+         exit 1
+       fi
+
+       # Reset the variables read below, then source the archive so that its
+       # assignments (library_names, old_library, relink_command, libdir, ...)
+       # become visible here.
+       library_names=
+       old_library=
+       relink_command=
+       # If there is no directory component, then add one.
+       case $file in
+       */* | *\\*) . $file ;;
+       *) . ./$file ;;
+       esac
+
+       # Add the libdir to current_libdirs if it is the destination.
+       if test "X$destdir" = "X$libdir"; then
+         case "$current_libdirs " in
+         *" $libdir "*) ;;
+         *) current_libdirs="$current_libdirs $libdir" ;;
+         esac
+       else
+         # Note the libdir as a future libdir.
+         case "$future_libdirs " in
+         *" $libdir "*) ;;
+         *) future_libdirs="$future_libdirs $libdir" ;;
+         esac
+       fi
+
+       # dir becomes the object directory ($objdir) next to the .la file.
+       dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`/
+       test "X$dir" = "X$file/" && dir=
+       dir="$dir$objdir"
+
+       # If the archive carries a relink command, run it first.  A failed
+       # relink skips this file but does not abort the whole installation.
+       if test -n "$relink_command"; then
+         $echo "$modename: warning: relinking \`$file'" 1>&2
+         $show "$relink_command"
+         if $run eval "$relink_command"; then :
+         else
+           $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
+           continue
+         fi
+       fi
+
+       # See the names of the shared library.
+       # The first name is the real file; the remaining ones are link names.
+       set dummy $library_names
+       if test -n "$2"; then
+         realname="$2"
+         shift
+         shift
+
+         # When a relink command exists the source file is the real name
+         # with a T appended (presumably the relinked output; confirm
+         # against the link-mode code).
+         srcname="$realname"
+         test -n "$relink_command" && srcname="$realname"T
+
+         # Install the shared library and build the symlinks.
+         $show "$install_prog $dir/$srcname $destdir/$realname"
+         $run eval "$install_prog $dir/$srcname $destdir/$realname" || exit $?
+         if test -n "$stripme" && test -n "$striplib"; then
+           $show "$striplib $destdir/$realname"
+           $run eval "$striplib $destdir/$realname" || exit $?
+         fi
+
+         if test $# -gt 0; then
+           # Delete the old symlinks, and create new ones.
+           # (A failure here is not checked.)
+           for linkname
+           do
+             if test "$linkname" != "$realname"; then
+               $show "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)"
+               $run eval "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)"
+             fi
+           done
+         fi
+
+         # Do each command in the postinstall commands.
+         # The command list is separated by ~; IFS is restored inside the
+         # loop so each command runs with normal word splitting.
+         lib="$destdir/$realname"
+         eval cmds=\"$postinstall_cmds\"
+         IFS="${IFS=   }"; save_ifs="$IFS"; IFS='~'
+         for cmd in $cmds; do
+           IFS="$save_ifs"
+           $show "$cmd"
+           $run eval "$cmd" || exit $?
+         done
+         IFS="$save_ifs"
+       fi
+
+       # Install the pseudo-library for information purposes.
+       # The file copied is the one with an extra i suffix in the object
+       # directory; it is installed under the plain .la name.
+       name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+       instname="$dir/$name"i
+       $show "$install_prog $instname $destdir/$name"
+       $run eval "$install_prog $instname $destdir/$name" || exit $?
+
+       # Maybe install the static library, too.
+       test -n "$old_library" && staticlibs="$staticlibs $dir/$old_library"
+       ;;
+
+      *.lo)
+       # Install (i.e. copy) a libtool object.
+
+       # Figure out destination file name, if it wasn't already specified.
+       if test -n "$destname"; then
+         destfile="$destdir/$destname"
+       else
+         destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+         destfile="$destdir/$destfile"
+       fi
+
+       # Deduce the name of the destination old-style object file.
+       # A .lo destination gets a companion name derived with $lo2o; a
+       # destination that already ends in .$objext receives only the old-style
+       # object (destfile is cleared); anything else is rejected.
+       case $destfile in
+       *.lo)
+         staticdest=`$echo "X$destfile" | $Xsed -e "$lo2o"`
+         ;;
+       *.$objext)
+         staticdest="$destfile"
+         destfile=
+         ;;
+       *)
+         $echo "$modename: cannot copy a libtool object to \`$destfile'" 1>&2
+         $echo "$help" 1>&2
+         exit 1
+         ;;
+       esac
+
+       # Install the libtool object if requested.
+       if test -n "$destfile"; then
+         $show "$install_prog $file $destfile"
+         $run eval "$install_prog $file $destfile" || exit $?
+       fi
+
+       # Install the old object if enabled.
+       if test "$build_old_libs" = yes; then
+         # Deduce the name of the old-style object file.
+         staticobj=`$echo "X$file" | $Xsed -e "$lo2o"`
+
+         $show "$install_prog $staticobj $staticdest"
+         $run eval "$install_prog \$staticobj \$staticdest" || exit $?
+       fi
+       # NOTE(review): this leaves the script after the first .lo file, so
+       # any files listed after it and the deferred static libraries are not
+       # processed.
+       exit 0
+       ;;
+
+      # Anything else is treated as a program (possibly a libtool wrapper
+      # script) or a plain file.
+      *)
+       # Figure out destination file name, if it wasn't already specified.
+       if test -n "$destname"; then
+         destfile="$destdir/$destname"
+       else
+         destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+         destfile="$destdir/$destfile"
+       fi
+
+       # Do a test to see if this is really a libtool program.
+       # (A "# Generated by ...$PACKAGE" line within the first four lines.)
+       if (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+         notinst_deplibs=
+         relink_command=
+
+         # Source the wrapper; libtool_install_magic is set, so it is
+         # expected to only define variables.
+         # If there is no directory component, then add one.
+         case $file in
+         */* | *\\*) . $file ;;
+         *) . ./$file ;;
+         esac
+
+         # Check the variables that should have been set.
+         if test -z "$notinst_deplibs"; then
+           $echo "$modename: invalid libtool wrapper script \`$file'" 1>&2
+           exit 1
+         fi
+
+         # finalize stays yes only if every not-yet-installed dependency
+         # named by the wrapper is now present in its libdir.
+         finalize=yes
+         for lib in $notinst_deplibs; do
+           # Check to see that each library is installed.
+           libdir=
+           if test -f "$lib"; then
+             # If there is no directory component, then add one.
+             case $lib in
+             */* | *\\*) . $lib ;;
+             *) . ./$lib ;;
+             esac
+           fi
+           libfile="$libdir/"`$echo "X$lib" | $Xsed -e 's%^.*/%%g'` ### testsuite: skip nested quoting test
+           if test -n "$libdir" && test ! -f "$libfile"; then
+             $echo "$modename: warning: \`$lib' has not been installed in \`$libdir'" 1>&2
+             finalize=no
+           fi
+         done
+
+         # Source the wrapper a second time to obtain its relink_command
+         # (presumably because the library files sourced above may have
+         # assigned that variable too).
+         relink_command=
+         # If there is no directory component, then add one.
+         case $file in
+         */* | *\\*) . $file ;;
+         *) . ./$file ;;
+         esac
+
+         outputname=
+         if test "$fast_install" = no && test -n "$relink_command"; then
+           if test "$finalize" = yes && test -z "$run"; then
+             # Relink into a private directory below $TMPDIR (or /tmp) named
+             # after this shell's process id, then install from there.
+             tmpdir="/tmp"
+             test -n "$TMPDIR" && tmpdir="$TMPDIR"
+             tmpdir="$tmpdir/libtool-$$"
+             if $mkdir -p "$tmpdir" && chmod 700 "$tmpdir"; then :
+             else
+               $echo "$modename: error: cannot create temporary directory \`$tmpdir'" 1>&2
+               continue
+             fi
+             file=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+             outputname="$tmpdir/$file"
+             # Replace the output file specification.
+             relink_command=`$echo "X$relink_command" | $Xsed -e 's%@OUTPUT@%'"$outputname"'%g'`
+
+             $show "$relink_command"
+             if $run eval "$relink_command"; then :
+             else
+               $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
+               ${rm}r "$tmpdir"
+               continue
+             fi
+             file="$outputname"
+           else
+             $echo "$modename: warning: cannot relink \`$file'" 1>&2
+           fi
+         else
+           # Install the binary that we compiled earlier.
+           # (Insert $objdir in front of the last path component.)
+           file=`$echo "X$file" | $Xsed -e "s%\([^/]*\)$%$objdir/\1%"`
+         fi
+       fi
+
+
+       # Make the .exe suffix of the destination agree with the source when
+       # installing with /usr/bin/install on cygwin, since that program will
+       # append another one anyways.
+       case $install_prog,$host in
+       */usr/bin/install*,*cygwin*)
+         case $file:$destfile in
+         *.exe:*.exe)
+           # this is ok
+           ;;
+         *.exe:*)
+           destfile=$destfile.exe
+           ;;
+         *:*.exe)
+           destfile=`echo $destfile | sed -e 's,.exe$,,'`
+           ;;
+         esac
+         ;;
+       esac
+
+       # Install the file, passing -s on if it was requested, and remove the
+       # temporary relink directory when one was used.
+       $show "$install_prog$stripme $file $destfile"
+       $run eval "$install_prog\$stripme \$file \$destfile" || exit $?
+       test -n "$outputname" && ${rm}r "$tmpdir"
+       ;;
+      esac
+    done
+
+    for file in $staticlibs; do
+      name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+
+      # Set up the ranlib parameters.
+      oldlib="$destdir/$name"
+
+      $show "$install_prog $file $oldlib"
+      $run eval "$install_prog \$file \$oldlib" || exit $?
+
+      if test -n "$stripme" && test -n "$striplib"; then
+       $show "$old_striplib $oldlib"
+       $run eval "$old_striplib $oldlib" || exit $?
+      fi
+
+      # Do each command in the postinstall commands.
+      eval cmds=\"$old_postinstall_cmds\"
+      IFS="${IFS=      }"; save_ifs="$IFS"; IFS='~'
+      for cmd in $cmds; do
+       IFS="$save_ifs"
+       $show "$cmd"
+       $run eval "$cmd" || exit $?
+      done
+      IFS="$save_ifs"
+    done
+
+    # Libraries whose libdir differs from the directory they were just
+    # installed into still need a finish run there later; remind the user.
+    if test -n "$future_libdirs"; then
+      $echo "$modename: warning: remember to run \`$progname --finish$future_libdirs'" 1>&2
+    fi
+
+    # For libdirs that were the actual destination, arrange for a finish run
+    # now.  exec_cmd is single-quoted, so its variables are expanded only
+    # when it is evaluated (presumably after this case statement; that code
+    # is outside this section).
+    if test -n "$current_libdirs"; then
+      # Maybe just do a dry run.
+      test -n "$run" && current_libdirs=" -n$current_libdirs"
+      exec_cmd='$SHELL $0 --finish$current_libdirs'
+    else
+      exit 0
+    fi
+    ;;
+
+  # libtool finish mode
+  #
+  # Runs the configured finish commands for every library directory and
+  # remembers the ones that failed in admincmds.
+  finish)
+    modename="$modename: finish"
+    libdirs="$nonopt"
+    admincmds=
+
+    # The remaining positional parameters are added to libdirs only when
+    # there is something to run and at least one directory was given.
+    if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
+      for dir
+      do
+       libdirs="$libdirs $dir"
+      done
+
+      for libdir in $libdirs; do
+       if test -n "$finish_cmds"; then
+         # Do each command in the finish commands.
+         # A command that fails is not fatal: it is appended to admincmds
+         # on a line of its own.
+         eval cmds=\"$finish_cmds\"
+         IFS="${IFS=   }"; save_ifs="$IFS"; IFS='~'
+         for cmd in $cmds; do
+           IFS="$save_ifs"
+           $show "$cmd"
+           $run eval "$cmd" || admincmds="$admincmds
+       $cmd"
+         done
+         IFS="$save_ifs"
+       fi
+       if test -n "$finish_eval"; then
+         # Do the single finish_eval.
+         # Unlike finish_cmds it is not split on ~ and is not shown first.
+         eval cmds=\"$finish_eval\"
+         $run eval "$cmds" || admincmds="$admincmds
+       $cmds"
+       fi
+      done
+    fi
+
+    # Exit here if they wanted silent mode.
+    # (Silent mode is recognised by show being the no-op command ":".)
+    test "$show" = ":" && exit 0
+
+    # Print the list of library directories followed by advice on how to
+    # link against and run with libraries installed there.  Each hint is
+    # printed only when the corresponding configuration variable is set.
+    echo "----------------------------------------------------------------------"
+    echo "Libraries have been installed in:"
+    for libdir in $libdirs; do
+      echo "   $libdir"
+    done
+    echo
+    echo "If you ever happen to want to link against installed libraries"
+    echo "in a given directory, LIBDIR, you must either use libtool, and"
+    echo "specify the full pathname of the library, or use the \`-LLIBDIR'"
+    echo "flag during linking and do at least one of the following:"
+    if test -n "$shlibpath_var"; then
+      echo "   - add LIBDIR to the \`$shlibpath_var' environment variable"
+      echo "     during execution"
+    fi
+    if test -n "$runpath_var"; then
+      echo "   - add LIBDIR to the \`$runpath_var' environment variable"
+      echo "     during linking"
+    fi
+    if test -n "$hardcode_libdir_flag_spec"; then
+      # Expand the flag template with the literal word LIBDIR as libdir so
+      # the message shows the flag in generic form.
+      libdir=LIBDIR
+      eval flag=\"$hardcode_libdir_flag_spec\"
+
+      echo "   - use the \`$flag' linker flag"
+    fi
+    # admincmds holds the finish commands that failed above, one per line.
+    if test -n "$admincmds"; then
+      echo "   - have your system administrator run these commands:$admincmds"
+    fi
+    if test -f /etc/ld.so.conf; then
+      echo "   - have your system administrator add LIBDIR to \`/etc/ld.so.conf'"
+    fi
+    echo
+    echo "See any operating system documentation about shared libraries for"
+    echo "more information, such as the ld(1) and ld.so(8) manual pages."
+    echo "----------------------------------------------------------------------"
+    exit 0
+    ;;
+
+  # libtool execute mode
+  #
+  # The command to run is taken from $nonopt; without one the mode fails
+  # with status 1.
+  execute)
+    modename="$modename: execute"
+
+    # The first argument is the command name.
+    cmd="$nonopt"
+    if test -z "$cmd"; then
+      $echo "$modename: you must specify a COMMAND" 1>&2
+      # The usage text belongs to the error and goes to stderr with it,
+      # as in every other failure path of this script section.
+      $echo "$help" 1>&2
+      exit 1
+    fi
+
+    # Handle -dlopen flags immediately.
+    for file in $execute_dlfiles; do
+      if test ! -f "$file"; then
+       $echo "$modename: \`$file' is not a file" 1>&2
+       $echo "$help" 1>&2
+       exit 1
+      fi
+
+      dir=
+      case $file in
+      *.la)
+       # Check to see that this really is a libtool archive.
+       if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+       else
+         $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+         $echo "$help" 1>&2
+         exit 1
+       fi
+
+       # Read the libtool library.
+       dlname=
+       library_names=
+
+       # If there is no directory component, then add one.
+       case $file in
+       */* | *\\*) . $file ;;
+       *) . ./$file ;;
+       esac
+
+       # Skip this library if it cannot be dlopened.
+       if test -z "$dlname"; then
+         # Warn if it was a shared library.
+         test -n "$library_names" && $echo "$modename: warning: \`$file' was not linked with \`-export-dynamic'"
+         continue
+       fi
+
+       dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+       test "X$dir" = "X$file" && dir=.
+
+       if test -f "$dir/$objdir/$dlname"; then
+         dir="$dir/$objdir"
+       else
+         $echo "$modename: cannot find \`$dlname' in \`$dir' or \`$dir/$objdir'" 1>&2
+         exit 1
+       fi
+       ;;
+
+      *.lo)
+       # Just add the directory containing the .lo file.
+       dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+       test "X$dir" = "X$file" && dir=.
+       ;;
+
+      *)
+       $echo "$modename: warning \`-dlopen' is ignored for non-libtool libraries and objects" 1>&2
+       continue
+       ;;
+      esac
+
+      # Get the absolute pathname.
+      absdir=`cd "$dir" && pwd`
+      test -n "$absdir" && dir="$absdir"
+
+      # Now add the directory to shlibpath_var.
+      if eval "test -z \"\$$shlibpath_var\""; then
+       eval "$shlibpath_var=\"\$dir\""
+      else
+       eval "$shlibpath_var=\"\$dir:\$$shlibpath_var\""
+      fi
+    done
+
+    # This variable tells wrapper scripts just to set shlibpath_var
+    # rather than running their programs.
+    libtool_execute_magic="$magic"
+
+    # Check if any of the arguments is a wrapper script.
+    # Every positional parameter is quoted and appended to args; those that
+    # are libtool wrapper scripts are replaced by the real program first.
+    args=
+    for file
+    do
+      case $file in
+      -*) ;;
+      *)
+       # Do a test to see if this is really a libtool program.
+       if (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+         # Source the wrapper so that it defines progdir and program.
+         # If there is no directory component, then add one.
+         case $file in
+         */* | *\\*) . $file ;;
+         *) . ./$file ;;
+         esac
+
+         # Transform arg to wrapped name.
+         file="$progdir/$program"
+       fi
+       ;;
+      esac
+      # Quote arguments (to preserve shell metacharacters).
+      file=`$echo "X$file" | $Xsed -e "$sed_quote_subst"`
+      args="$args \"$file\""
+    done
+
+    # An empty run means this is not a dry run: prepare the environment and
+    # the command to execute.  Otherwise only print what would be done.
+    if test -z "$run"; then
+      if test -n "$shlibpath_var"; then
+       # Export the shlibpath_var.
+       eval "export $shlibpath_var"
+      fi
+
+      # Restore saved environment variables
+      # (only those whose save_ copy is set, even if set to empty).
+      if test "${save_LC_ALL+set}" = set; then
+       LC_ALL="$save_LC_ALL"; export LC_ALL
+      fi
+      if test "${save_LANG+set}" = set; then
+       LANG="$save_LANG"; export LANG
+      fi
+
+      # Now prepare to actually exec the command.
+      # exec_cmd is single-quoted, so cmd and args are expanded only when it
+      # is evaluated (presumably after this case statement; that code is
+      # outside this section).
+      exec_cmd='"$cmd"$args'
+    else
+      # Display what would be done.
+      if test -n "$shlibpath_var"; then
+       eval "\$echo \"\$shlibpath_var=\$$shlibpath_var\""
+       $echo "export $shlibpath_var"
+      fi
+      $echo "$cmd$args"
+      exit 0
+    fi
+    ;;
+
+  # libtool clean and uninstall mode
+  clean | uninstall)
+    modename="$modename: $mode"
+    rm="$nonopt"
+    files=
+    rmforce=
+    exit_status=0
+
+    # This variable tells wrapper scripts just to set variables rather
+    # than running their programs.
+    libtool_install_magic="$magic"
+
+    for arg
+    do
+      case $arg in
+      -f) rm="$rm $arg"; rmforce=yes ;;
+      -*) rm="$rm $arg" ;;
+      *) files="$files $arg" ;;
+      esac
+    done
+
+    if test -z "$rm"; then
+      $echo "$modename: you must specify an RM program" 1>&2
+      $echo "$help" 1>&2
+      exit 1
+    fi
+
+    rmdirs=
+    origobjdir="$objdir" # pristine value: objdir is recomputed from this for every FILE below
+
+    for file in $files; do
+      dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+      if test "X$dir" = "X$file"; then
+       dir=.
+       objdir="$origobjdir"
+      else
+       objdir="$dir/$origobjdir"
+      fi
+      name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+      test $mode = uninstall && objdir="$dir"
+
+      # Remember objdir for removal later, being careful to avoid duplicates
+      if test $mode = clean; then
+       case " $rmdirs " in
+         *" $objdir "*) ;;
+         *) rmdirs="$rmdirs $objdir" ;;
+       esac
+      fi
+
+      # Don't error if the file doesn't exist and rm -f was used.
+      if (test -L "$file") >/dev/null 2>&1 \
+        || (test -h "$file") >/dev/null 2>&1 \
+       || test -f "$file"; then
+        :
+      elif test -d "$file"; then
+        exit_status=1
+       continue
+      elif test "$rmforce" = yes; then
+        continue
+      fi
+
+      rmfiles="$file"
+
+      case $name in
+      *.la)
+       # Possibly a libtool archive, so verify it.
+       if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+         . $dir/$name
+
+         # Delete the libtool libraries and symlinks.
+         for n in $library_names; do
+           rmfiles="$rmfiles $objdir/$n"
+         done
+         test -n "$old_library" && rmfiles="$rmfiles $objdir/$old_library"
+         test $mode = clean && rmfiles="$rmfiles $objdir/$name $objdir/${name}i"
+
+         if test $mode = uninstall; then
+           if test -n "$library_names"; then
+             # Do each command in the postuninstall commands.
+             eval cmds=\"$postuninstall_cmds\"
+             IFS="${IFS=       }"; save_ifs="$IFS"; IFS='~'
+             for cmd in $cmds; do
+               IFS="$save_ifs"
+               $show "$cmd"
+               $run eval "$cmd"
+               if test $? != 0 && test "$rmforce" != yes; then
+                 exit_status=1
+               fi
+             done
+             IFS="$save_ifs"
+           fi
+
+           if test -n "$old_library"; then
+             # Do each command in the old_postuninstall commands.
+             eval cmds=\"$old_postuninstall_cmds\"
+             IFS="${IFS=       }"; save_ifs="$IFS"; IFS='~'
+             for cmd in $cmds; do
+               IFS="$save_ifs"
+               $show "$cmd"
+               $run eval "$cmd"
+               if test $? != 0 && test "$rmforce" != yes; then
+                 exit_status=1
+               fi
+             done
+             IFS="$save_ifs"
+           fi
+           # FIXME: should reinstall the best remaining shared library.
+         fi
+       fi
+       ;;
+
+      *.lo)
+       # Possibly a libtool object, so verify it.
+       if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+
+          # Read the .lo file
+          . $dir/$name
+
+         # Add PIC object to the list of files to remove.
+          if test -n "$pic_object" \
+             && test "$pic_object" != none; then
+           rmfiles="$rmfiles $dir/$pic_object"
+          fi
+
+         # Add non-PIC object to the list of files to remove.
+          if test -n "$non_pic_object" \
+             && test "$non_pic_object" != none; then
+           rmfiles="$rmfiles $dir/$non_pic_object"
+          fi
+       fi
+       ;;
+
+      *)
+       # Do a test to see if this is a libtool program.
+       if test $mode = clean &&
+          (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+         relink_command=
+         . $dir/$name # not $dir/$file: $file already carries its directory part
+
+         rmfiles="$rmfiles $objdir/$name $objdir/${name}S.${objext}"
+         if test "$fast_install" = yes && test -n "$relink_command"; then
+           rmfiles="$rmfiles $objdir/lt-$name"
+         fi
+       fi
+       ;;
+      esac
+      $show "$rm $rmfiles"
+      $run $rm $rmfiles || exit_status=1
+    done
+
+    # Try to remove the ${objdir}s in the directories where we deleted files
+    for dir in $rmdirs; do
+      if test -d "$dir"; then
+       $show "rmdir $dir"
+       $run rmdir $dir >/dev/null 2>&1
+      fi
+    done
+
+    exit $exit_status
+    ;;
+
+  "")
+    $echo "$modename: you must specify a MODE" 1>&2
+    $echo "$generic_help" 1>&2
+    exit 1
+    ;;
+  esac
+
+  if test -z "$exec_cmd"; then
+    $echo "$modename: invalid operation mode \`$mode'" 1>&2
+    $echo "$generic_help" 1>&2
+    exit 1
+  fi
+fi # test -z "$show_help"
+
+if test -n "$exec_cmd"; then
+  eval exec $exec_cmd
+  exit 1
+fi
+
+# We need to display help for each of the modes.
+case $mode in
+"") $echo \
+"Usage: $modename [OPTION]... [MODE-ARG]...
+
+Provide generalized library-building support services.
+
+    --config          show all configuration variables
+    --debug           enable verbose shell tracing
+-n, --dry-run         display commands without modifying any files
+    --features        display basic configuration information and exit
+    --finish          same as \`--mode=finish'
+    --help            display this help message and exit
+    --mode=MODE       use operation mode MODE [default=inferred from MODE-ARGS]
+    --quiet           same as \`--silent'
+    --silent          don't print informational messages
+    --tag=TAG         use configuration variables from tag TAG
+    --version         print version information
+
+MODE must be one of the following:
+
+      clean           remove files from the build directory
+      compile         compile a source file into a libtool object
+      execute         automatically set library path, then run a program
+      finish          complete the installation of libtool libraries
+      install         install libraries or executables
+      link            create a library or an executable
+      uninstall       remove libraries from an installed directory
+
+MODE-ARGS vary depending on the MODE.  Try \`$modename --help --mode=MODE' for
+a more detailed description of MODE."
+  exit 0
+  ;;
+
+clean)
+  $echo \
+"Usage: $modename [OPTION]... --mode=clean RM [RM-OPTION]... FILE...
+
+Remove files from the build directory.
+
+RM is the name of the program to use to delete files associated with each FILE
+(typically \`/bin/rm').  RM-OPTIONS are options (such as \`-f') to be passed
+to RM.
+
+If FILE is a libtool library, object or program, all the files associated
+with it are deleted. Otherwise, only FILE itself is deleted using RM."
+  ;;
+
+compile) # help text for --mode=compile
+  $echo \
+"Usage: $modename [OPTION]... --mode=compile COMPILE-COMMAND... SOURCEFILE
+
+Compile a source file into a libtool library object.
+
+This mode accepts the following additional options:
+
+  -o OUTPUT-FILE    set the output file name to OUTPUT-FILE
+  -prefer-pic       try to build PIC objects only
+  -prefer-non-pic   try to build non-PIC objects only
+  -static           always build a \`.o' file suitable for static linking
+
+COMPILE-COMMAND is a command to be used in creating a \`standard' object file
+from the given SOURCEFILE.
+
+The output file name is determined by removing the directory component from
+SOURCEFILE, then substituting the C source code suffix \`.c' with the
+library object suffix, \`.lo'."
+  ;;
+
+execute)
+  $echo \
+"Usage: $modename [OPTION]... --mode=execute COMMAND [ARGS]...
+
+Automatically set library path, then run a program.
+
+This mode accepts the following additional options:
+
+  -dlopen FILE      add the directory containing FILE to the library path
+
+This mode sets the library path environment variable according to \`-dlopen'
+flags.
+
+If any of the ARGS are libtool executable wrappers, then they are translated
+into their corresponding uninstalled binary, and any of their required library
+directories are added to the library path.
+
+Then, COMMAND is executed, with ARGS as arguments."
+  ;;
+
+finish)
+  $echo \
+"Usage: $modename [OPTION]... --mode=finish [LIBDIR]...
+
+Complete the installation of libtool libraries.
+
+Each LIBDIR is a directory that contains libtool libraries.
+
+The commands that this mode executes may require superuser privileges.  Use
+the \`--dry-run' option if you just want to see what would be executed."
+  ;;
+
+install)
+  $echo \
+"Usage: $modename [OPTION]... --mode=install INSTALL-COMMAND...
+
+Install executables or libraries.
+
+INSTALL-COMMAND is the installation command.  The first component should be
+either the \`install' or \`cp' program.
+
+The rest of the components are interpreted as arguments to that command (only
+BSD-compatible install options are recognized)."
+  ;;
+
+link) # help text for --mode=link
+  $echo \
+"Usage: $modename [OPTION]... --mode=link LINK-COMMAND...
+
+Link object files or libraries together to form another library, or to
+create an executable program.
+
+LINK-COMMAND is a command using the C compiler that you would use to create
+a program from several object files.
+
+The following components of LINK-COMMAND are treated specially:
+
+  -all-static       do not do any dynamic linking at all
+  -avoid-version    do not add a version suffix if possible
+  -dlopen FILE      \`-dlpreopen' FILE if it cannot be dlopened at runtime
+  -dlpreopen FILE   link in FILE and add its symbols to lt_preloaded_symbols
+  -export-dynamic   allow symbols from OUTPUT-FILE to be resolved with dlsym(3)
+  -export-symbols SYMFILE
+                   try to export only the symbols listed in SYMFILE
+  -export-symbols-regex REGEX
+                   try to export only the symbols matching REGEX
+  -LLIBDIR          search LIBDIR for required installed libraries
+  -lNAME            OUTPUT-FILE requires the installed library libNAME
+  -module           build a library that can be dlopened
+  -no-fast-install  disable the fast-install mode
+  -no-install       link a not-installable executable
+  -no-undefined     declare that a library does not refer to external symbols
+  -o OUTPUT-FILE    create OUTPUT-FILE from the specified objects
+  -objectlist FILE  Use a list of object files found in FILE to specify objects
+  -release RELEASE  specify package release information
+  -rpath LIBDIR     the created library will eventually be installed in LIBDIR
+  -R[ ]LIBDIR       add LIBDIR to the runtime path of programs and libraries
+  -static           do not do any dynamic linking of libtool libraries
+  -version-info CURRENT[:REVISION[:AGE]]
+                   specify library version info [each variable defaults to 0]
+
+All other options (arguments beginning with \`-') are ignored.
+
+Every other argument is treated as a filename.  Files ending in \`.la' are
+treated as uninstalled libtool libraries, other files are standard or library
+object files.
+
+If the OUTPUT-FILE ends in \`.la', then a libtool library is created,
+only library objects (\`.lo' files) may be specified, and \`-rpath' is
+required, except when creating a convenience library.
+
+If OUTPUT-FILE ends in \`.a' or \`.lib', then a standard library is created
+using \`ar' and \`ranlib', or on Windows using \`lib'.
+
+If OUTPUT-FILE ends in \`.lo' or \`.${objext}', then a reloadable object file
+is created, otherwise an executable program is created."
+  ;;
+
+uninstall)
+  $echo \
+"Usage: $modename [OPTION]... --mode=uninstall RM [RM-OPTION]... FILE...
+
+Remove libraries from an installation directory.
+
+RM is the name of the program to use to delete files associated with each FILE
+(typically \`/bin/rm').  RM-OPTIONS are options (such as \`-f') to be passed
+to RM.
+
+If FILE is a libtool library, all the files associated with it are deleted.
+Otherwise, only FILE itself is deleted using RM."
+  ;;
+
+*)
+  $echo "$modename: invalid operation mode \`$mode'" 1>&2
+  $echo "$help" 1>&2
+  exit 1
+  ;;
+esac
+
+echo
+$echo "Try \`$modename --help' for more information about other modes."
+
+exit 0
+
+# The TAGs below are defined such that we never get into a situation
+# in which we disable both kinds of libraries.  Given conflicting
+# choices, we go for a static library, that is the most portable,
+# since we can't tell whether shared libraries were disabled because
+# the user asked for that or because the platform doesn't support
+# them.  This is particularly important on AIX, because we don't
+# support having both static and shared libraries enabled at the same
+# time on that platform, so we default to a shared-only configuration.
+# If a disable-shared tag is given, we'll fallback to a static-only
+# configuration.  But we'll never go from static-only to shared-only.
+
+### BEGIN LIBTOOL TAG CONFIG: disable-shared
+build_libtool_libs=no
+build_old_libs=yes
+### END LIBTOOL TAG CONFIG: disable-shared
+
+### BEGIN LIBTOOL TAG CONFIG: disable-static
+build_old_libs=`case $build_libtool_libs in yes) echo no;; *) echo yes;; esac`
+### END LIBTOOL TAG CONFIG: disable-static
+
+# Local Variables:
+# mode:shell-script
+# sh-indentation:2
+# End:
diff --git a/configure b/configure

index cb3ed5cbd9a721c76dc3498f84690fb92116739b..7e1a721de39b897f1dac01b7af9114cb5d7f2efc 100755 (executable)
--- a/configure
+++ b/configure
@@ -11,18 +11,15 @@
  ac_help=
  ac_default_prefix=/usr/local
  # Any additions from configure.in:
-ac_help="$ac_help
-  --disable-dependency-tracking Speeds up one-time builds
-  --enable-dependency-tracking  Do not reject slow dependency extractors"
  ac_default_prefix=/usr/local/gromacs
  ac_help="$ac_help
-  --enable-mpi                  Compile parallel version of Gromacs"
+  --enable-mpi                  Compile parallel version of GROMACS"
  ac_help="$ac_help
    --enable-vector               Compile for a vector machine"
  ac_help="$ac_help
-  --enable-fortran              Dortran loops (default on sgi,ibm,sun,tru64/dec)"
+  --enable-fortran              Fortran loops (default on sgi,ibm,sun,axp)"
  ac_help="$ac_help
-  --enable-double               Compile double precision Gromacs"
+  --enable-float                Compile single precision GROMACS"
  ac_help="$ac_help
    --disable-type-suffix         Don't add a suffix to double precision files"
  ac_help="$ac_help
@@ -34,9 +31,7 @@ ac_help="$ac_help
  ac_help="$ac_help
    --disable-nice                Disable the nice priority on mdrun"
  ac_help="$ac_help
-  --disable-sse                 Disable SSE assembly loops on x86"
-ac_help="$ac_help
-  --disable-3dnow               Disable 3DNow assembly loops on x86"
+  --disable-x86-asm             Disable assembly loops on x86"
  ac_help="$ac_help
    --disable-xdr                 Disable portable trajectory routines"
  ac_help="$ac_help
@@ -60,12 +55,24 @@ ac_help="$ac_help
    --enable-hide-table-latency   Try to get table data to cache before using it"
  ac_help="$ac_help
    --with-mpi-environment=VAR    Only start MPI mdrun when VAR is set"
+ac_help="$ac_help
+  --enable-shared[=PKGS]        build shared libraries [default=yes]"
+ac_help="$ac_help
+  --enable-static[=PKGS]        build static libraries [default=yes]"
+ac_help="$ac_help
+  --enable-fast-install[=PKGS]  optimize for fast installation [default=yes]"
+ac_help="$ac_help
+  --with-gnu-ld                 assume the C compiler uses GNU ld [default=no]"
+ac_help="$ac_help
+  --disable-libtool-lock        avoid locking (might break parallel builds)"
+ac_help="$ac_help
+  --with-pic                    try to use only PIC/non-PIC [default=both]"
  ac_help="$ac_help
    --with-x                use the X Window System"
  ac_help="$ac_help
-  --with-motif-includes=DIR    Motif include files are in DIR"
+  --with-motif-includes=DIR     Motif include files are in DIR"
  ac_help="$ac_help
-  --with-motif-libraries=DIR   Motif libraries are in DIR"
+  --with-motif-libraries=DIR    Motif libraries are in DIR"
  
  # Initialize some variables set by options.
  # The variables have the same names as the options, with
@@ -575,10 +582,72 @@ else
    ac_n= ac_c='\c' ac_t=
  fi
  
+echo $ac_n "checking for Cygwin environment""... $ac_c" 1>&6
+echo "configure:587: checking for Cygwin environment" >&5
+if eval "test \"`echo '$''{'ac_cv_cygwin'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 592 "configure"
+#include "confdefs.h"
+
+int main() {
+
+#ifndef __CYGWIN__
+#define __CYGWIN__ __CYGWIN32__
+#endif
+return __CYGWIN__;
+; return 0; }
+EOF
+if { (eval echo configure:603: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+  rm -rf conftest*
+  ac_cv_cygwin=yes
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  ac_cv_cygwin=no
+fi
+rm -f conftest*
+rm -f conftest*
+fi
+
+echo "$ac_t""$ac_cv_cygwin" 1>&6
+CYGWIN=
+test "$ac_cv_cygwin" = yes && CYGWIN=yes
+echo $ac_n "checking for mingw32 environment""... $ac_c" 1>&6
+echo "configure:620: checking for mingw32 environment" >&5
+if eval "test \"`echo '$''{'ac_cv_mingw32'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 625 "configure"
+#include "confdefs.h"
+
+int main() {
+return __MINGW32__;
+; return 0; }
+EOF
+if { (eval echo configure:632: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+  rm -rf conftest*
+  ac_cv_mingw32=yes
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  ac_cv_mingw32=no
+fi
+rm -f conftest*
+rm -f conftest*
+fi
+
+echo "$ac_t""$ac_cv_mingw32" 1>&6
+MINGW32=
+test "$ac_cv_mingw32" = yes && MINGW32=yes
  
  
  ac_aux_dir=
-for ac_dir in ./config $srcdir/./config; do
+for ac_dir in config $srcdir/config; do
    if test -f $ac_dir/install-sh; then
      ac_aux_dir=$ac_dir
      ac_install_sh="$ac_aux_dir/install-sh -c"
@@ -590,7 +659,7 @@ for ac_dir in ./config $srcdir/./config; do
    fi
  done
  if test -z "$ac_aux_dir"; then
-  { echo "configure: error: can not find install-sh or install.sh in ./config $srcdir/./config" 1>&2; exit 1; }
+  { echo "configure: error: can not find install-sh or install.sh in config $srcdir/config" 1>&2; exit 1; }
  fi
  ac_config_guess=$ac_aux_dir/config.guess
  ac_config_sub=$ac_aux_dir/config.sub
@@ -608,7 +677,7 @@ ac_configure=$ac_aux_dir/configure # This should be Cygnus configure.
  # SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
  # ./install, which can be erroneously created by make from ./install.sh.
  echo $ac_n "checking for a BSD compatible install""... $ac_c" 1>&6
-echo "configure:612: checking for a BSD compatible install" >&5
+echo "configure:681: checking for a BSD compatible install" >&5
  if test -z "$INSTALL"; then
  if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -661,23 +730,23 @@ test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL_PROGRAM}'
  test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
  
  echo $ac_n "checking whether build environment is sane""... $ac_c" 1>&6
-echo "configure:665: checking whether build environment is sane" >&5
+echo "configure:734: checking whether build environment is sane" >&5
  # Just in case
  sleep 1
-echo timestamp > conftest.file
+echo timestamp > conftestfile
  # Do `set' in a subshell so we don't clobber the current shell's
  # arguments.  Must try -L first in case configure is actually a
  # symlink; some systems play weird games with the mod time of symlinks
  # (eg FreeBSD returns the mod time of the symlink's containing
  # directory).
  if (
-   set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null`
+   set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null`
     if test "$*" = "X"; then
        # -L didn't work.
-      set X `ls -t $srcdir/configure conftest.file`
+      set X `ls -t $srcdir/configure conftestfile`
     fi
-   if test "$*" != "X $srcdir/configure conftest.file" \
-      && test "$*" != "X conftest.file $srcdir/configure"; then
+   if test "$*" != "X $srcdir/configure conftestfile" \
+      && test "$*" != "X conftestfile $srcdir/configure"; then
  
        # If neither matched, then we have a broken ls.  This can happen
        # if, for instance, CONFIG_SHELL is bash and it inherits a
@@ -687,7 +756,7 @@ if (
  alias in your environment" 1>&2; exit 1; }
     fi
  
-   test "$2" = conftest.file
+   test "$2" = conftestfile
     )
  then
     # Ok.
@@ -717,53 +786,8 @@ test "$program_suffix" != NONE &&
  # sed with no file args requires a program.
  test "$program_transform_name" = "" && program_transform_name="s,x,x,"
  
-test x"${MISSING+set}" = xset ||
-  MISSING="\${SHELL} `CDPATH=:; cd $ac_aux_dir && pwd`/missing"
-# Use eval to expand $SHELL
-if eval "$MISSING --run :"; then
-  am_missing_run="$MISSING --run "
-else
-  am_missing_run=
-  am_backtick='`'
-  echo "configure: warning: ${am_backtick}missing' script is too old or missing" 1>&2
-fi
-
-for ac_prog in mawk gawk nawk awk
-do
-# Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:737: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_AWK'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  if test -n "$AWK"; then
-  ac_cv_prog_AWK="$AWK" # Let the user override the test.
-else
-  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS=":"
-  ac_dummy="$PATH"
-  for ac_dir in $ac_dummy; do
-    test -z "$ac_dir" && ac_dir=.
-    if test -f $ac_dir/$ac_word; then
-      ac_cv_prog_AWK="$ac_prog"
-      break
-    fi
-  done
-  IFS="$ac_save_ifs"
-fi
-fi
-AWK="$ac_cv_prog_AWK"
-if test -n "$AWK"; then
-  echo "$ac_t""$AWK" 1>&6
-else
-  echo "$ac_t""no" 1>&6
-fi
-
-test -n "$AWK" && break
-done
-
  echo $ac_n "checking whether ${MAKE-make} sets \${MAKE}""... $ac_c" 1>&6
-echo "configure:767: checking whether ${MAKE-make} sets \${MAKE}" >&5
+echo "configure:791: checking whether ${MAKE-make} sets \${MAKE}" >&5
  set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_prog_make_${ac_make}_set'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -789,51 +813,14 @@ else
    SET_MAKE="MAKE=${MAKE-make}"
  fi
  
-# Check whether --enable-dependency-tracking or --disable-dependency-tracking was given.
-if test "${enable_dependency_tracking+set}" = set; then
-  enableval="$enable_dependency_tracking"
-  :
-fi
-
-if test "x$enable_dependency_tracking" = xno; then
-  AMDEP="#"
-else
-  am_depcomp="$ac_aux_dir/depcomp"
-  if test ! -f "$am_depcomp"; then
-    AMDEP="#"
-  else
-    AMDEP=
-  fi
-fi
-
-if test -z "$AMDEP"; then
-  AMDEPBACKSLASH='\'
-else
-  AMDEPBACKSLASH=
-fi
-
  
+PACKAGE=gromacs
  
+VERSION=3.0
  
-if test -d .deps || mkdir .deps 2> /dev/null || test -d .deps; then
-  DEPDIR=.deps
-  # We redirect because .deps might already exist and be populated.
-  # In this situation we don't want to see an error.
-  rmdir .deps > /dev/null 2>&1
-else
-  DEPDIR=_deps
-fi
-
-
-# test to see if srcdir already configured
-if test "`CDPATH=:; cd $srcdir && pwd`" != "`pwd`" &&
-   test -f $srcdir/config.status; then
-  { echo "configure: error: source directory already configured; run \"make distclean\" there first" 1>&2; exit 1; }
+if test "`cd $srcdir && pwd`" != "`pwd`" && test -f $srcdir/config.status; then # building outside a srcdir that was itself configured
+  { echo "configure: error: source directory already configured; run \"make distclean\" there first" 1>&2; exit 1; }
  fi
-
-# Define the identity of the package.
-PACKAGE=gromacs
-VERSION=3.0
  cat >> confdefs.h <<EOF
  #define PACKAGE "$PACKAGE"
  EOF
@@ -843,54 +830,78 @@ cat >> confdefs.h <<EOF
  EOF
  
  
-# Autoconf 2.50 wants to disallow AM_ names.  We explicitly allow
-# the ones we care about.
-
-
-
  
+missing_dir=`cd $ac_aux_dir && pwd`
+echo $ac_n "checking for working aclocal""... $ac_c" 1>&6
+echo "configure:837: checking for working aclocal" >&5
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf.  Sigh.
+if (aclocal --version) < /dev/null > /dev/null 2>&1; then
+   ACLOCAL=aclocal
+   echo "$ac_t""found" 1>&6
+else
+   ACLOCAL="$missing_dir/missing aclocal"
+   echo "$ac_t""missing" 1>&6
+fi
  
+echo $ac_n "checking for working autoconf""... $ac_c" 1>&6
+echo "configure:850: checking for working autoconf" >&5
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf.  Sigh.
+if (autoconf --version) < /dev/null > /dev/null 2>&1; then
+   AUTOCONF=autoconf
+   echo "$ac_t""found" 1>&6
+else
+   AUTOCONF="$missing_dir/missing autoconf"
+   echo "$ac_t""missing" 1>&6
+fi
  
+echo $ac_n "checking for working automake""... $ac_c" 1>&6
+echo "configure:863: checking for working automake" >&5
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf.  Sigh.
+if (automake --version) < /dev/null > /dev/null 2>&1; then
+   AUTOMAKE=automake
+   echo "$ac_t""found" 1>&6
+else
+   AUTOMAKE="$missing_dir/missing automake"
+   echo "$ac_t""missing" 1>&6
+fi
  
+echo $ac_n "checking for working autoheader""... $ac_c" 1>&6
+echo "configure:876: checking for working autoheader" >&5
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf.  Sigh.
+if (autoheader --version) < /dev/null > /dev/null 2>&1; then
+   AUTOHEADER=autoheader
+   echo "$ac_t""found" 1>&6
+else
+   AUTOHEADER="$missing_dir/missing autoheader"
+   echo "$ac_t""missing" 1>&6
+fi
  
-# Some tools Automake needs.
-
-ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal"}
-
-
-AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"}
-
-
-AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake"}
-
-
-AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"}
-
-
-MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
-
-
-AMTAR=${AMTAR-"${am_missing_run}tar"}
-
-
-if test -z "$install_sh"; then
-   for install_sh in "$ac_aux_dir/install-sh" \
-                     "$ac_aux_dir/install.sh" \
-                     "${am_missing_run}${ac_auxdir}/install-sh";
-   do
-     test -f "$install_sh" && break
-   done
-   # FIXME: an evil hack: we remove the SHELL invocation from
-   # install_sh because automake adds it back in.  Sigh.
-   install_sh=`echo $install_sh | sed -e 's/\${SHELL}//'`
+echo $ac_n "checking for working makeinfo""... $ac_c" 1>&6
+echo "configure:889: checking for working makeinfo" >&5
+# Run test in a subshell; some versions of sh will print an error if
+# an executable is not found, even if stderr is redirected.
+# Redirect stdin to placate older versions of autoconf.  Sigh.
+if (makeinfo --version) < /dev/null > /dev/null 2>&1; then
+   MAKEINFO=makeinfo
+   echo "$ac_t""found" 1>&6
+else
+   MAKEINFO="$missing_dir/missing makeinfo"
+   echo "$ac_t""missing" 1>&6
  fi
  
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
+
+SHARED_VERSION_INFO="1:0:0"
  
  
  
-  
  
  
  #######################################################################
@@ -935,12 +946,12 @@ fi
  
  #####
  
-# Check whether --enable-double or --disable-double was given.
-if test "${enable_double+set}" = set; then
-  enableval="$enable_double"
-  enable_double=$enableval
+# Check whether --enable-float or --disable-float was given.
+if test "${enable_float+set}" = set; then
+  enableval="$enable_float"
+  enable_float=$enableval
  else
-  enable_double=no
+  enable_float=yes
  fi
  
  
@@ -968,7 +979,6 @@ else
  fi
  
  
-
  #####
  
  # Check whether --enable-simplewater or --disable-simplewater was given.
@@ -1006,24 +1016,12 @@ fi
  
  #####
  
-# Check whether --enable-sse or --disable-sse was given.
-if test "${enable_sse+set}" = set; then
-  enableval="$enable_sse"
-  enable_sse=$enableval
-else
-  enable_sse=yes
-fi
-
-
-
-#####
-
-# Check whether --enable-3dnow or --disable-3dnow was given.
-if test "${enable_3dnow+set}" = set; then
-  enableval="$enable_3dnow"
-  enable_3dnow=$enableval
+# Check whether --enable-x86_asm or --disable-x86_asm was given.
+if test "${enable_x86_asm+set}" = set; then
+  enableval="$enable_x86_asm"
+  enable_x86_asm=$enableval
  else
-  enable_3dnow=yes
+  enable_x86_asm=yes
  fi
  
  
@@ -1167,7 +1165,7 @@ else { echo "configure: error: can not run $ac_config_sub" 1>&2; exit 1; }
  fi
  
  echo $ac_n "checking host system type""... $ac_c" 1>&6
-echo "configure:1171: checking host system type" >&5
+echo "configure:1169: checking host system type" >&5
  
  host_alias=$host
  case "$host_alias" in
@@ -1200,7 +1198,7 @@ esac
  if test "$enable_fortran" = "check"; then
  case "${host_cpu}-${host_os}" in
  
-  sparc*-solaris* | alpha*-osf* | rs6000*-aix* | mips*-irix*)
+  sparc*-solaris* | alpha*-* | rs6000*-aix* | mips*-irix*)
      enable_fortran=yes 
      ;;
  
@@ -1284,8 +1282,10 @@ EOF
  esac
  
  if test "$enable_fortran" = "yes"; then
-  # vendor f77 before g77
-  for ac_prog in f77 xlf xlf77 cf77 fl32 g77 fort77 f90 xlf90 pgf77 cf77 fort fort77 pgf90
+  # vendor f77 before g77 - but special compiler list for alpha-linux
+  case "${host_cpu}-${host_os}" in
+    alpha*-linux*)
+      for ac_prog in fort f77 g77
  do
  # Extract the first word of "$ac_prog", so it can be a program name with args.
  set dummy $ac_prog; ac_word=$2
@@ -1319,13 +1319,51 @@ fi
  test -n "$F77" && break
  done
  
+      ;;
+    *)   
+      for ac_prog in f77 xlf xlf77 cf77 fl32 g77 fort77 f90 xlf90 pgf77 cf77 fort fort77 pgf90
+do
+# Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:1330: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_F77'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$F77"; then
+  ac_cv_prog_F77="$F77" # Let the user override the test.
+else
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_F77="$ac_prog"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+fi
+fi
+F77="$ac_cv_prog_F77"
+if test -n "$F77"; then
+  echo "$ac_t""$F77" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+test -n "$F77" && break
+done
+
+      ;;
+  esac
    if test -z "$F77"; then
    for ac_prog in g77 f77 f2c
  do
  # Extract the first word of "$ac_prog", so it can be a program name with args.
  set dummy $ac_prog; ac_word=$2
  echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1329: checking for $ac_word" >&5
+echo "configure:1367: checking for $ac_word" >&5
  if eval "test \"`echo '$''{'ac_cv_prog_F77'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -1358,7 +1396,7 @@ done
  fi
  
  echo $ac_n "checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) works""... $ac_c" 1>&6
-echo "configure:1362: checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) works" >&5
+echo "configure:1400: checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) works" >&5
  
  ac_ext=f
  ac_compile='${F77-f77} -c $FFLAGS conftest.$ac_ext 1>&5'
@@ -1371,7 +1409,7 @@ cat > conftest.$ac_ext << EOF
        end
  
  EOF
-if { (eval echo configure:1375: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:1413: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    ac_cv_prog_f77_works=yes
    # If we can't run a trivial program, we are probably using a cross compiler.
    if (./conftest; exit) 2>/dev/null; then
@@ -1397,12 +1435,12 @@ if test $ac_cv_prog_f77_works = no; then
    { echo "configure: error: installation or configuration problem: Fortran 77 compiler cannot create executables." 1>&2; exit 1; }
  fi
  echo $ac_n "checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6
-echo "configure:1401: checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) is a cross-compiler" >&5
+echo "configure:1439: checking whether the Fortran 77 compiler ($F77 $FFLAGS $LDFLAGS) is a cross-compiler" >&5
  echo "$ac_t""$ac_cv_prog_f77_cross" 1>&6
  cross_compiling=$ac_cv_prog_f77_cross
  
  echo $ac_n "checking whether we are using GNU Fortran 77""... $ac_c" 1>&6
-echo "configure:1406: checking whether we are using GNU Fortran 77" >&5
+echo "configure:1444: checking whether we are using GNU Fortran 77" >&5
  if eval "test \"`echo '$''{'ac_cv_prog_g77'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -1411,7 +1449,7 @@ else
    yes
  #endif
  EOF
-if { ac_try='$F77 -E conftest.fpp'; { (eval echo configure:1415: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
+if { ac_try='$F77 -E conftest.fpp'; { (eval echo configure:1453: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
    ac_cv_prog_g77=yes
  else
    ac_cv_prog_g77=no
@@ -1426,7 +1464,7 @@ if test $ac_cv_prog_g77 = yes; then
    ac_save_FFLAGS="$FFLAGS"
    FFLAGS=
    echo $ac_n "checking whether $F77 accepts -g""... $ac_c" 1>&6
-echo "configure:1430: checking whether $F77 accepts -g" >&5
+echo "configure:1468: checking whether $F77 accepts -g" >&5
  if eval "test \"`echo '$''{'ac_cv_prog_f77_g'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -1461,39 +1499,48 @@ fi
    fi
  fi
  
-# Checks for programs.
-echo $ac_n "checking whether ${MAKE-make} sets \${MAKE}""... $ac_c" 1>&6
-echo "configure:1467: checking whether ${MAKE-make} sets \${MAKE}" >&5
-set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y%./+-%__p_%'`
-if eval "test \"`echo '$''{'ac_cv_prog_make_${ac_make}_set'+set}'`\" = set"; then
+case "${host_cpu}-${host_os}" in
+    alpha*-linux*)
+      for ac_prog in ccc cc
+do
+# Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:1510: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
-  cat > conftestmake <<\EOF
-all:
-       @echo 'ac_maketemp="${MAKE}"'
-EOF
-# GNU make sometimes prints "make[1]: Entering...", which would confuse us.
-eval `${MAKE-make} -f conftestmake 2>/dev/null | grep temp=`
-if test -n "$ac_maketemp"; then
-  eval ac_cv_prog_make_${ac_make}_set=yes
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
  else
-  eval ac_cv_prog_make_${ac_make}_set=no
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_CC="$ac_prog"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
  fi
-rm -f conftestmake
  fi
-if eval "test \"`echo '$ac_cv_prog_make_'${ac_make}_set`\" = yes"; then
-  echo "$ac_t""yes" 1>&6
-  SET_MAKE=
+CC="$ac_cv_prog_CC"
+if test -n "$CC"; then
+  echo "$ac_t""$CC" 1>&6
  else
    echo "$ac_t""no" 1>&6
-  SET_MAKE="MAKE=${MAKE-make}"
  fi
  
-
-# Extract the first word of "cc", so it can be a program name with args.
+test -n "$CC" && break
+done
+ # do vendor cc before gcc
+      ;;
+    *)   
+      # Extract the first word of "cc", so it can be a program name with args.
  set dummy cc; ac_word=$2
  echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1497: checking for $ac_word" >&5
+echo "configure:1544: checking for $ac_word" >&5
  if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -1519,10 +1566,12 @@ else
    echo "$ac_t""no" 1>&6
  fi
   # do vendor cc before gcc
+      ;;
+esac
  # Extract the first word of "gcc", so it can be a program name with args.
  set dummy gcc; ac_word=$2
  echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1526: checking for $ac_word" >&5
+echo "configure:1575: checking for $ac_word" >&5
  if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -1552,7 +1601,7 @@ if test -z "$CC"; then
    # Extract the first word of "cc", so it can be a program name with args.
  set dummy cc; ac_word=$2
  echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1556: checking for $ac_word" >&5
+echo "configure:1605: checking for $ac_word" >&5
  if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -1603,7 +1652,7 @@ fi
        # Extract the first word of "cl", so it can be a program name with args.
  set dummy cl; ac_word=$2
  echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1607: checking for $ac_word" >&5
+echo "configure:1656: checking for $ac_word" >&5
  if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -1635,7 +1684,7 @@ fi
  fi
  
  echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6
-echo "configure:1639: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5
+echo "configure:1688: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5
  
  ac_ext=c
  # CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
@@ -1646,12 +1695,12 @@ cross_compiling=$ac_cv_prog_cc_cross
  
  cat > conftest.$ac_ext << EOF
  
-#line 1650 "configure"
+#line 1699 "configure"
  #include "confdefs.h"
  
  main(){return(0);}
  EOF
-if { (eval echo configure:1655: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:1704: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    ac_cv_prog_cc_works=yes
    # If we can't run a trivial program, we are probably using a cross compiler.
    if (./conftest; exit) 2>/dev/null; then
@@ -1677,12 +1726,12 @@ if test $ac_cv_prog_cc_works = no; then
    { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; }
  fi
  echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6
-echo "configure:1681: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5
+echo "configure:1730: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5
  echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6
  cross_compiling=$ac_cv_prog_cc_cross
  
  echo $ac_n "checking whether we are using GNU C""... $ac_c" 1>&6
-echo "configure:1686: checking whether we are using GNU C" >&5
+echo "configure:1735: checking whether we are using GNU C" >&5
  if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -1691,7 +1740,7 @@ else
    yes;
  #endif
  EOF
-if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:1695: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
+if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:1744: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
    ac_cv_prog_gcc=yes
  else
    ac_cv_prog_gcc=no
@@ -1710,7 +1759,7 @@ ac_test_CFLAGS="${CFLAGS+set}"
  ac_save_CFLAGS="$CFLAGS"
  CFLAGS=
  echo $ac_n "checking whether ${CC-cc} accepts -g""... $ac_c" 1>&6
-echo "configure:1714: checking whether ${CC-cc} accepts -g" >&5
+echo "configure:1763: checking whether ${CC-cc} accepts -g" >&5
  if eval "test \"`echo '$''{'ac_cv_prog_cc_g'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -1741,9 +1790,8 @@ else
    fi
  fi
  
-
  echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6
-echo "configure:1747: checking how to run the C preprocessor" >&5
+echo "configure:1795: checking how to run the C preprocessor" >&5
  # On Suns, sometimes $CPP names a directory.
  if test -n "$CPP" && test -d "$CPP"; then
    CPP=
@@ -1758,13 +1806,13 @@ else
    # On the NeXT, cc -E runs the code through the compiler's parser,
    # not just through cpp.
    cat > conftest.$ac_ext <<EOF
-#line 1762 "configure"
+#line 1810 "configure"
  #include "confdefs.h"
  #include <assert.h>
  Syntax Error
  EOF
  ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:1768: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:1816: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
  ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
  if test -z "$ac_err"; then
    :
@@ -1775,13 +1823,13 @@ else
    rm -rf conftest*
    CPP="${CC-cc} -E -traditional-cpp"
    cat > conftest.$ac_ext <<EOF
-#line 1779 "configure"
+#line 1827 "configure"
  #include "confdefs.h"
  #include <assert.h>
  Syntax Error
  EOF
  ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:1785: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:1833: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
  ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
  if test -z "$ac_err"; then
    :
@@ -1792,13 +1840,13 @@ else
    rm -rf conftest*
    CPP="${CC-cc} -nologo -E"
    cat > conftest.$ac_ext <<EOF
-#line 1796 "configure"
+#line 1844 "configure"
  #include "confdefs.h"
  #include <assert.h>
  Syntax Error
  EOF
  ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:1802: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:1850: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
  ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
  if test -z "$ac_err"; then
    :
@@ -1822,101 +1870,12 @@ else
  fi
  echo "$ac_t""$CPP" 1>&6
  
-am_make=${MAKE-make}
-# BSD make uses .include
-cat > confinc << 'END'
-doit:
-       @echo done
-END
-# If we don't find an include directive, just comment out the code.
-echo $ac_n "checking for style of include used by $am_make""... $ac_c" 1>&6
-echo "configure:1834: checking for style of include used by $am_make" >&5
-_am_include='#'
-for am_inc in include .include; do
-   echo "$am_inc confinc" > confmf
-   if test "`$am_make -f confmf 2> /dev/null`" = "done"; then
-      _am_include=$am_inc
-      break
-   fi
-done
-
-echo "$ac_t""$_am_include" 1>&6
-rm -f confinc confmf
-
-
-depcc="$CC"
-depcpp="$CPP"
-
-
-
-echo $ac_n "checking dependency style of $depcc""... $ac_c" 1>&6
-echo "configure:1854: checking dependency style of $depcc" >&5
-if eval "test \"`echo '$''{'am_cv_CC_dependencies_compiler_type'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  if test -z "$AMDEP"; then
-  # We make a subdir and do the tests there.  Otherwise we can end up
-  # making bogus files that we don't know about and never remove.  For
-  # instance it was reported that on HP-UX the gcc test will end up
-  # making a dummy file named `D' -- because `-MD' means `put the output
-  # in D'.
-  mkdir confdir
-  # Copy depcomp to subdir because otherwise we won't find it if we're
-  # using a relative directory.
-  cp "$am_depcomp" confdir
-  cd confdir
-
-  am_cv_CC_dependencies_compiler_type=none
-  for depmode in `sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < "./depcomp"`; do
-    # We need to recreate these files for each test, as the compiler may
-    # overwrite some of them when testing with obscure command lines.
-    # This happens at least with the AIX C compiler.
-    echo '#include "conftest.h"' > conftest.c
-    echo 'int i;' > conftest.h
-
-    case "$depmode" in
-    nosideeffect)
-      # after this tag, mechanisms are not by side-effect, so they'll
-      # only be used when explicitly requested
-      if test "x$enable_dependency_tracking" = xyes; then
-       continue
-      else
-       break
-      fi
-      ;;
-    none) break ;;
-    esac
-    # We check with `-c' and `-o' for the sake of the "dashmstdout"
-    # mode.  It turns out that the SunPro C++ compiler does not properly
-    # handle `-M -o', and we need to detect this.
-    if depmode="$depmode" \
-       source=conftest.c object=conftest.o \
-       depfile=conftest.Po tmpdepfile=conftest.TPo \
-       $SHELL ./depcomp $depcc -c conftest.c -o conftest.o >/dev/null 2>&1 &&
-       grep conftest.h conftest.Po > /dev/null 2>&1; then
-      am_cv_CC_dependencies_compiler_type="$depmode"
-      break
-    fi
-  done
-
-  cd ..
-  rm -rf confdir
-else
-  am_cv_CC_dependencies_compiler_type=none
-fi
-
-fi
-
-echo "$ac_t""$am_cv_CC_dependencies_compiler_type" 1>&6
-CCDEPMODE="depmode=$am_cv_CC_dependencies_compiler_type"
-
-
  BUILD_CC=$CC
  
  
  if test "$enable_fortran" = "yes"; then
    echo $ac_n "checking for Fortran 77 libraries""... $ac_c" 1>&6
-echo "configure:1920: checking for Fortran 77 libraries" >&5
+echo "configure:1879: checking for Fortran 77 libraries" >&5
  
  
  if eval "test \"`echo '$''{'ac_cv_flibs'+set}'`\" = set"; then
@@ -2075,7 +2034,7 @@ echo "$ac_t""$FLIBS" 1>&6
  
  
  echo $ac_n "checking fortran name mangling""... $ac_c" 1>&6
-echo "configure:2079: checking fortran name mangling" >&5
+echo "configure:2038: checking fortran name mangling" >&5
  cat > mangle-func.f <<EOF
        subroutine foobar()
        return
@@ -2085,7 +2044,7 @@ cat > mangle-func.f <<EOF
        end
  EOF
  ac_try='$F77 -c $FFLAGS mangle-func.f 1>&5'
-if { (eval echo configure:2089: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; then
+if { (eval echo configure:2048: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; then
    ac_try=""
  else
    echo "configure: failed program was:" >&5
@@ -2106,14 +2065,14 @@ cross_compiling=$ac_cv_prog_cc_cross
  ac_save_LIBS="$LIBS"
  LIBS="mangle-func.o $FLIBS $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 2110 "configure"
+#line 2069 "configure"
  #include "confdefs.h"
  
  int main() {
  foobar();
  ; return 0; }
  EOF
-if { (eval echo configure:2117: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2076: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    ac_f77_mangle_type=lowercase
  else
@@ -2121,14 +2080,14 @@ else
    cat conftest.$ac_ext >&5
    rm -rf conftest*
    cat > conftest.$ac_ext <<EOF
-#line 2125 "configure"
+#line 2084 "configure"
  #include "confdefs.h"
  
  int main() {
  foobar_();
  ; return 0; }
  EOF
-if { (eval echo configure:2132: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2091: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    ac_f77_mangle_type=lowercase-underscore
  else
@@ -2136,14 +2095,14 @@ else
    cat conftest.$ac_ext >&5
    rm -rf conftest*
    cat > conftest.$ac_ext <<EOF
-#line 2140 "configure"
+#line 2099 "configure"
  #include "confdefs.h"
  
  int main() {
  FOOBAR();
  ; return 0; }
  EOF
-if { (eval echo configure:2147: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2106: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    ac_f77_mangle_type=uppercase
  else
@@ -2151,14 +2110,14 @@ else
    cat conftest.$ac_ext >&5
    rm -rf conftest*
    cat > conftest.$ac_ext <<EOF
-#line 2155 "configure"
+#line 2114 "configure"
  #include "confdefs.h"
  
  int main() {
  FOOBAR_();
  ; return 0; }
  EOF
-if { (eval echo configure:2162: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2121: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    ac_f77_mangle_type=uppercase-underscore
  else
@@ -2215,7 +2174,7 @@ EOF
  esac
  
  echo $ac_n "checking whether f77 functions with underscore get an extra underscore""... $ac_c" 1>&6
-echo "configure:2219: checking whether f77 functions with underscore get an extra underscore" >&5
+echo "configure:2178: checking whether f77 functions with underscore get an extra underscore" >&5
  
  
  ac_ext=c
@@ -2228,14 +2187,14 @@ cross_compiling=$ac_cv_prog_cc_cross
  ac_save_LIBS="$LIBS"
  LIBS="mangle-func.o $FLIBS $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 2232 "configure"
+#line 2191 "configure"
  #include "confdefs.h"
  
  int main() {
  $mangle_try();
  ; return 0; }
  EOF
-if { (eval echo configure:2239: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2198: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    ac_f77_mangle_underscore=yes;
               cat >> confdefs.h <<\EOF
@@ -2263,7 +2222,7 @@ echo "$ac_t""$ac_f77_mangle_underscore" 1>&6
  fi
  
  # if we are using mpi, also get an MPICC. We cannot set that in the PROG_CC macro
-# above, since the autoconf checks that the created file can be executed. This would
+# above, since autoconf checks that the created file can be executed. This would
  # fail on platforms where MPI executables can only be run through a batchqueue.
  
  if test "$enable_mpi" = "yes"; then
@@ -2272,7 +2231,7 @@ do
  # Extract the first word of "$ac_prog", so it can be a program name with args.
  set dummy $ac_prog; ac_word=$2
  echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2276: checking for $ac_word" >&5
+echo "configure:2235: checking for $ac_word" >&5
  if eval "test \"`echo '$''{'ac_cv_prog_MPICC'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -2305,16 +2264,16 @@ test -n "$MPICC" || MPICC="$CC"
  # now change the normal cc to the MPI one - see the comment above.
    CC=$MPICC
    echo $ac_n "checking whether the MPI cc command works""... $ac_c" 1>&6
-echo "configure:2309: checking whether the MPI cc command works" >&5 # be paranoid
+echo "configure:2268: checking whether the MPI cc command works" >&5 # be paranoid
    cat > conftest.$ac_ext <<EOF
-#line 2311 "configure"
+#line 2270 "configure"
  #include "confdefs.h"
  #include <mpi.h>
  int main() {
  int argc; char **argv; MPI_Init(&argc,&argv);
  ; return 0; }
  EOF
-if { (eval echo configure:2318: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2277: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    echo "$ac_t""yes" 1>&6
  else
@@ -2345,352 +2304,1081 @@ else
  fi
  fi
  
-# Find a good install program.  We prefer a C program (faster),
-# so one script is as good as another.  But avoid the broken or
-# incompatible versions:
-# SysV /etc/install, /usr/sbin/install
-# SunOS /usr/etc/install
-# IRIX /sbin/install
-# AIX /bin/install
-# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
-# AFS /usr/afsws/bin/install, which mishandles nonexistent args
-# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
-# ./install, which can be erroneously created by make from ./install.sh.
-echo $ac_n "checking for a BSD compatible install""... $ac_c" 1>&6
-echo "configure:2361: checking for a BSD compatible install" >&5
-if test -z "$INSTALL"; then
-if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-    IFS="${IFS=        }"; ac_save_IFS="$IFS"; IFS=":"
-  for ac_dir in $PATH; do
-    # Account for people who put trailing slashes in PATH elements.
-    case "$ac_dir/" in
-    /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;;
-    *)
-      # OSF1 and SCO ODT 3.0 have their own names for install.
-      # Don't use installbsd from OSF since it installs stuff as root
-      # by default.
-      for ac_prog in ginstall scoinst install; do
-        if test -f $ac_dir/$ac_prog; then
-         if test $ac_prog = install &&
-            grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then
-           # AIX install.  It has an incompatible calling convention.
-           :
-         else
-           ac_cv_path_install="$ac_dir/$ac_prog -c"
-           break 2
-         fi
-       fi
-      done
-      ;;
-    esac
-  done
-  IFS="$ac_save_IFS"
-
-fi
-  if test "${ac_cv_path_install+set}" = set; then
-    INSTALL="$ac_cv_path_install"
-  else
-    # As a last resort, use the slow shell script.  We don't cache a
-    # path for INSTALL within a source directory, because that will
-    # break other packages using the cache if that directory is
-    # removed, or if the path is relative.
-    INSTALL="$ac_install_sh"
+# A rather complicated check for the capabilities of as, to make
+# sure we can compile the assembly innerloops.
+if test "$x86" = "yes"; then
+  if  test "$enable_x86_asm" = "yes"; then
+    if test "$enable_float" = "no"; then
+      echo "configure: warning: The assembly loops can only be used in single precision - disabling" 1>&2
+      enable_x86_asm=no
+    else 
+      echo $ac_n "checking whether as fully supports intel syntax""... $ac_c" 1>&6
+echo "configure:2317: checking whether as fully supports intel syntax" >&5
+cat > conftest.s << EOF
+.intel_syntax noprefix 
+checkasm:
+       emms
+       pswapd mm0,mm0
+       movups xmm0,checkasm
+       emms
+       ret
+EOF
+      if { ac_try='$CC -c conftest.s'; { (eval echo configure:2327: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; }; then
+       echo "$ac_t""yes" 1>&6
+      else
+        echo "$ac_t""no" 1>&6 
+       { echo "configure: error: Upgrade to binutils>=2.11, download the as executable     
+                    from www.gromacs.org, or disable assembly loops." 1>&2; exit 1; }
+      fi
+    fi 
    fi
+else # not x86
+enable_x86_asm=no
  fi
-echo "$ac_t""$INSTALL" 1>&6
-
-# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
-# It thinks the first close brace ends the variable substitution.
-test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
-
-test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL_PROGRAM}'
  
-test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
-
-echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6
-echo "configure:2414: checking how to run the C preprocessor" >&5
-# On Suns, sometimes $CPP names a directory.
-if test -n "$CPP" && test -d "$CPP"; then
-  CPP=
-fi
-if test -z "$CPP"; then
-if eval "test \"`echo '$''{'ac_cv_prog_CPP'+set}'`\" = set"; then
+# Extract the first word of "ident", so it can be a program name with args.
+set dummy ident; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:2343: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_path_IDENT'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
-    # This must be in double quotes, not single quotes, because CPP may get
-  # substituted into the Makefile and "${CC-cc}" will confuse make.
-  CPP="${CC-cc} -E"
-  # On the NeXT, cc -E runs the code through the compiler's parser,
-  # not just through cpp.
-  cat > conftest.$ac_ext <<EOF
-#line 2429 "configure"
-#include "confdefs.h"
-#include <assert.h>
-Syntax Error
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:2435: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
-if test -z "$ac_err"; then
-  :
+  case "$IDENT" in
+  /*)
+  ac_cv_path_IDENT="$IDENT" # Let the user override the test with a path.
+  ;;
+  ?:/*)                         
+  ac_cv_path_IDENT="$IDENT" # Let the user override the test with a dos path.
+  ;;
+  *)
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do 
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_path_IDENT="$ac_dir/$ac_word"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  test -z "$ac_cv_path_IDENT" && ac_cv_path_IDENT="no"
+  ;;
+esac
+fi
+IDENT="$ac_cv_path_IDENT"
+if test -n "$IDENT"; then
+  echo "$ac_t""$IDENT" 1>&6
  else
-  echo "$ac_err" >&5
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  CPP="${CC-cc} -E -traditional-cpp"
+  echo "$ac_t""no" 1>&6
+fi
+
+if test "$IDENT" != "no"; then
+  # seems as if we have the ident program, but does the
+  # compiler support it?
+  echo $ac_n "checking whether the compiler supports ident""... $ac_c" 1>&6
+echo "configure:2380: checking whether the compiler supports ident" >&5        
    cat > conftest.$ac_ext <<EOF
-#line 2446 "configure"
+#line 2382 "configure"
  #include "confdefs.h"
-#include <assert.h>
-Syntax Error
+#ident  "@(#) file.h 1.1 12/16/92"
  EOF
  ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:2452: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:2387: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
  ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
  if test -z "$ac_err"; then
-  :
-else
-  echo "$ac_err" >&5
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
    rm -rf conftest*
-  CPP="${CC-cc} -nologo -E"
-  cat > conftest.$ac_ext <<EOF
-#line 2463 "configure"
-#include "confdefs.h"
-#include <assert.h>
-Syntax Error
+  
+    echo "$ac_t""yes" 1>&6 
+    cat >> confdefs.h <<\EOF
+#define HAVE_IDENT 
  EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:2469: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
-if test -z "$ac_err"; then
-  :
+
  else
    echo "$ac_err" >&5
    echo "configure: failed program was:" >&5
    cat conftest.$ac_ext >&5
    rm -rf conftest*
-  CPP=/lib/cpp
+  echo "$ac_t""no" 1>&6
  fi
  rm -f conftest*
  fi
-rm -f conftest*
+
+
+echo $ac_n "checking whether ln -s works""... $ac_c" 1>&6
+echo "configure:2409: checking whether ln -s works" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_LN_S'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  rm -f conftestdata
+if ln -s X conftestdata 2>/dev/null
+then
+  rm -f conftestdata
+  ac_cv_prog_LN_S="ln -s"
+else
+  ac_cv_prog_LN_S=ln
+fi
+fi
+LN_S="$ac_cv_prog_LN_S"
+if test "$ac_cv_prog_LN_S" = "ln -s"; then
+  echo "$ac_t""yes" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+# Check whether --enable-shared or --disable-shared was given.
+if test "${enable_shared+set}" = set; then
+  enableval="$enable_shared"
+  p=${PACKAGE-default}
+case $enableval in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)
+  enable_shared=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_shared=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac
+else
+  enable_shared=yes
+fi
+
+# Check whether --enable-static or --disable-static was given.
+if test "${enable_static+set}" = set; then
+  enableval="$enable_static"
+  p=${PACKAGE-default}
+case $enableval in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)
+  enable_static=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_static=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac
+else
+  enable_static=yes
+fi
+
+# Check whether --enable-fast-install or --disable-fast-install was given.
+if test "${enable_fast_install+set}" = set; then
+  enableval="$enable_fast_install"
+  p=${PACKAGE-default}
+case $enableval in
+yes) enable_fast_install=yes ;;
+no) enable_fast_install=no ;;
+*)
+  enable_fast_install=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_fast_install=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac
+else
+  enable_fast_install=yes
+fi
+
+echo $ac_n "checking build system type""... $ac_c" 1>&6
+echo "configure:2499: checking build system type" >&5
+
+build_alias=$build
+case "$build_alias" in
+NONE)
+  case $nonopt in
+  NONE) build_alias=$host_alias ;;
+  *) build_alias=$nonopt ;;
+  esac ;;
+esac
+
+build=`${CONFIG_SHELL-/bin/sh} $ac_config_sub $build_alias`
+build_cpu=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+build_vendor=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+build_os=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+echo "$ac_t""$build" 1>&6
+
+# Check whether --with-gnu-ld or --without-gnu-ld was given.
+if test "${with_gnu_ld+set}" = set; then
+  withval="$with_gnu_ld"
+  test "$withval" = no || with_gnu_ld=yes
+else
+  with_gnu_ld=no
+fi
+
+ac_prog=ld
+if test "$GCC" = yes; then
+  # Check if gcc -print-prog-name=ld gives a path.
+  echo $ac_n "checking for ld used by GCC""... $ac_c" 1>&6
+echo "configure:2528: checking for ld used by GCC" >&5
+  case $host in
+  *-*-mingw*)
+    # gcc leaves a trailing carriage return which upsets mingw
+    ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+  *)
+    ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+  esac
+  case $ac_prog in
+    # Accept absolute paths.
+    [\\/]* | [A-Za-z]:[\\/]*)
+      re_direlt='/[^/][^/]*/\.\./'
+      # Canonicalize the path of ld
+      ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+      while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+       ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+      done
+      test -z "$LD" && LD="$ac_prog"
+      ;;
+  "")
+    # If it fails, then pretend we aren't using GCC.
+    ac_prog=ld
+    ;;
+  *)
+    # If it is relative, then search for the first ld in PATH.
+    with_gnu_ld=unknown
+    ;;
+  esac
+elif test "$with_gnu_ld" = yes; then
+  echo $ac_n "checking for GNU ld""... $ac_c" 1>&6
+echo "configure:2558: checking for GNU ld" >&5
+else
+  echo $ac_n "checking for non-GNU ld""... $ac_c" 1>&6
+echo "configure:2561: checking for non-GNU ld" >&5
+fi
+if eval "test \"`echo '$''{'lt_cv_path_LD'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -z "$LD"; then
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+  for ac_dir in $PATH; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+      lt_cv_path_LD="$ac_dir/$ac_prog"
+      # Check to see if the program is GNU ld.  I'd rather use --version,
+      # but apparently some GNU ld's only accept -v.
+      # Break only if it was the GNU/non-GNU ld that we prefer.
+      if "$lt_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+       test "$with_gnu_ld" != no && break
+      else
+       test "$with_gnu_ld" != yes && break
+      fi
+    fi
+  done
+  IFS="$ac_save_ifs"
+else
+  lt_cv_path_LD="$LD" # Let the user override the test with a path.
+fi
+fi
+
+LD="$lt_cv_path_LD"
+if test -n "$LD"; then
+  echo "$ac_t""$LD" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+test -z "$LD" && { echo "configure: error: no acceptable ld found in \$PATH" 1>&2; exit 1; }
+echo $ac_n "checking if the linker ($LD) is GNU ld""... $ac_c" 1>&6
+echo "configure:2596: checking if the linker ($LD) is GNU ld" >&5
+if eval "test \"`echo '$''{'lt_cv_prog_gnu_ld'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  # I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+  lt_cv_prog_gnu_ld=yes
+else
+  lt_cv_prog_gnu_ld=no
+fi
+fi
+
+echo "$ac_t""$lt_cv_prog_gnu_ld" 1>&6
+with_gnu_ld=$lt_cv_prog_gnu_ld
+
+
+echo $ac_n "checking for $LD option to reload object files""... $ac_c" 1>&6
+echo "configure:2613: checking for $LD option to reload object files" >&5
+if eval "test \"`echo '$''{'lt_cv_ld_reload_flag'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  lt_cv_ld_reload_flag='-r'
+fi
+
+echo "$ac_t""$lt_cv_ld_reload_flag" 1>&6
+reload_flag=$lt_cv_ld_reload_flag
+test -n "$reload_flag" && reload_flag=" $reload_flag"
+
+echo $ac_n "checking for BSD-compatible nm""... $ac_c" 1>&6
+echo "configure:2625: checking for BSD-compatible nm" >&5
+if eval "test \"`echo '$''{'lt_cv_path_NM'+set}'`\" = set"; then  # keep a cached or preset answer
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$NM"; then
+  # Let the user override the test.
+  lt_cv_path_NM="$NM"
+else
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"  # additionally split words on the path separator (":" unless PATH_SEPARATOR is set)
+  for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do  # $PATH first, then three fixed fallback directories
+    test -z "$ac_dir" && ac_dir=.  # an empty PATH entry is treated as the current directory
+    tmp_nm=$ac_dir/${ac_tool_prefix}nm  # NOTE(review): ac_tool_prefix is assigned further down in this script; confirm it already holds the intended value here
+    if test -f $tmp_nm || test -f $tmp_nm$ac_exeext ; then
+      # Check to see if the nm accepts a BSD-compat flag.
+      # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+      #   nm: unknown option "B" ignored
+      # Tru64's nm complains that /dev/null is an invalid object file
+      if ($tmp_nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep '(/dev/null|Invalid file or object type)' >/dev/null; then
+       lt_cv_path_NM="$tmp_nm -B"
+       break
+      elif ($tmp_nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then  # second choice: the -p flag
+       lt_cv_path_NM="$tmp_nm -p"
+       break
+      else
+       lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
+       continue # so that we can try to find one that supports BSD flags
+      fi
+    fi
+  done
+  IFS="$ac_save_ifs"
+  test -z "$lt_cv_path_NM" && lt_cv_path_NM=nm  # nothing found anywhere: fall back to the bare name "nm"
+fi
+fi
+
+NM="$lt_cv_path_NM"
+echo "$ac_t""$NM" 1>&6
+
+echo $ac_n "checking how to recognise dependant libraries""... $ac_c" 1>&6
+echo "configure:2663: checking how to recognise dependant libraries" >&5
+if eval "test \"`echo '$''{'lt_cv_deplibs_check_method'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  lt_cv_file_magic_cmd='$MAGIC_CMD'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [regex]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case $host_os in
+aix*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+beos*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+bsdi4*)
+  lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)'
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  ;;
+
+cygwin* | mingw* |pw32*)
+  lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+  lt_cv_file_magic_cmd='$OBJDUMP -f'
+  ;;
+
+darwin* | rhapsody*)
+  lt_cv_deplibs_check_method='file_magic Mach-O dynamically linked shared library'
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  case "$host_os" in
+  rhapsody* | darwin1.012)
+    lt_cv_file_magic_test_file='/System/Library/Frameworks/System.framework/System'
+    ;;
+  *) # Darwin 1.3 on
+    lt_cv_file_magic_test_file='/usr/lib/libSystem.dylib'
+    ;;
+  esac
+  ;;
+
+freebsd* )
+  if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+    case $host_cpu in
+    i*86 )
+      # Not sure whether the presence of OpenBSD here was a mistake.
+      # Let's accept both of them until this is cleared up.
+      lt_cv_deplibs_check_method='file_magic (FreeBSD|OpenBSD)/i[3-9]86 (compact )?demand paged shared library'
+      lt_cv_file_magic_cmd=/usr/bin/file
+      lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+      ;;
+    esac
+  else
+    lt_cv_deplibs_check_method=pass_all
+  fi
+  ;;
+
+gnu*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+hpux10.20*|hpux11*)
+  lt_cv_deplibs_check_method='file_magic (s[0-9][0-9][0-9]|PA-RISC[0-9].[0-9]) shared library'
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libc.sl
+  ;;
+
+irix5* | irix6*)
+  case $host_os in
+  irix5*)
+    # this will be overridden with pass_all, but let us keep it just in case
+    lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+    ;;
+  *)
+    case $LD in
+    *-32|*"-32 ") libmagic=32-bit;;
+    *-n32|*"-n32 ") libmagic=N32;;
+    *-64|*"-64 ") libmagic=64-bit;;
+    *) libmagic=never-match;;
+    esac
+    # this will be overridden with pass_all, but let us keep it just in case
+    lt_cv_deplibs_check_method="file_magic ELF ${libmagic} MSB mips-[1234] dynamic lib MIPS - version 1"
+    ;;
+  esac
+  lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+  case $host_cpu in
+  alpha* | hppa* | i*86 | powerpc* | sparc* | ia64* )
+    lt_cv_deplibs_check_method=pass_all ;;
+  *)
+    # glibc up to 2.1.1 does not perform some relocations on ARM
+    lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )' ;;
+  esac
+  lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+  ;;
+
+netbsd*)
+  if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+    lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so\.[0-9]+\.[0-9]+$'
+  else
+    lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so$'
+  fi
+  ;;
+
+newsos6)
+  lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (executable|dynamic lib)'
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libnls.so
+  ;;
+
+osf3* | osf4* | osf5*)
+  # this will be overridden with pass_all, but let us keep it just in case
+  lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sco3.2v5*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+solaris*)
+  lt_cv_deplibs_check_method=pass_all
+  lt_cv_file_magic_test_file=/lib/libc.so
+  ;;
+
+sysv5uw[78]* | sysv4*uw2*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+  case $host_vendor in
+  ncr)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  motorola)
+    lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]'
+    lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+    ;;
+  esac
+  ;;
+esac
+
+fi
+
+echo "$ac_t""$lt_cv_deplibs_check_method" 1>&6
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+
+echo $ac_n "checking for object suffix""... $ac_c" 1>&6
+echo "configure:2827: checking for object suffix" >&5
+if eval "test \"`echo '$''{'ac_cv_objext'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  rm -f conftest*
+echo 'int i = 1;' > conftest.$ac_ext
+if { (eval echo configure:2833: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+  for ac_file in conftest.*; do
+    case $ac_file in
+    *.c) ;;
+    *) ac_cv_objext=`echo $ac_file | sed -e s/conftest.//` ;;
+    esac
+  done
+else
+  { echo "configure: error: installation or configuration problem; compiler does not work" 1>&2; exit 1; }
  fi
  rm -f conftest*
-  ac_cv_prog_CPP="$CPP"
  fi
-  CPP="$ac_cv_prog_CPP"
+
+echo "$ac_t""$ac_cv_objext" 1>&6
+OBJEXT=$ac_cv_objext
+ac_objext=$ac_cv_objext
+
+
+
+echo $ac_n "checking for executable suffix""... $ac_c" 1>&6
+echo "configure:2853: checking for executable suffix" >&5
+if eval "test \"`echo '$''{'ac_cv_exeext'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
  else
-  ac_cv_prog_CPP="$CPP"
+  if test "$CYGWIN" = yes || test "$MINGW32" = yes; then
+  ac_cv_exeext=.exe
+else
+  rm -f conftest*
+  echo 'int main () { return 0; }' > conftest.$ac_ext
+  ac_cv_exeext=
+  if { (eval echo configure:2863: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; }; then
+    for file in conftest.*; do
+      case $file in
+      *.c | *.o | *.obj) ;;
+      *) ac_cv_exeext=`echo $file | sed -e s/conftest//` ;;
+      esac
+    done
+  else
+    { echo "configure: error: installation or configuration problem: compiler cannot create executables." 1>&2; exit 1; }
+  fi
+  rm -f conftest*
+  test x"${ac_cv_exeext}" = x && ac_cv_exeext=no
+fi
  fi
-echo "$ac_t""$CPP" 1>&6
  
+EXEEXT=""
+test x"${ac_cv_exeext}" != xno && EXEEXT=${ac_cv_exeext}
+echo "$ac_t""${ac_cv_exeext}" 1>&6
+ac_exeext=$EXEEXT
  
-# A rather complicated check for the nasm program and x86 assembly capabilities
-# to run under windows we must insert a test a change nasm "-f elf" to "-f win32"
-if test "$x86" = "yes"; then
-  if  test "$enable_sse" = "yes" -o "$enable_3dnow" = "yes"; then
-    if test "$enable_double" = "yes"; then
-      echo "configure: warning: SSE/3Dnow assembly can only be used in single precision" 1>&2
-      enable_sse=no
-      enable_3dnow=no
-    else 
-      # Extract the first word of "nasm", so it can be a program name with args.
-set dummy nasm; ac_word=$2
-echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2506: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_path_NASM'+set}'`\" = set"; then
+if test $host != $build; then
+  ac_tool_prefix=${host_alias}-
+else
+  ac_tool_prefix=
+fi
+
+# Autoconf 2.13's AC_OBJEXT and AC_EXEEXT macros only work for C compilers!
+
+# Only perform the check for file, if the check method requires it
+case $deplibs_check_method in
+file_magic*)
+  if test "$file_magic_cmd" = '$MAGIC_CMD'; then
+    echo $ac_n "checking for ${ac_tool_prefix}file""... $ac_c" 1>&6
+echo "configure:2896: checking for ${ac_tool_prefix}file" >&5
+if eval "test \"`echo '$''{'lt_cv_path_MAGIC_CMD'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
-  case "$NASM" in
+  case $MAGIC_CMD in
    /*)
-  ac_cv_path_NASM="$NASM" # Let the user override the test with a path.
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
    ;;
-  ?:/*)                         
-  ac_cv_path_NASM="$NASM" # Let the user override the test with a dos path.
+  ?:/*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a dos path.
    ;;
    *)
-  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS=":"
-  ac_dummy="$PATH"
-  for ac_dir in $ac_dummy; do 
+  ac_save_MAGIC_CMD="$MAGIC_CMD"
+  IFS="${IFS=   }"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="/usr/bin:$PATH"
+  for ac_dir in $ac_dummy; do
      test -z "$ac_dir" && ac_dir=.
-    if test -f $ac_dir/$ac_word; then
-      ac_cv_path_NASM="$ac_dir/$ac_word"
+    if test -f $ac_dir/${ac_tool_prefix}file; then
+      lt_cv_path_MAGIC_CMD="$ac_dir/${ac_tool_prefix}file"
+      if test -n "$file_magic_test_file"; then
+       case $deplibs_check_method in
+       "file_magic "*)
+         file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+         MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+         if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+           egrep "$file_magic_regex" > /dev/null; then
+           :
+         else
+           cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such.  This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem.  Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+         fi ;;
+       esac
+      fi
        break
      fi
    done
    IFS="$ac_save_ifs"
-  test -z "$ac_cv_path_NASM" && ac_cv_path_NASM="no"
+  MAGIC_CMD="$ac_save_MAGIC_CMD"
    ;;
  esac
  fi
-NASM="$ac_cv_path_NASM"
-if test -n "$NASM"; then
-  echo "$ac_t""$NASM" 1>&6
+
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+  echo "$ac_t""$MAGIC_CMD" 1>&6
  else
    echo "$ac_t""no" 1>&6
  fi
  
-      NASMFLAGS="-f elf" 
-      
-      if test "$NASM" = "no"; then 
-         { echo "configure: error: Nasm is required for SSE and 3DNow loops." 1>&2; exit 1; }
-      fi
-      if test "$enable_sse" = "yes"; then
-        echo $ac_n "checking whether nasm supports SSE instructions""... $ac_c" 1>&6
-echo "configure:2546: checking whether nasm supports SSE instructions" >&5
-cat > conftest_sse.s << EOF
-       global checksse 
-checksse:
-       emms
-       xorps xmm0,xmm0
-       emms
-       ret
-EOF
-        if { ac_try='$NASM conftest_sse.s'; { (eval echo configure:2555: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; }; then
-         echo "$ac_t""yes" 1>&6
-        else
-         echo "$ac_t""no" 1>&6 
-         { echo "configure: error: Download a patched nasm from the Gromacs homepage,     
-                       or disable SSE assembly." 1>&2; exit 1; }
-        fi
-      fi       
-      if test "$enable_3dnow" = "yes"; then
-        echo $ac_n "checking whether nasm supports extended 3DNow instructions""... $ac_c" 1>&6
-echo "configure:2565: checking whether nasm supports extended 3DNow instructions" >&5
-cat > conftest_3dnow.s << EOF
-       global check3dnow       
-check3dnow:    
-       femms
-       pswapd mm0,mm0
-       femms
-       ret
+if test -z "$lt_cv_path_MAGIC_CMD"; then
+  if test -n "$ac_tool_prefix"; then
+    echo $ac_n "checking for file""... $ac_c" 1>&6
+echo "configure:2958: checking for file" >&5
+if eval "test \"`echo '$''{'lt_cv_path_MAGIC_CMD'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  case $MAGIC_CMD in
+  /*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+  ;;
+  ?:/*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a dos path.
+  ;;
+  *)
+  ac_save_MAGIC_CMD="$MAGIC_CMD"
+  IFS="${IFS=   }"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="/usr/bin:$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/file; then
+      lt_cv_path_MAGIC_CMD="$ac_dir/file"
+      if test -n "$file_magic_test_file"; then
+       case $deplibs_check_method in
+       "file_magic "*)
+         file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+         MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+         if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+           egrep "$file_magic_regex" > /dev/null; then
+           :
+         else
+           cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such.  This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem.  Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
  EOF
-        if { ac_try='$NASM -f elf conftest_3dnow.s'; { (eval echo configure:2574: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; }; then
-         echo "$ac_t""yes" 1>&6
-        else
-         echo "$ac_t""no" 1>&6
-         { echo "configure: error: Download a patched nasm from the Gromacs homepage,
-                       or disable 3DNow assembly." 1>&2; exit 1; }
-        fi
+         fi ;;
+       esac
        fi
+      break
      fi
+  done
+  IFS="$ac_save_ifs"
+  MAGIC_CMD="$ac_save_MAGIC_CMD"
+  ;;
+esac
+fi
+
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+  echo "$ac_t""$MAGIC_CMD" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+  else
+    MAGIC_CMD=:
    fi
-else # not x86
-enable_sse=no
-enable_3dnow=no
  fi
  
-# Extract the first word of "ident", so it can be a program name with args.
-set dummy ident; ac_word=$2
+  fi
+  ;;
+esac
+
+# Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args.
+set dummy ${ac_tool_prefix}ranlib; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:3029: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$RANLIB"; then
+  ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+fi
+fi
+RANLIB="$ac_cv_prog_RANLIB"
+if test -n "$RANLIB"; then
+  echo "$ac_t""$RANLIB" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+
+if test -z "$ac_cv_prog_RANLIB"; then
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "ranlib", so it can be a program name with args.
+set dummy ranlib; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:3061: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$RANLIB"; then
+  ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+  IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_RANLIB="ranlib"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":"
+fi
+fi
+RANLIB="$ac_cv_prog_RANLIB"
+if test -n "$RANLIB"; then
+  echo "$ac_t""$RANLIB" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+else
+  RANLIB=":"
+fi
+fi
+
+# Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args.
+set dummy ${ac_tool_prefix}strip; ac_word=$2
  echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2592: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_path_IDENT'+set}'`\" = set"; then
+echo "configure:3096: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_STRIP'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
-  case "$IDENT" in
-  /*)
-  ac_cv_path_IDENT="$IDENT" # Let the user override the test with a path.
-  ;;
-  ?:/*)                         
-  ac_cv_path_IDENT="$IDENT" # Let the user override the test with a dos path.
-  ;;
-  *)
+  if test -n "$STRIP"; then
+  ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
    IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS=":"
    ac_dummy="$PATH"
-  for ac_dir in $ac_dummy; do 
+  for ac_dir in $ac_dummy; do
      test -z "$ac_dir" && ac_dir=.
      if test -f $ac_dir/$ac_word; then
-      ac_cv_path_IDENT="$ac_dir/$ac_word"
+      ac_cv_prog_STRIP="${ac_tool_prefix}strip"
        break
      fi
    done
    IFS="$ac_save_ifs"
-  test -z "$ac_cv_path_IDENT" && ac_cv_path_IDENT="no"
-  ;;
-esac
  fi
-IDENT="$ac_cv_path_IDENT"
-if test -n "$IDENT"; then
-  echo "$ac_t""$IDENT" 1>&6
-else
-  echo "$ac_t""no" 1>&6
  fi
-
-if test "$IDENT" != "no"; then
-  # seems as if we have the ident program, but does the
-  # compiler support it?
-  echo $ac_n "checking whether the compiler supports ident""... $ac_c" 1>&6
-echo "configure:2629: checking whether the compiler supports ident" >&5        
-  cat > conftest.$ac_ext <<EOF
-#line 2631 "configure"
-#include "confdefs.h"
-#ident  "@(#) file.h 1.1 12/16/92"
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:2636: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
-if test -z "$ac_err"; then
-  rm -rf conftest*
-  
-    echo "$ac_t""yes" 1>&6 
-    cat >> confdefs.h <<\EOF
-#define HAVE_IDENT 
-EOF
-
+STRIP="$ac_cv_prog_STRIP"
+if test -n "$STRIP"; then
+  echo "$ac_t""$STRIP" 1>&6
  else
-  echo "$ac_err" >&5
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
    echo "$ac_t""no" 1>&6
  fi
-rm -f conftest*
-fi
  
-# Extract the first word of "ranlib", so it can be a program name with args.
-set dummy ranlib; ac_word=$2
+
+if test -z "$ac_cv_prog_STRIP"; then
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "strip", so it can be a program name with args.
+set dummy strip; ac_word=$2
  echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2659: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
+echo "configure:3128: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_STRIP'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
-  if test -n "$RANLIB"; then
-  ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+  if test -n "$STRIP"; then
+  ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
  else
    IFS="${IFS=  }"; ac_save_ifs="$IFS"; IFS=":"
    ac_dummy="$PATH"
    for ac_dir in $ac_dummy; do
      test -z "$ac_dir" && ac_dir=.
      if test -f $ac_dir/$ac_word; then
-      ac_cv_prog_RANLIB="ranlib"
+      ac_cv_prog_STRIP="strip"
        break
      fi
    done
    IFS="$ac_save_ifs"
-  test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":"
+  test -z "$ac_cv_prog_STRIP" && ac_cv_prog_STRIP=":"
  fi
  fi
-RANLIB="$ac_cv_prog_RANLIB"
-if test -n "$RANLIB"; then
-  echo "$ac_t""$RANLIB" 1>&6
+STRIP="$ac_cv_prog_STRIP"
+if test -n "$STRIP"; then
+  echo "$ac_t""$STRIP" 1>&6
  else
    echo "$ac_t""no" 1>&6
  fi
  
+else
+  STRIP=":"
+fi
+fi
+
+
+# Check for any special flags to pass to ltconfig.
+libtool_flags="--cache-file=$cache_file"
+test "$enable_shared" = no && libtool_flags="$libtool_flags --disable-shared"
+test "$enable_static" = no && libtool_flags="$libtool_flags --disable-static"
+test "$enable_fast_install" = no && libtool_flags="$libtool_flags --disable-fast-install"
+test "$GCC" = yes && libtool_flags="$libtool_flags --with-gcc"
+test "$lt_cv_prog_gnu_ld" = yes && libtool_flags="$libtool_flags --with-gnu-ld"
+
+
+# Check whether --enable-libtool-lock or --disable-libtool-lock was given.
+if test "${enable_libtool_lock+set}" = set; then
+  enableval="$enable_libtool_lock"
+  :
+fi
+
+test "x$enable_libtool_lock" = xno && libtool_flags="$libtool_flags --disable-lock"
+test x"$silent" = xyes && libtool_flags="$libtool_flags --silent"
+
+# Check whether --with-pic or --without-pic was given.
+if test "${with_pic+set}" = set; then
+  withval="$with_pic"
+  pic_mode="$withval"
+else
+  pic_mode=default
+fi
+
+test x"$pic_mode" = xyes && libtool_flags="$libtool_flags --prefer-pic"
+test x"$pic_mode" = xno && libtool_flags="$libtool_flags --prefer-non-pic"
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case $host in
+*-*-irix6*)
+  # Find out which ABI we are using.
+  echo '#line 3195 "configure"' > conftest.$ac_ext  # the test source consists of this single #line directive
+  if { (eval echo configure:3196: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+    case `/usr/bin/file conftest.$ac_objext` in  # choose the linker ABI switch from what file(1) reports for the object
+    *32-bit*)
+      LD="${LD-ld} -32"
+      ;;
+    *N32*)
+      LD="${LD-ld} -n32"
+      ;;
+    *64-bit*)
+      LD="${LD-ld} -64"
+      ;;
+    esac
+  fi
+  rm -rf conftest*
+  ;;
+
+*-*-sco3.2v5*)
+  # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+  SAVE_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -belf"  # try with -belf first; it is taken back out below unless the link test succeeds
+  echo $ac_n "checking whether the C compiler needs -belf""... $ac_c" 1>&6
+echo "configure:3217: checking whether the C compiler needs -belf" >&5
+if eval "test \"`echo '$''{'lt_cv_cc_needs_belf'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  
+     ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+     cat > conftest.$ac_ext <<EOF
+#line 3230 "configure"
+#include "confdefs.h"
+
+int main() {
+
+; return 0; }
+EOF
+if { (eval echo configure:3237: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then  # link succeeded and left a non-empty executable
+  rm -rf conftest*
+  lt_cv_cc_needs_belf=yes
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  lt_cv_cc_needs_belf=no
+fi
+rm -f conftest*
+     ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+fi
+
+echo "$ac_t""$lt_cv_cc_needs_belf" 1>&6
+  if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+    # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+    CFLAGS="$SAVE_CFLAGS"  # restore the flags saved before -belf was appended
+  fi
+  ;;
+
+
+esac
+
+
+# Save cache, so that ltconfig can load it
+cat > confcache <<\EOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs.  It is not useful on other systems.
+# If it contains results you don't want to keep, you may remove or edit it.
+#
+# By default, configure uses ./config.cache as the cache file,
+# creating it if it does not exist already.  You can give configure
+# the --cache-file=FILE option to use a different cache file; that is
+# what configure does when it calls configure scripts in
+# subdirectories, so they share the cache.
+# Giving --cache-file=/dev/null disables caching, for debugging configure.
+# config.status only pays attention to the cache file if you give it the
+# --recheck option to rerun configure.
+#
+EOF
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(set) 2>&1 |
+  case `(ac_space=' '; set | grep ac_space) 2>&1` in
+  *ac_space=\ *)
+    # `set' does not quote correctly, so add quotes (double-quote substitution
+    # turns \\\\ into \\, and sed turns \\ into \).
+    sed -n \
+      -e "s/'/'\\\\''/g" \
+      -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
+    ;;
+  *)
+    # `set' quotes correctly as required by POSIX, so do not add quotes.
+    sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
+    ;;
+  esac >> confcache
+if cmp -s $cache_file confcache; then  # cache file already identical: nothing to write
+  :
+else
+  if test -w $cache_file; then
+    echo "updating cache $cache_file"
+    cat confcache > $cache_file  # overwrite the contents of the existing file
+  else
+    echo "not updating unwritable cache $cache_file"
+  fi
+fi
+rm -f confcache  # the temporary copy is removed in every case
+
+
+# Actually configure libtool.  ac_aux_dir is where install-sh is found.
+AR="$AR" LTCC="$CC" CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig --no-reexec \
+$libtool_flags --no-verify --build="$build" $ac_aux_dir/ltmain.sh $host \
+|| { echo "configure: error: libtool configure failed" 1>&2; exit 1; }
+
+# Reload cache, that may have been modified by ltconfig
+if test -r "$cache_file"; then
+  echo "loading cache $cache_file"
+  . $cache_file
+else
+  echo "creating cache $cache_file"
+  > $cache_file
+fi
+
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltconfig $ac_aux_dir/ltmain.sh $ac_aux_dir/ltcf-c.sh"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'  # single quotes keep the $(...) references literal
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log  # fd 5 is reopened on config.log in append mode
+
+
+LIBTOOL_DEPS=$LIBTOOL_DEPS" $ac_aux_dir/ltcf-f77.sh"  # add the F77 tag script to the dependency list
+lt_save_CC="$CC"
+lt_save_CFLAGS="$CFLAGS"
+AR="$AR" LTCC="$CC" CC="$F77" F77="$F77" CFLAGS="$FFLAGS" CPPFLAGS="" \
+MAGIC_CMD="$MAGIC_CMD" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" \
+file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig -o libtool $libtool_flags \
+--build="$build" --add-tag=F77 $ac_aux_dir/ltcf-f77.sh $host \
+|| { echo "configure: error: libtool tag configuration failed" 1>&2; exit 1; }  # a failing tag run aborts configure
+CC="$lt_save_CC"
+CFLAGS="$lt_save_CFLAGS"
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+
+  
+  
+
+  
+        
+        
  for ac_func in strcasecmp
  do
  echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
-echo "configure:2689: checking for $ac_func" >&5
+echo "configure:3377: checking for $ac_func" >&5
  if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 2694 "configure"
+#line 3382 "configure"
  #include "confdefs.h"
  /* System header to define __stub macros and hopefully few prototypes,
      which can conflict with char $ac_func(); below.  */
@@ -2713,7 +3401,7 @@ $ac_func();
  
  ; return 0; }
  EOF
-if { (eval echo configure:2717: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3405: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_func_$ac_func=yes"
  else
@@ -2740,12 +3428,12 @@ done
  for ac_func in strdup
  do
  echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
-echo "configure:2744: checking for $ac_func" >&5
+echo "configure:3432: checking for $ac_func" >&5
  if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 2749 "configure"
+#line 3437 "configure"
  #include "confdefs.h"
  /* System header to define __stub macros and hopefully few prototypes,
      which can conflict with char $ac_func(); below.  */
@@ -2768,7 +3456,7 @@ $ac_func();
  
  ; return 0; }
  EOF
-if { (eval echo configure:2772: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3460: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_func_$ac_func=yes"
  else
@@ -2876,7 +3564,7 @@ fi
  # Checks for libraries.
  ############################################################################
  echo $ac_n "checking for sqrt in -lm""... $ac_c" 1>&6
-echo "configure:2880: checking for sqrt in -lm" >&5
+echo "configure:3568: checking for sqrt in -lm" >&5
  ac_lib_var=`echo m'_'sqrt | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -2884,7 +3572,7 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lm  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 2888 "configure"
+#line 3576 "configure"
  #include "confdefs.h"
  /* Override any gcc2 internal prototype to avoid an error.  */
  /* We use char because int might match the return type of a gcc2
@@ -2895,7 +3583,7 @@ int main() {
  sqrt()
  ; return 0; }
  EOF
-if { (eval echo configure:2899: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3587: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -2930,7 +3618,7 @@ fi
  # libm in the link list, thus the test goes after m!
  if test "${host_vendor}" = "ibm"; then
    echo $ac_n "checking for main in -lxlopt""... $ac_c" 1>&6
-echo "configure:2934: checking for main in -lxlopt" >&5
+echo "configure:3622: checking for main in -lxlopt" >&5
  ac_lib_var=`echo xlopt'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -2938,14 +3626,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lxlopt  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 2942 "configure"
+#line 3630 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:2949: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3637: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -2973,7 +3661,7 @@ else
  fi
  
    echo $ac_n "checking for main in -lmass""... $ac_c" 1>&6
-echo "configure:2977: checking for main in -lmass" >&5
+echo "configure:3665: checking for main in -lmass" >&5
  ac_lib_var=`echo mass'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -2981,14 +3669,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lmass  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 2985 "configure"
+#line 3673 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:2992: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3680: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -3019,7 +3707,7 @@ fi
    case "$gmxcpu" in
      power4*)
        echo $ac_n "checking for main in -lmassvp4""... $ac_c" 1>&6
-echo "configure:3023: checking for main in -lmassvp4" >&5
+echo "configure:3711: checking for main in -lmassvp4" >&5
  ac_lib_var=`echo massvp4'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -3027,14 +3715,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lmassvp4  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 3031 "configure"
+#line 3719 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:3038: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3726: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -3056,7 +3744,7 @@ fi
    ;;
      power3*)
        echo $ac_n "checking for main in -lmassvp3""... $ac_c" 1>&6
-echo "configure:3060: checking for main in -lmassvp3" >&5
+echo "configure:3748: checking for main in -lmassvp3" >&5
  ac_lib_var=`echo massvp3'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -3064,14 +3752,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lmassvp3  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 3068 "configure"
+#line 3756 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:3075: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3763: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -3093,7 +3781,7 @@ fi
    ;;
      power2*)
        echo $ac_n "checking for main in -lmassvp3""... $ac_c" 1>&6
-echo "configure:3097: checking for main in -lmassvp3" >&5
+echo "configure:3785: checking for main in -lmassvp3" >&5
  ac_lib_var=`echo massvp3'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -3101,14 +3789,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lmassvp3  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 3105 "configure"
+#line 3793 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:3112: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3800: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -3130,7 +3818,7 @@ fi
    ;;
      *)
        echo $ac_n "checking for main in -lmassv""... $ac_c" 1>&6
-echo "configure:3134: checking for main in -lmassv" >&5
+echo "configure:3822: checking for main in -lmassv" >&5
  ac_lib_var=`echo massv'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -3138,14 +3826,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lmassv  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 3142 "configure"
+#line 3830 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:3149: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3837: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -3174,10 +3862,10 @@ EOF
    fi
  fi 
  
-if test "$enable_double" = "yes"; then
-  precision=8
-else
+if test "$enable_float" = "yes"; then
    precision=4
+else
+  precision=8
  fi
  
  if test "$enable_mpi" = "yes"; then
@@ -3196,16 +3884,16 @@ usedprefix=""
  ok="no"
  # check header doesn't work, since we must use mpicc to get includes, not just /lib/cpp
  echo $ac_n "checking for fftw_mpi.h""... $ac_c" 1>&6
-echo "configure:3200: checking for fftw_mpi.h" >&5
+echo "configure:3888: checking for fftw_mpi.h" >&5
  cat > conftest.$ac_ext <<EOF
-#line 3202 "configure"
+#line 3890 "configure"
  #include "confdefs.h"
  #include <fftw_mpi.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:3209: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:3897: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    
  fftwname=fftw_mpi 
@@ -3221,16 +3909,16 @@ rm -f conftest*
  
  
  if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
  cat > conftest.$ac_ext <<EOF
-#line 3227 "configure"
+#line 3915 "configure"
  #include "confdefs.h"
  #include <$fftwname.h>
  int main() {
  int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)]; 
  ; return 0; }
  EOF
-if { (eval echo configure:3234: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:3922: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    ok="yes"
  else
@@ -3247,16 +3935,16 @@ fftwname=fftw_mpi
  if test "$ok" != "yes"; then
    xfftwname=${fftwcheckprefix}${fftwname}
    echo $ac_n "checking for $xfftwname.h""... $ac_c" 1>&6
-echo "configure:3251: checking for $xfftwname.h" >&5
+echo "configure:3939: checking for $xfftwname.h" >&5
    cat > conftest.$ac_ext <<EOF
-#line 3253 "configure"
+#line 3941 "configure"
  #include "confdefs.h"
  #include <$xfftwname.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:3260: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:3948: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    echo "$ac_t""yes" 1>&6
  else
@@ -3266,26 +3954,31 @@ else
    
  echo "$ac_t""no" 1>&6
  { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org 
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double. 
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the 
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:                                        
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double. 
  If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
  
  fi
  rm -f conftest*
  cat > conftest.$ac_ext <<EOF
-#line 3282 "configure"
+#line 3975 "configure"
  #include "confdefs.h"
  #include <$xfftwname.h>
  int main() {
  int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
  ; return 0; }
  EOF
-if { (eval echo configure:3289: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:3982: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    
  fftwname=$xfftwname 
@@ -3295,21 +3988,27 @@ else
    echo "configure: failed program was:" >&5
    cat conftest.$ac_ext >&5
    rm -rf conftest*
-  { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org 
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double. 
+  
+{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the 
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:                                       
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double. 
  If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
  fi
  rm -f conftest*
  fi
  
  echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3313: checking for main in -l$fftwname" >&5
+echo "configure:4012: checking for main in -l$fftwname" >&5
  ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -3317,14 +4016,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-l$fftwname  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 3321 "configure"
+#line 4020 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:3328: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4027: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -3359,20 +4058,20 @@ else
  
  fftwname=${ac_fftw_savedprefix}fftw_mpi
  echo $ac_n "checking for $fftwname.h""... $ac_c" 1>&6
-echo "configure:3363: checking for $fftwname.h" >&5
+echo "configure:4062: checking for $fftwname.h" >&5
  cat > conftest.$ac_ext <<EOF
-#line 3365 "configure"
+#line 4064 "configure"
  #include "confdefs.h"
  #include <$fftwname.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:3372: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4071: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    echo "$ac_t""yes" 1>&6
  echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3376: checking for main in -l$fftwname" >&5
+echo "configure:4075: checking for main in -l$fftwname" >&5
  ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -3380,14 +4079,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-l$fftwname  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 3384 "configure"
+#line 4083 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:3391: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4090: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -3444,16 +4143,16 @@ usedprefix=""
  ok="no"
  # check header doesn't work, since we must use mpicc to get includes, not just /lib/cpp
  echo $ac_n "checking for rfftw_mpi.h""... $ac_c" 1>&6
-echo "configure:3448: checking for rfftw_mpi.h" >&5
+echo "configure:4147: checking for rfftw_mpi.h" >&5
  cat > conftest.$ac_ext <<EOF
-#line 3450 "configure"
+#line 4149 "configure"
  #include "confdefs.h"
  #include <rfftw_mpi.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:3457: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4156: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    
  fftwname=rfftw_mpi 
@@ -3469,16 +4168,16 @@ rm -f conftest*
  
  
  if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
  cat > conftest.$ac_ext <<EOF
-#line 3475 "configure"
+#line 4174 "configure"
  #include "confdefs.h"
  #include <$fftwname.h>
  int main() {
  int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)]; 
  ; return 0; }
  EOF
-if { (eval echo configure:3482: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4181: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    ok="yes"
  else
@@ -3495,16 +4194,16 @@ fftwname=rfftw_mpi
  if test "$ok" != "yes"; then
    xfftwname=${fftwcheckprefix}${fftwname}
    echo $ac_n "checking for $xfftwname.h""... $ac_c" 1>&6
-echo "configure:3499: checking for $xfftwname.h" >&5
+echo "configure:4198: checking for $xfftwname.h" >&5
    cat > conftest.$ac_ext <<EOF
-#line 3501 "configure"
+#line 4200 "configure"
  #include "confdefs.h"
  #include <$xfftwname.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:3508: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4207: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    echo "$ac_t""yes" 1>&6
  else
@@ -3514,26 +4213,31 @@ else
    
  echo "$ac_t""no" 1>&6
  { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org 
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double. 
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the 
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:                                        
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double. 
  If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
  
  fi
  rm -f conftest*
  cat > conftest.$ac_ext <<EOF
-#line 3530 "configure"
+#line 4234 "configure"
  #include "confdefs.h"
  #include <$xfftwname.h>
  int main() {
  int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
  ; return 0; }
  EOF
-if { (eval echo configure:3537: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4241: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    
  fftwname=$xfftwname 
@@ -3543,21 +4247,27 @@ else
    echo "configure: failed program was:" >&5
    cat conftest.$ac_ext >&5
    rm -rf conftest*
-  { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org 
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double. 
+  
+{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the 
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:                                       
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double. 
  If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
  fi
  rm -f conftest*
  fi
  
  echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3561: checking for main in -l$fftwname" >&5
+echo "configure:4271: checking for main in -l$fftwname" >&5
  ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -3565,14 +4275,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-l$fftwname  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 3569 "configure"
+#line 4279 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:3576: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4286: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -3607,20 +4317,20 @@ else
  
  fftwname=${ac_fftw_savedprefix}rfftw_mpi
  echo $ac_n "checking for $fftwname.h""... $ac_c" 1>&6
-echo "configure:3611: checking for $fftwname.h" >&5
+echo "configure:4321: checking for $fftwname.h" >&5
  cat > conftest.$ac_ext <<EOF
-#line 3613 "configure"
+#line 4323 "configure"
  #include "confdefs.h"
  #include <$fftwname.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:3620: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4330: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    echo "$ac_t""yes" 1>&6
  echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3624: checking for main in -l$fftwname" >&5
+echo "configure:4334: checking for main in -l$fftwname" >&5
  ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -3628,14 +4338,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-l$fftwname  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 3632 "configure"
+#line 4342 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:3639: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4349: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -3694,16 +4404,16 @@ usedprefix=""
  ok="no"
  # check header doesn't work, since we must use mpicc to get includes, not just /lib/cpp
  echo $ac_n "checking for fftw.h""... $ac_c" 1>&6
-echo "configure:3698: checking for fftw.h" >&5
+echo "configure:4408: checking for fftw.h" >&5
  cat > conftest.$ac_ext <<EOF
-#line 3700 "configure"
+#line 4410 "configure"
  #include "confdefs.h"
  #include <fftw.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:3707: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4417: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    
  fftwname=fftw 
@@ -3719,16 +4429,16 @@ rm -f conftest*
  
  
  if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
  cat > conftest.$ac_ext <<EOF
-#line 3725 "configure"
+#line 4435 "configure"
  #include "confdefs.h"
  #include <$fftwname.h>
  int main() {
  int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)]; 
  ; return 0; }
  EOF
-if { (eval echo configure:3732: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4442: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    ok="yes"
  else
@@ -3745,16 +4455,16 @@ fftwname=fftw
  if test "$ok" != "yes"; then
    xfftwname=${fftwcheckprefix}${fftwname}
    echo $ac_n "checking for $xfftwname.h""... $ac_c" 1>&6
-echo "configure:3749: checking for $xfftwname.h" >&5
+echo "configure:4459: checking for $xfftwname.h" >&5
    cat > conftest.$ac_ext <<EOF
-#line 3751 "configure"
+#line 4461 "configure"
  #include "confdefs.h"
  #include <$xfftwname.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:3758: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4468: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    echo "$ac_t""yes" 1>&6
  else
@@ -3764,26 +4474,31 @@ else
    
  echo "$ac_t""no" 1>&6
  { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org 
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double. 
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the 
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:                                        
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double. 
  If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
  
  fi
  rm -f conftest*
  cat > conftest.$ac_ext <<EOF
-#line 3780 "configure"
+#line 4495 "configure"
  #include "confdefs.h"
  #include <$xfftwname.h>
  int main() {
  int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
  ; return 0; }
  EOF
-if { (eval echo configure:3787: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4502: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    
  fftwname=$xfftwname 
@@ -3793,21 +4508,27 @@ else
    echo "configure: failed program was:" >&5
    cat conftest.$ac_ext >&5
    rm -rf conftest*
-  { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org 
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double. 
+  
+{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the 
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:                                       
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double. 
  If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
  fi
  rm -f conftest*
  fi
  
  echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3811: checking for main in -l$fftwname" >&5
+echo "configure:4532: checking for main in -l$fftwname" >&5
  ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -3815,14 +4536,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-l$fftwname  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 3819 "configure"
+#line 4540 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:3826: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4547: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -3857,20 +4578,20 @@ else
  
  fftwname=${ac_fftw_savedprefix}fftw
  echo $ac_n "checking for $fftwname.h""... $ac_c" 1>&6
-echo "configure:3861: checking for $fftwname.h" >&5
+echo "configure:4582: checking for $fftwname.h" >&5
  cat > conftest.$ac_ext <<EOF
-#line 3863 "configure"
+#line 4584 "configure"
  #include "confdefs.h"
  #include <$fftwname.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:3870: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4591: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    echo "$ac_t""yes" 1>&6
  echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:3874: checking for main in -l$fftwname" >&5
+echo "configure:4595: checking for main in -l$fftwname" >&5
  ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -3878,14 +4599,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-l$fftwname  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 3882 "configure"
+#line 4603 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:3889: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4610: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -3942,16 +4663,16 @@ usedprefix=""
  ok="no"
  # check header doesn't work, since we must use mpicc to get includes, not just /lib/cpp
  echo $ac_n "checking for rfftw.h""... $ac_c" 1>&6
-echo "configure:3946: checking for rfftw.h" >&5
+echo "configure:4667: checking for rfftw.h" >&5
  cat > conftest.$ac_ext <<EOF
-#line 3948 "configure"
+#line 4669 "configure"
  #include "confdefs.h"
  #include <rfftw.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:3955: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4676: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    
  fftwname=rfftw 
@@ -3967,16 +4688,16 @@ rm -f conftest*
  
  
  if test -n "$fftwname"; then
-# we cannot run the code since MPI program might not be allowed outside a charge queue
+# we cannot run the code since an MPI program might not be allowed on a login node of a supercomputer
  cat > conftest.$ac_ext <<EOF
-#line 3973 "configure"
+#line 4694 "configure"
  #include "confdefs.h"
  #include <$fftwname.h>
  int main() {
  int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)]; 
  ; return 0; }
  EOF
-if { (eval echo configure:3980: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4701: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    ok="yes"
  else
@@ -3993,16 +4714,16 @@ fftwname=rfftw
  if test "$ok" != "yes"; then
    xfftwname=${fftwcheckprefix}${fftwname}
    echo $ac_n "checking for $xfftwname.h""... $ac_c" 1>&6
-echo "configure:3997: checking for $xfftwname.h" >&5
+echo "configure:4718: checking for $xfftwname.h" >&5
    cat > conftest.$ac_ext <<EOF
-#line 3999 "configure"
+#line 4720 "configure"
  #include "confdefs.h"
  #include <$xfftwname.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:4006: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4727: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    echo "$ac_t""yes" 1>&6
  else
@@ -4012,26 +4733,31 @@ else
    
  echo "$ac_t""no" 1>&6
  { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org 
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double. 
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the 
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:                                        
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double. 
  If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
  
  fi
  rm -f conftest*
  cat > conftest.$ac_ext <<EOF
-#line 4028 "configure"
+#line 4754 "configure"
  #include "confdefs.h"
  #include <$xfftwname.h>
  int main() {
  int _array_ [1 - 2 * !((sizeof(fftw_real)) == $precision)];
  ; return 0; }
  EOF
-if { (eval echo configure:4035: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4761: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    
  fftwname=$xfftwname 
@@ -4041,21 +4767,27 @@ else
    echo "configure: failed program was:" >&5
    cat conftest.$ac_ext >&5
    rm -rf conftest*
-  { echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
-Do you have $prec precision FFTW installed? You can find it at www.fftw.org 
-Note that the default FFTW setup is double precision. You change the
-FFTW configuration to single with --enable-float and turn on MPI support
-with --enable-mpi. It is a good idea to install both single & double. 
+  
+{ echo "configure: error: Cannot find any $prec precision $fftwname.h or $xfftwname.h
+Do you have $prec precision FFTW installed? If you are using packages,
+note that you also need fftw-devel to compile GROMACS. You can find the 
+software at www.fftw.org, and detailed instructions at www.gromacs.org.
+If you compiled FFTW yourself:                                       
+Note that the default FFTW setup is double precision. Change the FFTW
+configuration to single with --enable-float. If you want MPI support,
+use --enable-mpi. It is a good idea to install both single & double. 
  If your sysadm doesn't want to install it you can do it to a location
-in your home directory and provide Gromacs configure with the correct
-paths by setting the CPPFLAGS and LDFLAGS environment variables.
-Check the Gromacs INSTALL file for additional information." 1>&2; exit 1; }
+in your home directory and provide the correct paths in the CPPFLAGS
+and LDFLAGS environment variables before running configure.
+That is also necessary to do if your compiler doesn't search
+/usr/local/include and /usr/local/lib by default.
+You can find information at www.gromacs.org, or in the INSTALL file." 1>&2; exit 1; }
  fi
  rm -f conftest*
  fi
  
  echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:4059: checking for main in -l$fftwname" >&5
+echo "configure:4791: checking for main in -l$fftwname" >&5
  ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -4063,14 +4795,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-l$fftwname  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 4067 "configure"
+#line 4799 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:4074: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4806: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -4105,20 +4837,20 @@ else
  
  fftwname=${ac_fftw_savedprefix}rfftw
  echo $ac_n "checking for $fftwname.h""... $ac_c" 1>&6
-echo "configure:4109: checking for $fftwname.h" >&5
+echo "configure:4841: checking for $fftwname.h" >&5
  cat > conftest.$ac_ext <<EOF
-#line 4111 "configure"
+#line 4843 "configure"
  #include "confdefs.h"
  #include <$fftwname.h>
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:4118: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:4850: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    echo "$ac_t""yes" 1>&6
  echo $ac_n "checking for main in -l$fftwname""... $ac_c" 1>&6
-echo "configure:4122: checking for main in -l$fftwname" >&5
+echo "configure:4854: checking for main in -l$fftwname" >&5
  ac_lib_var=`echo $fftwname'_'main | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -4126,14 +4858,14 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-l$fftwname  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 4130 "configure"
+#line 4862 "configure"
  #include "confdefs.h"
  
  int main() {
  main()
  ; return 0; }
  EOF
-if { (eval echo configure:4137: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:4869: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -4199,26 +4931,26 @@ esac
  
  ######
  if test "$enable_xdr" = "no"; then
-  echo "configure: warning: * Not using XDR cripples Gromacs significantly. You won't be able to *
-              * read or write any hardware independent or compressed trajectories. *
-              * We strongly suggest you try to locate the RPC routines instead!    *" 1>&2
+  echo "configure: warning: * Not using XDR cripples GROMACS significantly. You won't be able to *
+              * read or write any compressed trajectories. You have no choice on   *
+              * windows, but if you run UNIX locate the RPC files - you have them! *" 1>&2
  else
  # check for xtc headers
    for ac_hdr in rpc/rpc.h rpc/xdr.h
  do
  ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
  echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
-echo "configure:4212: checking for $ac_hdr" >&5
+echo "configure:4944: checking for $ac_hdr" >&5
  if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 4217 "configure"
+#line 4949 "configure"
  #include "confdefs.h"
  #include <$ac_hdr>
  EOF
  ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:4222: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:4954: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
  ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
  if test -z "$ac_err"; then
    rm -rf conftest*
@@ -4248,7 +4980,7 @@ done
  # check for xtc libs
  # on solaris the xdr stuff is in -lnsl
    echo $ac_n "checking for xdr_float in -lnsl""... $ac_c" 1>&6
-echo "configure:4252: checking for xdr_float in -lnsl" >&5
+echo "configure:4984: checking for xdr_float in -lnsl" >&5
  ac_lib_var=`echo nsl'_'xdr_float | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -4256,7 +4988,7 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lnsl  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 4260 "configure"
+#line 4992 "configure"
  #include "confdefs.h"
  /* Override any gcc2 internal prototype to avoid an error.  */
  /* We use char because int might match the return type of a gcc2
@@ -4267,7 +4999,7 @@ int main() {
  xdr_float()
  ; return 0; }
  EOF
-if { (eval echo configure:4271: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5003: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -4295,7 +5027,7 @@ else
  fi
  
    cat > conftest.$ac_ext <<EOF
-#line 4299 "configure"
+#line 5031 "configure"
  #include "confdefs.h"
  #include<rpc/rpc.h> 
   #include<rpc/xdr.h>
@@ -4303,7 +5035,7 @@ int main() {
   XDR *xd; float f; xdr_float(xd,&f);
  ; return 0; }
  EOF
-if { (eval echo configure:4307: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5039: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    :
  else
    echo "configure: failed program was:" >&5
@@ -4330,7 +5062,7 @@ fi
  # Uses ac_ vars as temps to allow command line to override cache and checks.
  # --without-x overrides everything else, but does not touch the cache.
  echo $ac_n "checking for X""... $ac_c" 1>&6
-echo "configure:4334: checking for X" >&5
+echo "configure:5066: checking for X" >&5
  
  # Check whether --with-x or --without-x was given.
  if test "${with_x+set}" = set; then
@@ -4392,12 +5124,12 @@ if test "$ac_x_includes" = NO; then
  
    # First, try using that file with no special directory specified.
  cat > conftest.$ac_ext <<EOF
-#line 4396 "configure"
+#line 5128 "configure"
  #include "confdefs.h"
  #include <$x_direct_test_include>
  EOF
  ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:4401: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:5133: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
  ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
  if test -z "$ac_err"; then
    rm -rf conftest*
@@ -4466,14 +5198,14 @@ if test "$ac_x_libraries" = NO; then
    ac_save_LIBS="$LIBS"
    LIBS="-l$x_direct_test_library $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 4470 "configure"
+#line 5202 "configure"
  #include "confdefs.h"
  
  int main() {
  ${x_direct_test_function}()
  ; return 0; }
  EOF
-if { (eval echo configure:4477: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5209: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    LIBS="$ac_save_LIBS"
  # We can link X programs with no special library path.
@@ -4579,17 +5311,17 @@ else
      case "`(uname -sr) 2>/dev/null`" in
      "SunOS 5"*)
        echo $ac_n "checking whether -R must be followed by a space""... $ac_c" 1>&6
-echo "configure:4583: checking whether -R must be followed by a space" >&5
+echo "configure:5315: checking whether -R must be followed by a space" >&5
        ac_xsave_LIBS="$LIBS"; LIBS="$LIBS -R$x_libraries"
        cat > conftest.$ac_ext <<EOF
-#line 4586 "configure"
+#line 5318 "configure"
  #include "confdefs.h"
  
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:4593: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5325: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    ac_R_nospace=yes
  else
@@ -4605,14 +5337,14 @@ rm -f conftest*
        else
         LIBS="$ac_xsave_LIBS -R $x_libraries"
         cat > conftest.$ac_ext <<EOF
-#line 4609 "configure"
+#line 5341 "configure"
  #include "confdefs.h"
  
  int main() {
  
  ; return 0; }
  EOF
-if { (eval echo configure:4616: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5348: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    ac_R_space=yes
  else
@@ -4644,7 +5376,7 @@ rm -f conftest*
      # libraries were built with DECnet support.  And karl@cs.umb.edu says
      # the Alpha needs dnet_stub (dnet does not exist).
      echo $ac_n "checking for dnet_ntoa in -ldnet""... $ac_c" 1>&6
-echo "configure:4648: checking for dnet_ntoa in -ldnet" >&5
+echo "configure:5380: checking for dnet_ntoa in -ldnet" >&5
  ac_lib_var=`echo dnet'_'dnet_ntoa | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -4652,7 +5384,7 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-ldnet  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 4656 "configure"
+#line 5388 "configure"
  #include "confdefs.h"
  /* Override any gcc2 internal prototype to avoid an error.  */
  /* We use char because int might match the return type of a gcc2
@@ -4663,7 +5395,7 @@ int main() {
  dnet_ntoa()
  ; return 0; }
  EOF
-if { (eval echo configure:4667: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5399: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -4685,7 +5417,7 @@ fi
  
      if test $ac_cv_lib_dnet_dnet_ntoa = no; then
        echo $ac_n "checking for dnet_ntoa in -ldnet_stub""... $ac_c" 1>&6
-echo "configure:4689: checking for dnet_ntoa in -ldnet_stub" >&5
+echo "configure:5421: checking for dnet_ntoa in -ldnet_stub" >&5
  ac_lib_var=`echo dnet_stub'_'dnet_ntoa | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -4693,7 +5425,7 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-ldnet_stub  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 4697 "configure"
+#line 5429 "configure"
  #include "confdefs.h"
  /* Override any gcc2 internal prototype to avoid an error.  */
  /* We use char because int might match the return type of a gcc2
@@ -4704,7 +5436,7 @@ int main() {
  dnet_ntoa()
  ; return 0; }
  EOF
-if { (eval echo configure:4708: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5440: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -4733,12 +5465,12 @@ fi
      # The nsl library prevents programs from opening the X display
      # on Irix 5.2, according to dickey@clark.net.
      echo $ac_n "checking for gethostbyname""... $ac_c" 1>&6
-echo "configure:4737: checking for gethostbyname" >&5
+echo "configure:5469: checking for gethostbyname" >&5
  if eval "test \"`echo '$''{'ac_cv_func_gethostbyname'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 4742 "configure"
+#line 5474 "configure"
  #include "confdefs.h"
  /* System header to define __stub macros and hopefully few prototypes,
      which can conflict with char gethostbyname(); below.  */
@@ -4761,7 +5493,7 @@ gethostbyname();
  
  ; return 0; }
  EOF
-if { (eval echo configure:4765: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5497: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_func_gethostbyname=yes"
  else
@@ -4782,7 +5514,7 @@ fi
  
      if test $ac_cv_func_gethostbyname = no; then
        echo $ac_n "checking for gethostbyname in -lnsl""... $ac_c" 1>&6
-echo "configure:4786: checking for gethostbyname in -lnsl" >&5
+echo "configure:5518: checking for gethostbyname in -lnsl" >&5
  ac_lib_var=`echo nsl'_'gethostbyname | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -4790,7 +5522,7 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lnsl  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 4794 "configure"
+#line 5526 "configure"
  #include "confdefs.h"
  /* Override any gcc2 internal prototype to avoid an error.  */
  /* We use char because int might match the return type of a gcc2
@@ -4801,7 +5533,7 @@ int main() {
  gethostbyname()
  ; return 0; }
  EOF
-if { (eval echo configure:4805: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5537: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -4831,12 +5563,12 @@ fi
      # -lsocket must be given before -lnsl if both are needed.
      # We assume that if connect needs -lnsl, so does gethostbyname.
      echo $ac_n "checking for connect""... $ac_c" 1>&6
-echo "configure:4835: checking for connect" >&5
+echo "configure:5567: checking for connect" >&5
  if eval "test \"`echo '$''{'ac_cv_func_connect'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 4840 "configure"
+#line 5572 "configure"
  #include "confdefs.h"
  /* System header to define __stub macros and hopefully few prototypes,
      which can conflict with char connect(); below.  */
@@ -4859,7 +5591,7 @@ connect();
  
  ; return 0; }
  EOF
-if { (eval echo configure:4863: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5595: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_func_connect=yes"
  else
@@ -4880,7 +5612,7 @@ fi
  
      if test $ac_cv_func_connect = no; then
        echo $ac_n "checking for connect in -lsocket""... $ac_c" 1>&6
-echo "configure:4884: checking for connect in -lsocket" >&5
+echo "configure:5616: checking for connect in -lsocket" >&5
  ac_lib_var=`echo socket'_'connect | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -4888,7 +5620,7 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lsocket $X_EXTRA_LIBS $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 4892 "configure"
+#line 5624 "configure"
  #include "confdefs.h"
  /* Override any gcc2 internal prototype to avoid an error.  */
  /* We use char because int might match the return type of a gcc2
@@ -4899,7 +5631,7 @@ int main() {
  connect()
  ; return 0; }
  EOF
-if { (eval echo configure:4903: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5635: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -4923,12 +5655,12 @@ fi
  
      # gomez@mi.uni-erlangen.de says -lposix is necessary on A/UX.
      echo $ac_n "checking for remove""... $ac_c" 1>&6
-echo "configure:4927: checking for remove" >&5
+echo "configure:5659: checking for remove" >&5
  if eval "test \"`echo '$''{'ac_cv_func_remove'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 4932 "configure"
+#line 5664 "configure"
  #include "confdefs.h"
  /* System header to define __stub macros and hopefully few prototypes,
      which can conflict with char remove(); below.  */
@@ -4951,7 +5683,7 @@ remove();
  
  ; return 0; }
  EOF
-if { (eval echo configure:4955: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5687: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_func_remove=yes"
  else
@@ -4972,7 +5704,7 @@ fi
  
      if test $ac_cv_func_remove = no; then
        echo $ac_n "checking for remove in -lposix""... $ac_c" 1>&6
-echo "configure:4976: checking for remove in -lposix" >&5
+echo "configure:5708: checking for remove in -lposix" >&5
  ac_lib_var=`echo posix'_'remove | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -4980,7 +5712,7 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lposix  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 4984 "configure"
+#line 5716 "configure"
  #include "confdefs.h"
  /* Override any gcc2 internal prototype to avoid an error.  */
  /* We use char because int might match the return type of a gcc2
@@ -4991,7 +5723,7 @@ int main() {
  remove()
  ; return 0; }
  EOF
-if { (eval echo configure:4995: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5727: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -5015,12 +5747,12 @@ fi
  
      # BSDI BSD/OS 2.1 needs -lipc for XOpenDisplay.
      echo $ac_n "checking for shmat""... $ac_c" 1>&6
-echo "configure:5019: checking for shmat" >&5
+echo "configure:5751: checking for shmat" >&5
  if eval "test \"`echo '$''{'ac_cv_func_shmat'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 5024 "configure"
+#line 5756 "configure"
  #include "confdefs.h"
  /* System header to define __stub macros and hopefully few prototypes,
      which can conflict with char shmat(); below.  */
@@ -5043,7 +5775,7 @@ shmat();
  
  ; return 0; }
  EOF
-if { (eval echo configure:5047: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5779: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_func_shmat=yes"
  else
@@ -5064,7 +5796,7 @@ fi
  
      if test $ac_cv_func_shmat = no; then
        echo $ac_n "checking for shmat in -lipc""... $ac_c" 1>&6
-echo "configure:5068: checking for shmat in -lipc" >&5
+echo "configure:5800: checking for shmat in -lipc" >&5
  ac_lib_var=`echo ipc'_'shmat | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -5072,7 +5804,7 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lipc  $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 5076 "configure"
+#line 5808 "configure"
  #include "confdefs.h"
  /* Override any gcc2 internal prototype to avoid an error.  */
  /* We use char because int might match the return type of a gcc2
@@ -5083,7 +5815,7 @@ int main() {
  shmat()
  ; return 0; }
  EOF
-if { (eval echo configure:5087: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5819: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -5116,7 +5848,7 @@ fi
    # libraries we check for below, so use a different variable.
    #  --interran@uluru.Stanford.EDU, kb@cs.umb.edu.
    echo $ac_n "checking for IceConnectionNumber in -lICE""... $ac_c" 1>&6
-echo "configure:5120: checking for IceConnectionNumber in -lICE" >&5
+echo "configure:5852: checking for IceConnectionNumber in -lICE" >&5
  ac_lib_var=`echo ICE'_'IceConnectionNumber | sed 'y%./+-%__p_%'`
  if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
@@ -5124,7 +5856,7 @@ else
    ac_save_LIBS="$LIBS"
  LIBS="-lICE $X_EXTRA_LIBS $LIBS"
  cat > conftest.$ac_ext <<EOF
-#line 5128 "configure"
+#line 5860 "configure"
  #include "confdefs.h"
  /* Override any gcc2 internal prototype to avoid an error.  */
  /* We use char because int might match the return type of a gcc2
@@ -5135,7 +5867,7 @@ int main() {
  IceConnectionNumber()
  ; return 0; }
  EOF
-if { (eval echo configure:5139: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:5871: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_lib_$ac_lib_var=yes"
  else
@@ -5189,7 +5921,7 @@ fi
  
  
  echo $ac_n "checking for Motif""... $ac_c" 1>&6
-echo "configure:5193: checking for Motif" >&5
+echo "configure:5925: checking for Motif" >&5
  
  
  #
@@ -5213,14 +5945,14 @@ LDFLAGS="$X_LIBS $LDFLAGS"
  #
  ac_cv_motif_includes="none"
  cat > conftest.$ac_ext <<EOF
-#line 5217 "configure"
+#line 5949 "configure"
  #include "confdefs.h"
  #include <Xm/Xm.h>
  int main() {
  int a;
  ; return 0; }
  EOF
-if { (eval echo configure:5224: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:5956: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    
  # Xm/Xm.h is in the standard search path.
@@ -5285,14 +6017,14 @@ LDFLAGS="$X_LIBS $LDFLAGS"
  #
  ac_cv_motif_libraries="none"
  cat > conftest.$ac_ext <<EOF
-#line 5289 "configure"
+#line 6021 "configure"
  #include "confdefs.h"
  #include <Xm/Xm.h>
  int main() {
  XtToolkitInitialize();
  ; return 0; }
  EOF
-if { (eval echo configure:5296: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:6028: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    
  # libXm.a is in the standard search path.
@@ -5399,17 +6131,17 @@ for ac_hdr in limits.h malloc.h strings.h unistd.h
  do
  ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
  echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
-echo "configure:5403: checking for $ac_hdr" >&5
+echo "configure:6135: checking for $ac_hdr" >&5
  if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 5408 "configure"
+#line 6140 "configure"
  #include "confdefs.h"
  #include <$ac_hdr>
  EOF
  ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:5413: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:6145: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
  ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
  if test -z "$ac_err"; then
    rm -rf conftest*
@@ -5440,12 +6172,12 @@ done
  #####
  # Checks for typedefs, structures, and compiler characteristics.
  echo $ac_n "checking for working const""... $ac_c" 1>&6
-echo "configure:5444: checking for working const" >&5
+echo "configure:6176: checking for working const" >&5
  if eval "test \"`echo '$''{'ac_cv_c_const'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 5449 "configure"
+#line 6181 "configure"
  #include "confdefs.h"
  
  int main() {
@@ -5494,7 +6226,7 @@ ccp = (char const *const *) p;
  
  ; return 0; }
  EOF
-if { (eval echo configure:5498: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:6230: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    ac_cv_c_const=yes
  else
@@ -5515,12 +6247,12 @@ EOF
  fi
  
  echo $ac_n "checking for ANSI C header files""... $ac_c" 1>&6
-echo "configure:5519: checking for ANSI C header files" >&5
+echo "configure:6251: checking for ANSI C header files" >&5
  if eval "test \"`echo '$''{'ac_cv_header_stdc'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 5524 "configure"
+#line 6256 "configure"
  #include "confdefs.h"
  #include <stdlib.h>
  #include <stdarg.h>
@@ -5528,7 +6260,7 @@ else
  #include <float.h>
  EOF
  ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:5532: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:6264: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
  ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
  if test -z "$ac_err"; then
    rm -rf conftest*
@@ -5545,7 +6277,7 @@ rm -f conftest*
  if test $ac_cv_header_stdc = yes; then
    # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
  cat > conftest.$ac_ext <<EOF
-#line 5549 "configure"
+#line 6281 "configure"
  #include "confdefs.h"
  #include <string.h>
  EOF
@@ -5563,7 +6295,7 @@ fi
  if test $ac_cv_header_stdc = yes; then
    # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
  cat > conftest.$ac_ext <<EOF
-#line 5567 "configure"
+#line 6299 "configure"
  #include "confdefs.h"
  #include <stdlib.h>
  EOF
@@ -5584,7 +6316,7 @@ if test "$cross_compiling" = yes; then
    :
  else
    cat > conftest.$ac_ext <<EOF
-#line 5588 "configure"
+#line 6320 "configure"
  #include "confdefs.h"
  #include <ctype.h>
  #define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
@@ -5595,7 +6327,7 @@ if (XOR (islower (i), ISLOWER (i)) || toupper (i) != TOUPPER (i)) exit(2);
  exit (0); }
  
  EOF
-if { (eval echo configure:5599: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
+if { (eval echo configure:6331: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
  then
    :
  else
@@ -5619,12 +6351,12 @@ EOF
  fi
  
  echo $ac_n "checking for size_t""... $ac_c" 1>&6
-echo "configure:5623: checking for size_t" >&5
+echo "configure:6355: checking for size_t" >&5
  if eval "test \"`echo '$''{'ac_cv_type_size_t'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 5628 "configure"
+#line 6360 "configure"
  #include "confdefs.h"
  #include <sys/types.h>
  #if STDC_HEADERS
@@ -5652,12 +6384,12 @@ EOF
  fi
  
  echo $ac_n "checking whether struct tm is in sys/time.h or time.h""... $ac_c" 1>&6
-echo "configure:5656: checking whether struct tm is in sys/time.h or time.h" >&5
+echo "configure:6388: checking whether struct tm is in sys/time.h or time.h" >&5
  if eval "test \"`echo '$''{'ac_cv_struct_tm'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 5661 "configure"
+#line 6393 "configure"
  #include "confdefs.h"
  #include <sys/types.h>
  #include <time.h>
@@ -5665,7 +6397,7 @@ int main() {
  struct tm *tp; tp->tm_sec;
  ; return 0; }
  EOF
-if { (eval echo configure:5669: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:6401: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    ac_cv_struct_tm=time.h
  else
@@ -5686,12 +6418,12 @@ EOF
  fi
  
  echo $ac_n "checking for uid_t in sys/types.h""... $ac_c" 1>&6
-echo "configure:5690: checking for uid_t in sys/types.h" >&5
+echo "configure:6422: checking for uid_t in sys/types.h" >&5
  if eval "test \"`echo '$''{'ac_cv_type_uid_t'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 5695 "configure"
+#line 6427 "configure"
  #include "confdefs.h"
  #include <sys/types.h>
  EOF
@@ -5720,21 +6452,21 @@ EOF
  fi
  
  echo $ac_n "checking for inline""... $ac_c" 1>&6
-echo "configure:5724: checking for inline" >&5
+echo "configure:6456: checking for inline" >&5
  if eval "test \"`echo '$''{'ac_cv_c_inline'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    ac_cv_c_inline=no
  for ac_kw in inline __inline__ __inline; do
    cat > conftest.$ac_ext <<EOF
-#line 5731 "configure"
+#line 6463 "configure"
  #include "confdefs.h"
  
  int main() {
  } $ac_kw foo() {
  ; return 0; }
  EOF
-if { (eval echo configure:5738: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:6470: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    ac_cv_c_inline=$ac_kw; break
  else
@@ -5764,7 +6496,7 @@ esac
  # Checks for library functions.
  #AC_FUNC_MALLOC
  echo $ac_n "checking for 8-bit clean memcmp""... $ac_c" 1>&6
-echo "configure:5768: checking for 8-bit clean memcmp" >&5
+echo "configure:6500: checking for 8-bit clean memcmp" >&5
  if eval "test \"`echo '$''{'ac_cv_func_memcmp_clean'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -5772,7 +6504,7 @@ else
    ac_cv_func_memcmp_clean=no
  else
    cat > conftest.$ac_ext <<EOF
-#line 5776 "configure"
+#line 6508 "configure"
  #include "confdefs.h"
  
  main()
@@ -5782,7 +6514,7 @@ main()
  }
  
  EOF
-if { (eval echo configure:5786: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
+if { (eval echo configure:6518: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
  then
    ac_cv_func_memcmp_clean=yes
  else
@@ -5800,12 +6532,12 @@ echo "$ac_t""$ac_cv_func_memcmp_clean" 1>&6
  test $ac_cv_func_memcmp_clean = no && LIBOBJS="$LIBOBJS memcmp.${ac_objext}"
  
  echo $ac_n "checking return type of signal handlers""... $ac_c" 1>&6
-echo "configure:5804: checking return type of signal handlers" >&5
+echo "configure:6536: checking return type of signal handlers" >&5
  if eval "test \"`echo '$''{'ac_cv_type_signal'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 5809 "configure"
+#line 6541 "configure"
  #include "confdefs.h"
  #include <sys/types.h>
  #include <signal.h>
@@ -5822,7 +6554,7 @@ int main() {
  int i;
  ; return 0; }
  EOF
-if { (eval echo configure:5826: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:6558: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
    rm -rf conftest*
    ac_cv_type_signal=void
  else
@@ -5841,12 +6573,12 @@ EOF
  
  
  echo $ac_n "checking for vprintf""... $ac_c" 1>&6
-echo "configure:5845: checking for vprintf" >&5
+echo "configure:6577: checking for vprintf" >&5
  if eval "test \"`echo '$''{'ac_cv_func_vprintf'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 5850 "configure"
+#line 6582 "configure"
  #include "confdefs.h"
  /* System header to define __stub macros and hopefully few prototypes,
      which can conflict with char vprintf(); below.  */
@@ -5869,7 +6601,7 @@ vprintf();
  
  ; return 0; }
  EOF
-if { (eval echo configure:5873: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:6605: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_func_vprintf=yes"
  else
@@ -5893,12 +6625,12 @@ fi
  
  if test "$ac_cv_func_vprintf" != yes; then
  echo $ac_n "checking for _doprnt""... $ac_c" 1>&6
-echo "configure:5897: checking for _doprnt" >&5
+echo "configure:6629: checking for _doprnt" >&5
  if eval "test \"`echo '$''{'ac_cv_func__doprnt'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
    cat > conftest.$ac_ext <<EOF
-#line 5902 "configure"
+#line 6634 "configure"
  #include "confdefs.h"
  /* System header to define __stub macros and hopefully few prototypes,
      which can conflict with char _doprnt(); below.  */
@@ -5921,7 +6653,7 @@ _doprnt();
  
  ; return 0; }
  EOF
-if { (eval echo configure:5925: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:6657: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
    rm -rf conftest*
    eval "ac_cv_func__doprnt=yes"
  else
@@ -5990,18 +6722,19 @@ esac
  # Substitute things in output and header files.
  ########################################################################
  SUFFIX=""
+GMXLIB_COND_OBJ=""
  
  if test "$enable_mpi" = "yes"; then
         cat >> confdefs.h <<\EOF
  #define USE_MPI 
  EOF
  
-        PAR_OBJ='${mpi_obj}'
+        GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} mpiio.lo"
          if test "$enable_mpi_suffix" = "yes"; then
           SUFFIX="_mpi"
         fi
  else
-       PAR_OBJ='${libnet_obj}' 
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} libnet.lo"
  fi
  
  if test "$enable_vector" = "yes"; then
@@ -6016,7 +6749,12 @@ if test "$enable_fortran" = "yes"; then
  #define USE_FORTRAN 
  EOF
  
-       INNER_F77_OBJ='${inner_f77_obj}'
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} innerf.lo f77_wrappers.lo"
+       if test "$enable_float" = "yes"; then
+          MDLIB_COND_OBJ="flincs.lo fsettle.lo fshake.lo"
+        else
+          MDLIB_COND_OBJ="flincsd.lo fsettled.lo fshaked.lo"
+       fi
         
  
  if true; then
@@ -6027,7 +6765,8 @@ else
    USE_FORTRAN_FALSE=
  fi
  else
-       INNER_C_OBJ='${inner_c_obj}'
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} innerc.lo"
+        MDLIB_COND_OBJ="clincs.lo csettle.lo cshake.lo"
         
  
  if false; then
@@ -6039,7 +6778,7 @@ else
  fi
  fi
  
-if test "$enable_double" = "yes"; then
+if test "$enable_float" = "no"; then
         cat >> confdefs.h <<\EOF
  #define DOUBLE 
  EOF
@@ -6089,28 +6828,16 @@ EOF
  
  fi
  
-if test "$enable_sse" = "yes"; then
+if test "$enable_x86_asm" = "yes"; then
         cat >> confdefs.h <<\EOF
  #define USE_SSE 
  EOF
  
-       SSE_OBJ='${sse_obj}'
-fi
-
-if test "$enable_3dnow" = "yes"; then
-       cat >> confdefs.h <<\EOF
-#define USE_3DNOW 
-EOF
-
-       TDN_OBJ='${tdn_obj}'
-fi
-
-if test "$enable_sse" = "yes" -o "$enable_3dnow" = "yes"; then
-       X86_ASM_OBJ='${x86_asm_obj}'    
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} x86_cpuid.lo x86_sse.lo x86_3dnow.lo"
  fi
  
  if test "$motif_includes" != "none" -a "$motif_libraries" != "none"; then
-       MOTIF_OBJ='${motif_obj}'
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} mgmx.lo widget.lo"
  fi
  
  if test "$enable_xdr" = "yes"; then
@@ -6118,9 +6845,9 @@ if test "$enable_xdr" = "yes"; then
  #define USE_XDR 
  EOF
  
-       XDR_OBJ='${xdr_obj}'
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} libxdrf.lo ftocstr.lo"
  else
-       XDR_OBJ='${noxdr_obj}'
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} dumxdrf.lo"
  fi
  
  if test "$enable_softwaresqrt" = "yes"; then
@@ -6297,18 +7024,6 @@ EOF
  
  fi
  
-
-
-
-
-
-
-
-
- # not used right now
-
-
-
  # Check if there are any optimizations and options for this arch and cpu
  
  
@@ -6320,6 +7035,8 @@ fi
  # determine our suggested choices for both C and fortran, and then possibly
  # override them with user choices.
  
+cc_vendor="unknown"
+
  case "${host_cpu}-${host_os}" in
  
    *-solaris2*) 
@@ -6490,6 +7207,7 @@ case "${host_cpu}-${host_os}" in
      esac
      if $CC -V 2>  /dev/null | grep Compaq > /dev/null 2>&1; then
        xCFLAGS="$tmpCFLAGS"
+      cc_vendor="Compaq"
      fi
      if test "$enable_fortran" = "yes"; then
        if $F77 -V 2>  /dev/null | grep Compaq > /dev/null 2>&1; then
@@ -6513,7 +7231,7 @@ case "${host_cpu}-${host_os}" in
        xCFLAGS="$pgiopt -fast -Minfo=loop -pc 32"
      fi
      if test "$enable_fortran" = "yes"; then
-      if $F77 -V 2>  /dev/null | grep Portland /dev/null 2>&1; then
+      if $F77 -V 2>  /dev/null | grep Portland > /dev/null 2>&1; then
         xFFLAGS="$xCFLAGS -Mneginfo=loop"
        fi       
      fi
@@ -6528,7 +7246,7 @@ if test $ac_cv_prog_gcc = yes; then
    
  
  echo $ac_n "checking whether $CC accepts -malign-double""... $ac_c" 1>&6
-echo "configure:6532: checking whether $CC accepts -malign-double" >&5
+echo "configure:7250: checking whether $CC accepts -malign-double" >&5
  if eval "test \"`echo '$''{'ac_align_double'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -6562,7 +7280,17 @@ if test $enable_fortran = yes; then
  fi
  
  CPU_FLAGS=""
+
  if test "$GCC" = "yes"; then
+  
+
+if true; then
+  GNU_CC_TRUE=
+  GNU_CC_FALSE='#'
+else
+  GNU_CC_TRUE='#'
+  GNU_CC_FALSE=
+fi
    # try to guess correct CPU flags, at least for linux
    case "${host_cpu}" in
      # i586/i686 cpu flags don't improve speed, thus no need to use them.
@@ -6575,7 +7303,7 @@ if test "$GCC" = "yes"; then
         
  
  echo $ac_n "checking whether $CC accepts -mcpu=$cputype""... $ac_c" 1>&6
-echo "configure:6579: checking whether $CC accepts -mcpu=$cputype" >&5
+echo "configure:7307: checking whether $CC accepts -mcpu=$cputype" >&5
  if eval "test \"`echo '$''{'ac_m_cpu_60x'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -6602,7 +7330,7 @@ fi
          
  
  echo $ac_n "checking whether $CC accepts -mcpu=750""... $ac_c" 1>&6
-echo "configure:6606: checking whether $CC accepts -mcpu=750" >&5
+echo "configure:7334: checking whether $CC accepts -mcpu=750" >&5
  if eval "test \"`echo '$''{'ac_m_cpu_750'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -6630,7 +7358,7 @@ fi
          
  
  echo $ac_n "checking whether $CC accepts -mcpu=powerpc""... $ac_c" 1>&6
-echo "configure:6634: checking whether $CC accepts -mcpu=powerpc" >&5
+echo "configure:7362: checking whether $CC accepts -mcpu=powerpc" >&5
  if eval "test \"`echo '$''{'ac_m_cpu_powerpc'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -6658,7 +7386,7 @@ fi
         
  
  echo $ac_n "checking whether $CC accepts -mpowerpc""... $ac_c" 1>&6
-echo "configure:6662: checking whether $CC accepts -mpowerpc" >&5
+echo "configure:7390: checking whether $CC accepts -mpowerpc" >&5
  if eval "test \"`echo '$''{'ac_m_powerpc'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -6683,6 +7411,16 @@ fi
  
        fi
     esac
+else
+  
+
+if false; then
+  GNU_CC_TRUE=
+  GNU_CC_FALSE='#'
+else
+  GNU_CC_TRUE='#'
+  GNU_CC_FALSE=
+fi
  fi
  
  if test -n "$CPU_FLAGS"; then
@@ -6703,14 +7441,14 @@ if test "$ac_test_CFLAGS" != "set"; then
      echo "*******************************************************************"
      echo "* WARNING: No special optimization settings found for the C       *"
      echo "* compiler. Use  make CFLAGS=..., or edit the top level Makefile. *"
-    echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it)  *"
+    echo "* Reverting to the default setting CFLAGS=-O3. (mail us about it!)*"
      echo "*******************************************************************"
      CFLAGS="-O3"
    fi
    
  
  echo $ac_n "checking whether $CC accepts ${CFLAGS}""... $ac_c" 1>&6
-echo "configure:6714: checking whether $CC accepts ${CFLAGS}" >&5
+echo "configure:7452: checking whether $CC accepts ${CFLAGS}" >&5
  if eval "test \"`echo '$''{'ac_guessed_cflags'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -6748,19 +7486,18 @@ fi
  if test "$enable_fortran" = "yes"; then        
    if test "$ac_test_FFLAGS" != "set"; then
      FFLAGS="$xFFLAGS"
-    
      if test -z "$FFLAGS"; then
        echo "*******************************************************************"
        echo "* WARNING: No special optimization settings found for the fortran *"
        echo "* compiler. Use  make FFLAGS=..., or edit the top level Makefile. *"
-      echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it) *"
+      echo "* Reverting to the default setting FFLAGS=-O3. (mail us about it!)*"
        echo "*******************************************************************"
        FFLAGS="-O3"
      fi
      
  
  echo $ac_n "checking whether $F77 accepts ${FFLAGS}""... $ac_c" 1>&6
-echo "configure:6764: checking whether $F77 accepts ${FFLAGS}" >&5
+echo "configure:7501: checking whether $F77 accepts ${FFLAGS}" >&5
  if eval "test \"`echo '$''{'ac_guessed_fflags'+set}'`\" = set"; then
    echo $ac_n "(cached) $ac_c" 1>&6
  else
@@ -6799,9 +7536,16 @@ fi
      echo "******************************************"
    fi
  fi
+  
  
            # should be automatic, but doesnt seem to be?
  
+
+
+
+
+
+
  # put binaries and libraries in subdirectories named as the arch
  if test -n "$gmxcpu"; then
    bindir="\${exec_prefix}/bin/${host}/${gmxcpu}"
@@ -6814,7 +7558,6 @@ fi
  
  
  
-
  trap '' 1 2 15
  cat > confcache <<\EOF
  # This file is a shell script that caches the results of configure
@@ -6981,13 +7724,7 @@ s%@AUTOCONF@%$AUTOCONF%g
  s%@AUTOMAKE@%$AUTOMAKE%g
  s%@AUTOHEADER@%$AUTOHEADER%g
  s%@MAKEINFO@%$MAKEINFO%g
-s%@AMTAR@%$AMTAR%g
-s%@install_sh@%$install_sh%g
-s%@AWK@%$AWK%g
  s%@SET_MAKE@%$SET_MAKE%g
-s%@AMDEP@%$AMDEP%g
-s%@AMDEPBACKSLASH@%$AMDEPBACKSLASH%g
-s%@DEPDIR@%$DEPDIR%g
  s%@host@%$host%g
  s%@host_alias@%$host_alias%g
  s%@host_cpu@%$host_cpu%g
@@ -6996,17 +7733,23 @@ s%@host_os@%$host_os%g
  s%@F77@%$F77%g
  s%@CC@%$CC%g
  s%@CPP@%$CPP%g
-s%@_am_include@%$_am_include%g
-s%@CCDEPMODE@%$CCDEPMODE%g
  s%@BUILD_CC@%$BUILD_CC%g
  s%@FLIBS@%$FLIBS%g
  s%@MPICC@%$MPICC%g
  s%@USE_MPI_TRUE@%$USE_MPI_TRUE%g
  s%@USE_MPI_FALSE@%$USE_MPI_FALSE%g
-s%@NASM@%$NASM%g
-s%@NASMFLAGS@%$NASMFLAGS%g
  s%@IDENT@%$IDENT%g
+s%@LN_S@%$LN_S%g
+s%@build@%$build%g
+s%@build_alias@%$build_alias%g
+s%@build_cpu@%$build_cpu%g
+s%@build_vendor@%$build_vendor%g
+s%@build_os@%$build_os%g
+s%@OBJEXT@%$OBJEXT%g
+s%@EXEEXT@%$EXEEXT%g
  s%@RANLIB@%$RANLIB%g
+s%@STRIP@%$STRIP%g
+s%@LIBTOOL@%$LIBTOOL%g
  s%@GMX_USE_XDR_TRUE@%$GMX_USE_XDR_TRUE%g
  s%@GMX_USE_XDR_FALSE@%$GMX_USE_XDR_FALSE%g
  s%@X_CFLAGS@%$X_CFLAGS%g
@@ -7020,18 +7763,13 @@ s%@USE_FORTRAN_TRUE@%$USE_FORTRAN_TRUE%g
  s%@USE_FORTRAN_FALSE@%$USE_FORTRAN_FALSE%g
  s%@USE_DOUBLE_TRUE@%$USE_DOUBLE_TRUE%g
  s%@USE_DOUBLE_FALSE@%$USE_DOUBLE_FALSE%g
-s%@PAR_OBJ@%$PAR_OBJ%g
-s%@INNER_F77_OBJ@%$INNER_F77_OBJ%g
-s%@INNER_C_OBJ@%$INNER_C_OBJ%g
-s%@SSE_OBJ@%$SSE_OBJ%g
-s%@TDN_OBJ@%$TDN_OBJ%g
-s%@X86_ASM_OBJ@%$X86_ASM_OBJ%g
-s%@MOTIF_OBJ@%$MOTIF_OBJ%g
-s%@XDR_OBJ@%$XDR_OBJ%g
-s%@AXP_ASM_OBJ@%$AXP_ASM_OBJ%g
+s%@GNU_CC_TRUE@%$GNU_CC_TRUE%g
+s%@GNU_CC_FALSE@%$GNU_CC_FALSE%g
+s%@INCLUDES@%$INCLUDES%g
+s%@GMXLIB_COND_OBJ@%$GMXLIB_COND_OBJ%g
+s%@MDLIB_COND_OBJ@%$MDLIB_COND_OBJ%g
  s=XXX_SUFFIX_XXX=$SUFFIX=g
  s%@SUFFIX@%$SUFFIX%g
-s%@INCLUDES@%$INCLUDES%g
  
  CEOF
  EOF
@@ -7264,60 +8002,11 @@ fi; done
  
  EOF
  cat >> $CONFIG_STATUS <<EOF
-am_indx=1
-  for am_file in src/include/config.h; do
-    case " $CONFIG_HEADERS " in
-    *" $am_file "*)
-      echo timestamp > `echo $am_file | sed 's%:.*%%;s%[^/]*$%%'`stamp-h$am_indx
-      ;;
-    esac
-    am_indx=\`expr \$am_indx + 1\`
-  done
-AMDEP="$AMDEP"
-ac_aux_dir="$ac_aux_dir"
+
  
  EOF
  cat >> $CONFIG_STATUS <<\EOF
-test -z "$CONFIG_HEADERS" || echo timestamp >     src/include/stamp-h
-
-test x"$AMDEP" != x"" ||
-for mf in $CONFIG_FILES; do
-  case "$mf" in
-  Makefile) dirpart=.;;
-  */Makefile) dirpart=`echo "$mf" | sed -e 's|/[^/]*$||'`;;
-  *) continue;;
-  esac
-  grep '^DEP_FILES *= *[^ #]' < "$mf" > /dev/null || continue
-  # Extract the definition of DEP_FILES from the Makefile without
-  # running `make'.
-  DEPDIR=`sed -n -e '/^DEPDIR = / s///p' < "$mf"`
-  test -z "$DEPDIR" && continue
-  # When using ansi2knr, U may be empty or an underscore; expand it
-  U=`sed -n -e '/^U = / s///p' < "$mf"`
-  test -d "$dirpart/$DEPDIR" || mkdir "$dirpart/$DEPDIR"
-  # We invoke sed twice because it is the simplest approach to
-  # changing $(DEPDIR) to its actual value in the expansion.
-  for file in `sed -n -e '
-    /^DEP_FILES = .*\\\\$/ {
-      s/^DEP_FILES = //
-      :loop
-       s/\\\\$//
-       p
-       n
-       /\\\\$/ b loop
-      p
-    }
-    /^DEP_FILES = / s/^DEP_FILES = //p' < "$mf" | \
-       sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
-    # Make sure the directory exists.
-    test -f "$dirpart/$file" && continue
-    fdir=`echo "$file" | sed -e 's|/[^/]*$||'`
-    $ac_aux_dir/mkinstalldirs "$dirpart/$fdir" > /dev/null 2>&1
-    # echo "creating $dirpart/$file"
-    echo '# dummy' > "$dirpart/$file"
-  done
-done
-
+test -z "$CONFIG_HEADERS" || echo timestamp > src/include/stamp-h
  
  exit 0
  EOF
@@ -7327,7 +8016,7 @@ test "$no_create" = yes || ${CONFIG_SHELL-/bin/sh} $CONFIG_STATUS || exit 1
  
  
  echo ""
-echo "Gromacs is ready to compile. Summary of options used:"
+echo "GROMACS is ready to compile. Summary of main options:"
  echo "Architecture                  : $host"
  if test "$enable_cpu_detection" = "yes"; then
  if test -n "$gmxcpu"; then
@@ -7336,35 +8025,30 @@ else
  echo "(Extra CPU detection not necessary or unavailable on this host)" 
  fi
  fi
+echo "Vector architecture           : $enable_vector"
  echo "MPI parallelization           : $enable_mpi"
  if test "$enable_mpi" = "yes"; then
  echo "Checking MPI environment      : $with_mpi_environment"
  echo "MPI suffix on files           : $enable_mpi_suffix"
  fi
-echo "Vector architecture           : $enable_vector"
  echo "Using Fortran code            : $enable_fortran"
-echo "Double precision              : $enable_double"
-if test "$enable_double" = "yes"; then
-echo "Type suffix on files          : $enable_type_suffix"
+echo "Single precision              : $enable_float"
+if test "$enable_float" = "no"; then
+echo "Suffix on double prec. files  : $enable_type_suffix"
  fi
-echo "Expanding water loops         : $enable_simplewater"
-echo "Using water-water loops       : $enable_waterwater_loops"
  echo "Automatically nice mdrun      : $enable_nice"
-echo "Using x86 SSE assembly        : $enable_sse"
-echo "Using x86 3DNow assembly      : $enable_3dnow"
-echo "Portable trajectories (xdr)   : $enable_xdr"
+echo "Using x86 SSE/3DNow assembly  : $enable_x86_asm"
  echo "Software 1/x                  : $enable_softwarerecip"
  echo "Software 1/sqrt(x)            : $enable_softwaresqrt"
  echo "Vectorize 1/x                 : $enable_vectorized_recip"
  echo "Vectorize 1/sqrt(x)           : $list_of_vectorized_sqrt"
  echo "Prefetch coordinates in loops : $list_of_prefetch_x"
  echo "Prefetch forces in loops      : $list_of_prefetch_f"
-echo "Hide square latency           : $enable_hide_square_latency"
-echo "Hide table lookup latency     : $enable_hide_table_latency"
-echo "Using X11                     : $use_x11"
+echo "X11 support                   : $use_x11"
  echo "Motif support                 : $use_motif"
  echo ""
  echo "GROMACS will be installed under $prefix"
  echo "Make sure to update your PATH and MANPATH to find the"
-echo "programs and unix manual pages."
+echo "programs and unix manual pages, and possibly LD_LIBRARY_PATH"
+echo "or /etc/ld.so.conf if you are using dynamic libraries"
  
diff --git a/configure.in b/configure.in

index 59193d4dd0a219d2469b36780bbc326122ca0df1..18838b452c218ce68bb2629cbacd972b7276b45a 100644 (file)
--- a/configure.in
+++ b/configure.in
@@ -3,8 +3,11 @@
  #######################################################################
  AC_INIT(src/gmxlib/3dview.c)
  AC_PREREQ(2.13)
-AC_CONFIG_AUX_DIR(./config)
+AC_CONFIG_AUX_DIR(config)
  AM_INIT_AUTOMAKE(gromacs, 3.0)
+dnl This is the version info according to the libtool versioning system.
+dnl It does *not* correspond to the release number.
+SHARED_VERSION_INFO="1:0:0"
  AC_PREFIX_DEFAULT(/usr/local/gromacs)
  AM_CONFIG_HEADER(src/include/config.h)
  
@@ -15,7 +18,7 @@ AM_CONFIG_HEADER(src/include/config.h)
  #####
  
  AC_ARG_ENABLE(mpi,     
- [  --enable-mpi                  Compile parallel version of Gromacs], 
+ [  --enable-mpi                  Compile parallel version of GROMACS], 
   enable_mpi=$enableval, enable_mpi=no)
  
  
@@ -29,15 +32,15 @@ AC_ARG_ENABLE(vector,
  #####
  
  AC_ARG_ENABLE(fortran, 
- [  --enable-fortran              Dortran loops (default on sgi,ibm,sun,tru64/dec)], 
+ [  --enable-fortran              Fortran loops (default on sgi,ibm,sun,axp)], 
   enable_fortran=$enableval,enable_fortran=check)
  
  
  #####
  
-AC_ARG_ENABLE(double,  
- [  --enable-double               Compile double precision Gromacs], 
- enable_double=$enableval, enable_double=no)
+AC_ARG_ENABLE(float,  
+ [  --enable-float                Compile single precision GROMACS], 
+ enable_float=$enableval, enable_float=yes)
  
  
  #####
@@ -53,7 +56,6 @@ AC_ARG_ENABLE(mpi-suffix,
   [  --enable-mpi-suffix           Add a suffix to MPI files (default on ibm)], 
   enable_mpi_suffix=$enableval, enable_mpi_suffix=check)
  
-
  #####
  
  AC_ARG_ENABLE(simplewater,     
@@ -76,16 +78,9 @@ AC_ARG_ENABLE(nice,
  
  #####
  
-AC_ARG_ENABLE(sse,     
- [  --disable-sse                 Disable SSE assembly loops on x86], 
- enable_sse=$enableval, enable_sse=yes)
-
-
-#####
-
-AC_ARG_ENABLE(3dnow,     
- [  --disable-3dnow               Disable 3DNow assembly loops on x86], 
- enable_3dnow=$enableval, enable_3dnow=yes)
+AC_ARG_ENABLE(x86_asm,     
+ [  --disable-x86-asm             Disable assembly loops on x86], 
+ enable_x86_asm=$enableval, enable_x86_asm=yes)
  
  
  #####
@@ -180,7 +175,7 @@ esac
  if test "$enable_fortran" = "check"; then
  case "${host_cpu}-${host_os}" in
  
-  sparc*-solaris* | alpha*-osf* | rs6000*-aix* | mips*-irix*)
+  sparc*-solaris* | alpha*-* | rs6000*-aix* | mips*-irix*)
      enable_fortran=yes 
      ;;
  
@@ -261,19 +256,31 @@ case "${host_cpu}" in
  esac
  
  if test "$enable_fortran" = "yes"; then
-  # vendor f77 before g77
-  AC_CHECK_PROGS(F77, f77 xlf xlf77 cf77 fl32 g77 fort77 f90 xlf90 pgf77 cf77 fort fort77 pgf90)
+  # vendor f77 before g77 - but special compiler list for alpha-linux
+  case "${host_cpu}-${host_os}" in
+    alpha*-linux*)
+      AC_CHECK_PROGS(F77, fort f77 g77)
+      ;;
+    *)   
+      AC_CHECK_PROGS(F77, f77 xlf xlf77 cf77 fl32 g77 fort77 f90 xlf90 pgf77 cf77 fort fort77 pgf90)
+      ;;
+  esac
    AC_PROG_F77
    if test -z "$F77"; then
      AC_MSG_ERROR([No fortran compiler found])
    fi
  fi
  
-# Checks for programs.
-AC_PROG_MAKE_SET
-
-AC_CHECK_PROG(CC, cc, cc) # do vendor cc before gcc
+case "${host_cpu}-${host_os}" in
+    alpha*-linux*)
+      AC_CHECK_PROGS(CC, ccc cc) # do vendor cc before gcc
+      ;;
+    *)   
+      AC_CHECK_PROG(CC, cc, cc) # do vendor cc before gcc
+      ;;
+esac
  AC_PROG_CC
+AC_PROG_CPP
  BUILD_CC=$CC
  AC_SUBST(BUILD_CC)
  
@@ -283,7 +290,7 @@ if test "$enable_fortran" = "yes"; then
  fi
  
  # if we are using mpi, also get an MPICC. We cannot set that in the PROG_CC macro
-# above, since the autoconf checks that the created file can be executed. This would
+# above, since autoconf checks that the created file can be executed. This would
  # fail on platforms where MPI executables can only be run through a batchqueue.
  
  if test "$enable_mpi" = "yes"; then
@@ -299,65 +306,35 @@ else
    AM_CONDITIONAL(USE_MPI,false)
  fi
  
-AC_PROG_INSTALL
-AC_PROG_CPP
-
-# A rather complicated check for the nasm program and x86 assembly capabilities
-# to run under windows we must insert a test a change nasm "-f elf" to "-f win32"
+# A rather complicated check for the capabilities of as, to make
+# sure we can compile the assembly innerloops.
  if test "$x86" = "yes"; then
-  if [ test "$enable_sse" = "yes" -o "$enable_3dnow" = "yes"]; then
-    if test "$enable_double" = "yes"; then
-      AC_MSG_WARN([SSE/3Dnow assembly can only be used in single precision])
-      enable_sse=no
-      enable_3dnow=no
+  if [ test "$enable_x86_asm" = "yes"]; then
+    if test "$enable_float" = "no"; then
+      AC_MSG_WARN([The assembly loops can only be used in single precision - disabling])
+      enable_x86_asm=no
      else 
-      AC_PATH_PROG(NASM,nasm,no)
-      NASMFLAGS="-f elf" 
-      AC_SUBST(NASMFLAGS)
-      if test "$NASM" = "no"; then 
-         AC_MSG_ERROR([Nasm is required for SSE and 3DNow loops.])
-      fi
-      if test "$enable_sse" = "yes"; then
-        AC_MSG_CHECKING([whether nasm supports SSE instructions])
-cat > conftest_sse.s << EOF
-       global checksse 
-checksse:
+      AC_MSG_CHECKING([whether as fully supports intel syntax])
+cat > conftest.s << EOF
+.intel_syntax noprefix 
+checkasm:
         emms
-       xorps xmm0,xmm0
-       emms
-       ret
-EOF
-        if AC_TRY_COMMAND($NASM conftest_sse.s); then
-         AC_MSG_RESULT([yes])
-        else
-         AC_MSG_RESULT([no]) 
-         AC_MSG_ERROR([Download a patched nasm from the Gromacs homepage,]     
-                       [or disable SSE assembly.])
-        fi
-      fi       
-      if test "$enable_3dnow" = "yes"; then
-        AC_MSG_CHECKING([whether nasm supports extended 3DNow instructions])
-cat > conftest_3dnow.s << EOF
-       global check3dnow       
-check3dnow:    
-       femms
         pswapd mm0,mm0
-       femms
+       movups xmm0,[checkasm]
+       emms
         ret
  EOF
-        if AC_TRY_COMMAND([$NASM -f elf conftest_3dnow.s]); then
-         AC_MSG_RESULT([yes])
-        else
-         AC_MSG_RESULT([no])
-         AC_MSG_ERROR([Download a patched nasm from the Gromacs homepage,]
-                       [or disable 3DNow assembly.])
-        fi
+      if AC_TRY_COMMAND($CC -c conftest.s); then
+       AC_MSG_RESULT([yes])
+      else
+        AC_MSG_RESULT([no]) 
+       AC_MSG_ERROR([Upgrade to binutils>=2.11, download the as executable]     
+                    [from www.gromacs.org, or disable assembly loops.])
        fi
-    fi
+    fi 
    fi
  else # not x86
-enable_sse=no
-enable_3dnow=no
+enable_x86_asm=no
  fi
  
  AC_PATH_PROG(IDENT,ident,no)
@@ -371,7 +348,9 @@ if test "$IDENT" != "no"; then
      AC_MSG_RESULT([no]))
  fi
  
-AC_PROG_RANLIB
+
+AC_PROG_LN_S
+AM_PROG_LIBTOOL
  AC_CHECK_FUNCS(strcasecmp)
  AC_CHECK_FUNCS(strdup)
  
@@ -410,10 +389,10 @@ if test "${host_vendor}" = "ibm"; then
    fi
  fi 
  
-if test "$enable_double" = "yes"; then
-  precision=8
-else
+if test "$enable_float" = "yes"; then
    precision=4
+else
+  precision=8
  fi
  
  if test "$enable_mpi" = "yes"; then
@@ -438,9 +417,9 @@ esac
  
  ######
  if test "$enable_xdr" = "no"; then
-  AC_MSG_WARN([* Not using XDR cripples Gromacs significantly. You won't be able to *]
-              [* read or write any hardware independent or compressed trajectories. *]
-              [* We strongly suggest you try to locate the RPC routines instead!    *])
+  AC_MSG_WARN([* Not using XDR cripples GROMACS significantly. You won't be able to *]
+              [* read or write any compressed trajectories. You have no choice on   *]
+              [* windows, but if you run UNIX locate the RPC files - you have them! *])
  else
  # check for xtc headers
    AC_CHECK_HEADERS(rpc/rpc.h rpc/xdr.h,,AC_MSG_ERROR([RPC/XDR include headers not found]))
@@ -519,15 +498,16 @@ esac
  # Substitute things in output and header files.
  ########################################################################
  SUFFIX=""
+GMXLIB_COND_OBJ=""
  
  if test "$enable_mpi" = "yes"; then
-       AC_DEFINE(USE_MPI,,[Make a parallel version of Gromacs using MPI])
-        PAR_OBJ='${mpi_obj}'
+       AC_DEFINE(USE_MPI,,[Make a parallel version of GROMACS using MPI])
+        GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} mpiio.lo"
          if test "$enable_mpi_suffix" = "yes"; then
           SUFFIX="_mpi"
         fi
  else
-       PAR_OBJ='${libnet_obj}' 
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} libnet.lo"
  fi
  
  if test "$enable_vector" = "yes"; then
@@ -536,14 +516,20 @@ fi
  
  if test "$enable_fortran" = "yes"; then
         AC_DEFINE(USE_FORTRAN,,[Use Fortran for innerloops and some other core stuff])
-       INNER_F77_OBJ='${inner_f77_obj}'
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} innerf.lo f77_wrappers.lo"
+       if test "$enable_float" = "yes"; then
+          MDLIB_COND_OBJ="flincs.lo fsettle.lo fshake.lo"
+        else
+          MDLIB_COND_OBJ="flincsd.lo fsettled.lo fshaked.lo"
+       fi
         AM_CONDITIONAL(USE_FORTRAN,true)
  else
-       INNER_C_OBJ='${inner_c_obj}'
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} innerc.lo"
+        MDLIB_COND_OBJ="clincs.lo csettle.lo cshake.lo"
         AM_CONDITIONAL(USE_FORTRAN,false)
  fi
  
-if test "$enable_double" = "yes"; then
+if test "$enable_float" = "no"; then
         AC_DEFINE(DOUBLE,,[Compile in double precision])
          if test "$enable_type_suffix" = "yes"; then
           SUFFIX="${SUFFIX}_d"
@@ -565,37 +551,28 @@ if test "$enable_nice" = "no"; then
         AC_DEFINE(NO_NICE,,[Turn off the automatic nicing of gromacs])
  fi
  
-if test "$enable_sse" = "yes"; then
-       AC_DEFINE(USE_SSE,,[Use x86 assembly with SSE instructions])
-       SSE_OBJ='${sse_obj}'
-fi
-
-if test "$enable_3dnow" = "yes"; then
-       AC_DEFINE(USE_3DNOW,,[Use x86 assembly with 3DNow instructions])
-       TDN_OBJ='${tdn_obj}'
-fi
-
-if [test "$enable_sse" = "yes" -o "$enable_3dnow" = "yes"]; then
-       X86_ASM_OBJ='${x86_asm_obj}'    
+if test "$enable_x86_asm" = "yes"; then
+       AC_DEFINE(USE_SSE,,[Use x86 assembly loops])
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} x86_cpuid.lo x86_sse.lo x86_3dnow.lo"
  fi
  
  if [test "$motif_includes" != "none" -a "$motif_libraries" != "none"]; then
-       MOTIF_OBJ='${motif_obj}'
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} mgmx.lo widget.lo"
  fi
  
  if test "$enable_xdr" = "yes"; then
         AC_DEFINE(USE_XDR,,[Use xdr routines to make trajectories portable])
-       XDR_OBJ='${xdr_obj}'
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} libxdrf.lo ftocstr.lo"
  else
-       XDR_OBJ='${noxdr_obj}'
+       GMXLIB_COND_OBJ="${GMXLIB_COND_OBJ} dumxdrf.lo"
  fi
  
  if test "$enable_softwaresqrt" = "yes"; then
-       AC_DEFINE(SOFTWARE_SQRT,,[Use the Gromacs software 1/sqrt(x)])
+       AC_DEFINE(SOFTWARE_SQRT,,[Use the GROMACS software 1/sqrt(x)])
  fi
  
  if test "$enable_softwarerecip" = "yes"; then
-       AC_DEFINE(SOFTWARE_RECIP,,[Use the Gromacs software 1/x])
+       AC_DEFINE(SOFTWARE_RECIP,,[Use the GROMACS software 1/x])
  fi
  
  if test "$enable_hide_square_latency" = "yes"; then
@@ -705,22 +682,16 @@ else
    AC_DEFINE_UNQUOTED(GMXLIBDIR,"${prefix}/top",[Default topology file location])
  fi
  
-AC_SUBST(PAR_OBJ)
-AC_SUBST(INNER_F77_OBJ)
-AC_SUBST(INNER_C_OBJ)
-AC_SUBST(SSE_OBJ)
-AC_SUBST(TDN_OBJ)
-AC_SUBST(X86_ASM_OBJ)
-AC_SUBST(MOTIF_OBJ)
-AC_SUBST(XDR_OBJ)
-AC_SUBST(AXP_ASM_OBJ) # not used right now
-ACX_SUBST_XXX(SUFFIX)
-AC_SUBST(SUFFIX)
-
  # Check if there are any optimizations and options for this arch and cpu
  ACX_COMPILER_MAXOPT
  AC_SUBST(INCLUDES)          # should be automatic, but doesnt seem to be?
  
+AC_SUBST(GMXLIB_COND_OBJ)
+AC_SUBST(MDLIB_COND_OBJ)
+ACX_SUBST_XXX(SUFFIX)
+AC_SUBST(SUFFIX)
+
+
  # put binaries and libraries in subdirectories named as the arch
  if test -n "$gmxcpu"; then
    bindir="\${exec_prefix}/bin/${host}/${gmxcpu}"
@@ -733,7 +704,6 @@ fi
  AC_SUBST(bindir)
  AC_SUBST(libdir)
  
-
  AC_OUTPUT([Makefile
                   src/Makefile
                   src/gmxlib/Makefile
@@ -760,7 +730,7 @@ AC_OUTPUT([Makefile
                   man/Makefile])
  
  echo ""
-echo "Gromacs is ready to compile. Summary of options used:"
+echo "GROMACS is ready to compile. Summary of main options:"
  echo "Architecture                  : $host"
  if test "$enable_cpu_detection" = "yes"; then
  if test -n "$gmxcpu"; then
@@ -769,35 +739,30 @@ else
  echo "(Extra CPU detection not necessary or unavailable on this host)" 
  fi
  fi
+echo "Vector architecture           : $enable_vector"
  echo "MPI parallelization           : $enable_mpi"
  if test "$enable_mpi" = "yes"; then
  echo "Checking MPI environment      : $with_mpi_environment"
  echo "MPI suffix on files           : $enable_mpi_suffix"
  fi
-echo "Vector architecture           : $enable_vector"
  echo "Using Fortran code            : $enable_fortran"
-echo "Double precision              : $enable_double"
-if test "$enable_double" = "yes"; then
-echo "Type suffix on files          : $enable_type_suffix"
+echo "Single precision              : $enable_float"
+if test "$enable_float" = "no"; then
+echo "Suffix on double prec. files  : $enable_type_suffix"
  fi
-echo "Expanding water loops         : $enable_simplewater"
-echo "Using water-water loops       : $enable_waterwater_loops"
  echo "Automatically nice mdrun      : $enable_nice"
-echo "Using x86 SSE assembly        : $enable_sse"
-echo "Using x86 3DNow assembly      : $enable_3dnow"
-echo "Portable trajectories (xdr)   : $enable_xdr"
+echo "Using x86 SSE/3DNow assembly  : $enable_x86_asm"
  echo "Software 1/x                  : $enable_softwarerecip"
  echo "Software 1/sqrt(x)            : $enable_softwaresqrt"
  echo "Vectorize 1/x                 : $enable_vectorized_recip"
  echo "Vectorize 1/sqrt(x)           : $list_of_vectorized_sqrt"
  echo "Prefetch coordinates in loops : $list_of_prefetch_x"
  echo "Prefetch forces in loops      : $list_of_prefetch_f"
-echo "Hide square latency           : $enable_hide_square_latency"
-echo "Hide table lookup latency     : $enable_hide_table_latency"
-echo "Using X11                     : $use_x11"
+echo "X11 support                   : $use_x11"
  echo "Motif support                 : $use_motif"
  echo ""
  echo "GROMACS will be installed under $prefix"
  echo "Make sure to update your PATH and MANPATH to find the"
-echo "programs and unix manual pages."
+echo "programs and unix manual pages, and possibly LD_LIBRARY_PATH"
+echo "or /etc/ld.so.conf if you are using dynamic libraries"
  
diff --git a/include/config.h.in b/include/config.h.in

index 4b15bde1267db249d48f79768a59ac734af0d856..e29b6245f7c53a84054c6b37eb1a02ccc101e2f2 100644 (file)
--- a/include/config.h.in
+++ b/include/config.h.in
@@ -111,7 +111,7 @@
  /* Use motif/lesstif libraries */
  #undef HAVE_MOTIF
  
-/* Make a parallel version of Gromacs using MPI */
+/* Make a parallel version of GROMACS using MPI */
  #undef USE_MPI
  
  /* Optimize for a vector architecture */
@@ -132,19 +132,16 @@
  /* Turn off the automatic nicing of gromacs */
  #undef NO_NICE
  
-/* Use x86 assembly with SSE instructions */
+/* Use x86 assembly loops */
  #undef USE_SSE
  
-/* Use x86 assembly with 3DNow instructions */
-#undef USE_3DNOW
-
  /* Use xdr routines to make trajectories portable */
  #undef USE_XDR
  
-/* Use the Gromacs software 1/sqrt(x) */
+/* Use the GROMACS software 1/sqrt(x) */
  #undef SOFTWARE_SQRT
  
-/* Use the Gromacs software 1/x */
+/* Use the GROMACS software 1/x */
  #undef SOFTWARE_RECIP
  
  /* Try to get coordinates to cache before using them */
diff --git a/include/copyrite.h b/include/copyrite.h

index 88801f9399c7696c1ff4aa5e1bb0e6ba786d242a..430f9e13db06741bee077a3c094eb270b71db027 100644 (file)
--- a/include/copyrite.h
+++ b/include/copyrite.h
@@ -59,8 +59,13 @@ char *GromacsVersion();
  static char *CopyrightText[] = {
    "",
    "Copyright (c) 1991-2001",
-  "BIOSON Research Institute, Dept. of Biophysical Chemistry",
-  "University of Groningen, The Netherlands",
+  "Dept. of Biophysical Chemistry, University of Groningen, The Netherlands",
+  "For additional resources, check out http://www.gromacs.org",
+  "",
+  "This program is free software; you can redistribute it and/or",
+  "modify it under the terms of the GNU General Public License",
+  "as published by the Free Software Foundation; either version 2",
+  "of the License, or (at your option) any later version.",
    ""
  };
    
diff --git a/man/.cvsignore b/man/.cvsignore

new file mode 100644 (file)

index 0000000..70845e0
--- /dev/null
+++ b/man/.cvsignore
@@ -0,0 +1 @@
+Makefile.in
diff --git a/man/Makefile.am b/man/Makefile.am

new file mode 100644 (file)

index 0000000..35c6f27
--- /dev/null
+++ b/man/Makefile.am
@@ -0,0 +1,25 @@
+## Process this file with automake to produce Makefile.in
+#
+# Don't edit - this file is generated automatically from Makefile.am
+#
+
+man_MANS =     g_dih.1      g_msd.1        g_tcaf.1     nmrun.1      \
+               do_dssp.1    g_dipoles.1    g_nmeig.1    g_traj.1     \
+                pdb2gmx.1    editconf.1     g_disre.1    g_nmens.1    \
+                g_velacc.1   protonate.1    eneconv.1    g_dist.1     \
+                g_order.1    genbox.1       tpbconv.1    g_anaeig.1   \
+                g_dyndom.1   g_potential.1  genconf.1    trjcat.1     \
+                g_analyze.1  g_enemat.1     g_rama.1     genion.1     \
+                trjconv.1    g_angle.1      g_energy.1   g_rdf.1      \
+                genpr.1      trjorder.1     g_bond.1     g_gyrate.1   \
+                g_rms.1      gmxcheck.1     wheel.1      g_bundle.1   \
+                g_h2order.1  g_rmsdist.1    gmxdump.1    x2top.1      \
+                g_chi.1      g_hbond.1      g_rmsf.1     grompp.1     \
+                xpm2ps.1     g_cluster.1    g_helix.1    g_rotacf.1   \
+                highway.1    xrama.1        g_confrms.1  g_lie.1      \
+                g_saltbr.1   make_ndx.1     g_covar.1    g_mdmat.1    \
+                g_sas.1      mdrun.1        g_density.1  g_mindist.1  \
+                g_sgangle.1  mk_angndx.1    g_morph.1    g_sorient.1  \
+                ngmx.1       g_dielectric.1  
+
+EXTRA_DIST = ${man_MANS}
+\ No newline at end of file
diff --git a/share/html/gif/gmxlogo_small.jpg b/share/html/gif/gmxlogo_small.jpg

new file mode 100644 (file)

index 0000000..55c9df7

Binary files /dev/null and b/share/html/gif/gmxlogo_small.jpg differ
diff --git a/share/html/online.html b/share/html/online.html

index d2b184f60c0b6baabdefc84647d67a082dc1207f..21eef4d711fb09d6d9df666c1039806c76193c97 100644 (file)
--- a/share/html/online.html
+++ b/share/html/online.html
@@ -1,10 +1,29 @@
  <HTML>
+<HEAD>
  <TITLE>GROMACS 3.0 Online Reference </TITLE>
+</HEAD>
  <LINK rel=stylesheet href="online/style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>GROMACS 3.0 Online Reference</H2>
-<HR>
-<P>
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+
+<table WIDTH="800" NOSAVE NOBORDER >
+<tr NOSAVE>
+<td WIDTH="120" HEIGHT="140" NOSAVE><a href="http://www.gromacs.org/"><img
+SRC="gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td>
+
+<td ALIGN=LEFT VALIGN=TOP WIDTH=480 NOSAVE>
+<br><br>
+<h2>
+GROMACS 3.0<br>
+Online Reference</h2>
+</td>
+<td ALIGN=RIGHT VALIGN=BOTTOM WIDTH=200 NOSAVE>
+<B>VERSION 3.0<br>
+Tue 15 May 2001</B></td>
+</tr>
+</table>
+
+<hr>
+
  <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=10>
  <TR>
  <TD VALIGN=top WIDTH="25%">
@@ -96,6 +115,7 @@
  <br><a href=online/trjorder.html>trjorder</a>
  <br><a href=online/wheel.html>wheel</a>
  <br><a href=online/x2top.html>x2top</a>
+<br><a href=online/xmdrun.html>xmdrun</a>
  <br><a href=online/xpm2ps.html>xpm2ps</a>
  <br><a href=online/xrama.html>xrama</a>
  </multicol>
@@ -145,6 +165,7 @@
  <TR><TD><A HREF="online/grompp.html">grompp</A><TD>makes a run input file
  <TR><TD><A HREF="online/tpbconv.html">tpbconv</A><TD>makes a run input file for restarting a crashed run
  <TR><TD><A HREF="online/mdrun.html">mdrun</A><TD>performs a simulation
+<TR><TD><A HREF="online/xmdrun.html">xmdrun</A><TD>performs simulations with extra experimental features
  </TABLE>
  
  <A NAME="HNR3">
diff --git a/share/html/online/do_dssp.html b/share/html/online/do_dssp.html

index 8e6588a08c4691cb45d65c5bfc6225aaaa2c211d..64cc81b5c80f7d6c5a194e64f35cb8bc3432525a 100644 (file)
--- a/share/html/online/do_dssp.html
+++ b/share/html/online/do_dssp.html
@@ -1,13 +1,13 @@
-<TITLE>do_dssp</TITLE>
+<HTML><HEAD><TITLE>do_dssp</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>do_dssp</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>do_dssp</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  do_dssp 
  reads a trajectory file and computes the secondary structure for
@@ -56,9 +56,9 @@ function of secondary structure type.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-sss</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt>HEBT</tt> </TD><TD> Secondary structures for structure count </TD></TD>
  </TABLE>
diff --git a/share/html/online/editconf.html b/share/html/online/editconf.html

index 98d178f7f306755c1dab73b2d9c6531cafb3ef60..a2bc9b59efd377a267cb1f968ae7aec3b28d7089 100644 (file)
--- a/share/html/online/editconf.html
+++ b/share/html/online/editconf.html
@@ -1,13 +1,13 @@
-<TITLE>editconf</TITLE>
+<HTML><HEAD><TITLE>editconf</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>editconf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>editconf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  editconf converts generic structure format to <tt>.<a href="gro.html">gro</a></tt>, <tt>.<a href="g96.html">g96</a></tt>
  or <tt>.<a href="pdb.html">pdb</a></tt>.
@@ -78,7 +78,11 @@ The option -grasp is similar, but it puts the charges in the B-factor
  and the radius in the occupancy.
  <p>
  Finally with option <tt>-label</tt> editconf can add a chain identifier
-to a <a href="pdb.html">pdb</a> file, which can be useful for analysis with e.g. rasmol.
+to a <a href="pdb.html">pdb</a> file, which can be useful for analysis with e.g. rasmol.<p>
+To convert a truncated octahedron file produced by a package which uses
+a cubic box with the corners cut off (such as Gromos) use:<br>
+<tt>editconf -f &lt;infile&gt; -rotate 0 -45 -35.2644 -bt o -box &lt;veclen&gt; -o &lt;outfile&gt;</tt><br>
+where <tt>veclen</tt> is the size of the cubic box times sqrt(3)/2.
  <P>
  <H3>Files</H3>
  <TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
diff --git a/share/html/online/eneconv.html b/share/html/online/eneconv.html

index cbc74cd0ea7d986d0de7aa5512453ced5be0fc78..0b9b62861528c9c83df0ba04a77ec3bce157b53a 100644 (file)
--- a/share/html/online/eneconv.html
+++ b/share/html/online/eneconv.html
@@ -1,13 +1,13 @@
-<TITLE>eneconv</TITLE>
+<HTML><HEAD><TITLE>eneconv</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>eneconv</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>eneconv</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  When <tt>-f</tt> is <it>not</it> specified:<br>
  Concatenates several energy files in sorted order.
diff --git a/share/html/online/g_anaeig.html b/share/html/online/g_anaeig.html

index 0a306c4b41637d87c5ac88e8d47e74c1f21fb4f3..a92a5f87b2d24ab42065ff1c570557e660ef1980 100644 (file)
--- a/share/html/online/g_anaeig.html
+++ b/share/html/online/g_anaeig.html
@@ -1,13 +1,13 @@
-<TITLE>g_anaeig</TITLE>
+<HTML><HEAD><TITLE>g_anaeig</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_anaeig</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_anaeig</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  <tt>g_anaeig</tt> analyzes eigenvectors. The eigenvectors can be of a
  covariance matrix (<tt><a href="g_covar.html">g_covar</a></tt>) or of a Normal Modes anaysis
diff --git a/share/html/online/g_analyze.html b/share/html/online/g_analyze.html

index 8a2da1d677bd50e25fa42c3c65388b6d24771ea3..524ac3348ea6a76fd26fb2b0d36af18bfaf17818 100644 (file)
--- a/share/html/online/g_analyze.html
+++ b/share/html/online/g_analyze.html
@@ -1,13 +1,13 @@
-<TITLE>g_analyze</TITLE>
+<HTML><HEAD><TITLE>g_analyze</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_analyze</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_analyze</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_analyze reads an ascii file and analyzes data sets.
  A line in the input file may start with a time
diff --git a/share/html/online/g_angle.html b/share/html/online/g_angle.html

index ece217718ec79ad95f6d5c5284c02cc105e973a1..6d2d492ace96317b4caed651222be79a0a3da071 100644 (file)
--- a/share/html/online/g_angle.html
+++ b/share/html/online/g_angle.html
@@ -1,13 +1,13 @@
-<TITLE>g_angle</TITLE>
+<HTML><HEAD><TITLE>g_angle</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_angle</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_angle</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_angle computes the angle distribution for a number of angles
  or dihedrals. This way you can check whether your simulation
@@ -42,9 +42,9 @@ If this is not the case, the program will crash.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-type</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>angle</tt> </TD><TD> Type of angle to analyse: angle, dihedral, improper or ryckaert-bellemans </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]all</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Plot all angles separately in the averages file, in the order of appearance in the index file. </TD></TD>
diff --git a/share/html/online/g_bond.html b/share/html/online/g_bond.html

index 5f90b8279ac2b5c999e02f996d84bcb0e0bf919f..fc580c76d107ee9d4d80bbde6ac77f3a34fa300e 100644 (file)
--- a/share/html/online/g_bond.html
+++ b/share/html/online/g_bond.html
@@ -1,13 +1,13 @@
-<TITLE>g_bond</TITLE>
+<HTML><HEAD><TITLE>g_bond</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_bond</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_bond</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_bond makes a distribution of bond lengths. If all is well a
  gaussian distribution should be made when using a harmonic potential.
@@ -32,9 +32,9 @@ a tol of 0.1 gives a distribution from 0.18 to 0.22
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-blen</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Bond length. By default length of first bond </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-tol</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>   0.1</tt> </TD><TD> Half width of distribution as fraction of blen </TD></TD>
diff --git a/share/html/online/g_bundle.html b/share/html/online/g_bundle.html

index bcc2e4fd7da4ff2ad5f26fa17a2ff53e51e5c90a..d1a428a0c0a88c1317999ed169ab74c556535561 100644 (file)
--- a/share/html/online/g_bundle.html
+++ b/share/html/online/g_bundle.html
@@ -1,13 +1,13 @@
-<TITLE>g_bundle</TITLE>
+<HTML><HEAD><TITLE>g_bundle</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_bundle</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_bundle</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_bundle analyzes bundles of axes. The axes can be for instance
  helix axes. The program reads two index groups and divides both
@@ -43,9 +43,9 @@ display the reference axis.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-na</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Number of axes </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]z</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use the Z-axis as reference iso the average axis </TD></TD>
  </TABLE>
diff --git a/share/html/online/g_chi.html b/share/html/online/g_chi.html

index 5624333119c6c3cc7682f444d29be319f6b305c1..afaf7b0af9b0ca67afa1e1381ac1c251fc228553 100644 (file)
--- a/share/html/online/g_chi.html
+++ b/share/html/online/g_chi.html
@@ -1,13 +1,13 @@
-<TITLE>g_chi</TITLE>
+<HTML><HEAD><TITLE>g_chi</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_chi</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_chi</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_chi computes phi, psi, omega and chi dihedrals for all your 
  amino acid backbone and sidechains.
@@ -46,9 +46,9 @@ the average omega angle is plotted using color coding.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-r0</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>1</tt> </TD><TD> starting residue </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]phi</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Output for Phi dihedral angles </TD></TD>
diff --git a/share/html/online/g_cluster.html b/share/html/online/g_cluster.html

index 7c7cb5f399c57197e411f24e04b991d39f69e205..51cc6fe29a44c2b54895d8a3f57d32993b4b2578 100644 (file)
--- a/share/html/online/g_cluster.html
+++ b/share/html/online/g_cluster.html
@@ -1,13 +1,13 @@
-<TITLE>g_cluster</TITLE>
+<HTML><HEAD><TITLE>g_cluster</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_cluster</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_cluster</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_cluster can cluster structures with several different methods.
  Distances between structures can be determined from a trajectory
@@ -34,7 +34,26 @@ file is supplied, the structure with
  the smallest average distance to the others or the average structure
  or all structures for each cluster will be written to a trajectory
  file. When writing all structures, separate numbered files are made
-for each cluster.
+for each cluster.<p>Two output files are always written:<br>
+<tt>-o</tt> writes the RMSD values in the upper left half of the matrix
+and a graphical depiction of the clusters in the lower right half
+(depends on <tt>-max</tt> and <tt>-keepfree</tt>).<br>
+<tt>-g</tt> writes information on the options used and a detailed list
+of all clusters and their members.<p>
+Additionally, a number of optional output files can be written:<br>
+<tt>-dist</tt> writes the RMSD distribution.<br>
+<tt>-ev</tt> writes the eigenvectors of the RMSD matrix
+diagonalization.<br>
+<tt>-sz</tt> writes the cluster sizes.<br>
+<tt>-tr</tt> writes a matrix of the number of transitions between
+cluster pairs.<br>
+<tt>-ntr</tt> writes the total number of transitions to or from
+each cluster.<br>
+<tt>-clid</tt> writes the cluster number as a function of time.<br>
+<tt>-cl</tt> writes average (with option <tt>-av</tt>) or central
+structure of each cluster or writes numbered files with cluster members
+for a selected set of clusters (with option <tt>-wcl</tt>, depends on
+<tt>-nst</tt> and <tt>-rmsmin</tt>).<br>
  <P>
  <H3>Files</H3>
  <TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
diff --git a/share/html/online/g_confrms.html b/share/html/online/g_confrms.html

index 4f065639b38d3878fd892221884b6e76fcb23449..94ad23324be4a9f2ce9cee15cf886797ef05a085 100644 (file)
--- a/share/html/online/g_confrms.html
+++ b/share/html/online/g_confrms.html
@@ -1,13 +1,13 @@
-<TITLE>g_confrms</TITLE>
+<HTML><HEAD><TITLE>g_confrms</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_confrms</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_confrms</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_confrms computes the root mean square deviation (RMSD) of two
  structures after LSQ fitting the second structure on the first one.
diff --git a/share/html/online/g_covar.html b/share/html/online/g_covar.html

index 41c81fb08a54ae8b51df271beb27a6f274b89875..25bfe34f063fe45cc6f5a3e27fdd96605effca1e 100644 (file)
--- a/share/html/online/g_covar.html
+++ b/share/html/online/g_covar.html
@@ -1,13 +1,13 @@
-<TITLE>g_covar</TITLE>
+<HTML><HEAD><TITLE>g_covar</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_covar</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_covar</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  <tt>g_covar</tt> calculates and diagonalizes the (mass-weighted)
  covariance matrix.
@@ -41,9 +41,9 @@ The eigenvectors can be analyzed with <tt><a href="g_anaeig.html">g_anaeig</a></
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]fit</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>   yes</tt> </TD><TD> Fit to a reference structure </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]ref</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use the deviation from the conformation in the structure file instead of from the average </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]mwa</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Mass-weighted covariance analysis </TD></TD>
diff --git a/share/html/online/g_density.html b/share/html/online/g_density.html

index 8e93b1e0fd3a09cb8bd4c0955adccb9272893124..4f4e2e2cad6e151dd27972532f4663567a2ae508 100644 (file)
--- a/share/html/online/g_density.html
+++ b/share/html/online/g_density.html
@@ -1,13 +1,13 @@
-<TITLE>g_density</TITLE>
+<HTML><HEAD><TITLE>g_density</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_density</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_density</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  Compute partial densities across the box, using an index file. Densities
  in gram/cubic centimeter, number densities or electron densities can be
@@ -30,9 +30,9 @@ partial charge.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-d</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt>Z</tt> </TD><TD> Take the normal on the membrane in direction X, Y or Z. </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-sl</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>10</tt> </TD><TD> Divide the box in #nr slices. </TD></TD>
diff --git a/share/html/online/g_dielectric.html b/share/html/online/g_dielectric.html

index 6c7d551a53b1a50a826c2c5875b73cb399352e57..8e37688051d1c26d70a72ce6a5ab02c3c90cad0b 100644 (file)
--- a/share/html/online/g_dielectric.html
+++ b/share/html/online/g_dielectric.html
@@ -1,13 +1,13 @@
-<TITLE>g_dielectric</TITLE>
+<HTML><HEAD><TITLE>g_dielectric</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_dielectric</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_dielectric</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  dielectric calculates frequency dependent dielectric constants
  from the autocorrelation function of the total dipole moment in
@@ -47,9 +47,9 @@ plot should be one half of a circle
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]fft</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> use fast fourier transform for correlation function </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]x1</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>   yes</tt> </TD><TD> use first column as X axis rather than first data set </TD></TD>
diff --git a/share/html/online/g_dih.html b/share/html/online/g_dih.html

index 5c88e65bc65508b9c8ece3f2c15b07ba8cd2b7ff..1f6a869e86b9443b84f018599b4dc723b8128db7 100644 (file)
--- a/share/html/online/g_dih.html
+++ b/share/html/online/g_dih.html
@@ -1,13 +1,13 @@
-<TITLE>g_dih</TITLE>
+<HTML><HEAD><TITLE>g_dih</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_dih</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_dih</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_dih can do two things. The default is to analyze dihedral transitions
  by merely computing all the dihedral angles defined in your topology
@@ -32,9 +32,9 @@ conformations sorted according to occupancy.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]sa</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Perform cluster analysis in dihedral space instead of analysing dihedral transitions. </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-mult</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>-1</tt> </TD><TD> mulitiplicity for dihedral angles (by default read from topology) </TD></TD>
diff --git a/share/html/online/g_dipoles.html b/share/html/online/g_dipoles.html

index fc431f93c282d1efddfe12c1f49a5e8b0d9b8b75..d77514c26cb3c66f16f1461a630e0a51c3f58f71 100644 (file)
--- a/share/html/online/g_dipoles.html
+++ b/share/html/online/g_dipoles.html
@@ -1,13 +1,13 @@
-<TITLE>g_dipoles</TITLE>
+<HTML><HEAD><TITLE>g_dipoles</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_dipoles</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_dipoles</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_dipoles computes the total dipole plus fluctuations of a simulation
  system. From this you can compute e.g. the dielectric constant for
@@ -63,9 +63,9 @@ distribution function a maximum of 5.0 will be used.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-mu</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> dipole of a single molecule (in Debye) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-mumax</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>     5</tt> </TD><TD> max dipole in Debye (for histrogram) </TD></TD>
diff --git a/share/html/online/g_disre.html b/share/html/online/g_disre.html

index 8ceaaf5e076427d3f33c087e97aa1970e25c649e..22b8367ecf5b6cf5501056d6c0a7d42594de3484 100644 (file)
--- a/share/html/online/g_disre.html
+++ b/share/html/online/g_disre.html
@@ -1,13 +1,13 @@
-<TITLE>g_disre</TITLE>
+<HTML><HEAD><TITLE>g_disre</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_disre</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_disre</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_disre computes violations of distance restraints. If necessary
  all protons can be added to a protein molecule. The program allways
@@ -37,9 +37,9 @@ printing.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-ntop</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>6</tt> </TD><TD> Number of large violations that are stored in the log file every step </TD></TD>
  </TABLE>
diff --git a/share/html/online/g_dist.html b/share/html/online/g_dist.html

index c572f7f82fd01ec3cd414e92d15e6a0025731f2c..163552e816c14bf971ad17738a5b9e0599e27a51 100644 (file)
--- a/share/html/online/g_dist.html
+++ b/share/html/online/g_dist.html
@@ -1,13 +1,13 @@
-<TITLE>g_dist</TITLE>
+<HTML><HEAD><TITLE>g_dist</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_dist</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_dist</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_dist can calculate the distance between the centers of mass of two
  groups of atoms as a function of time. The total distance and its
@@ -30,9 +30,9 @@ closer than a certain distance to the center of mass of group 1.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-dist</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>     0</tt> </TD><TD> Print all atoms in group 2 closer than dist to the center of mass of group 1 </TD></TD>
  </TABLE>
  <P>
diff --git a/share/html/online/g_dyndom.html b/share/html/online/g_dyndom.html

index 5d66cfb7c54d8a900788ae96de8b5912d031e42f..7c36db4319e313eba4da1276ce9ddb9b8a13616e 100644 (file)
--- a/share/html/online/g_dyndom.html
+++ b/share/html/online/g_dyndom.html
@@ -1,13 +1,13 @@
-<TITLE>g_dyndom</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_dyndom</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_dyndom</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_dyndom</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_dyndom reads a <a href="pdb.html">pdb</a> file output from DynDom
  http://md.chem.rug.nl/~steve/DynDom/dyndom.home.html
diff --git a/share/html/online/g_enemat.html b/share/html/online/g_enemat.html

index 70e1b752ae6cf0a258269222a348a24819936f3e..43e7febc096c0d5de1304b144be0aa8b2ca69fc2 100644 (file)
--- a/share/html/online/g_enemat.html
+++ b/share/html/online/g_enemat.html
@@ -1,13 +1,13 @@
-<TITLE>g_enemat</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_enemat</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_enemat</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_enemat</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_enemat extracts an energy matrix from an energy file.
  With <b>-groups</b> a file must be supplied with on each
@@ -40,9 +40,9 @@ in the comparison.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]sum</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Sum the energy terms selected rather than display them all </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-skip</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Skip number of frames between data points </TD></TD>
diff --git a/share/html/online/g_energy.html b/share/html/online/g_energy.html

index 1d4daf9c2e0618dde8a8bfb1fc6bd2337ae38d2d..5f9b2ac3a51acfa3cd932f1757c11803c8ffe597 100644 (file)
--- a/share/html/online/g_energy.html
+++ b/share/html/online/g_energy.html
@@ -1,13 +1,13 @@
-<TITLE>g_energy</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_energy</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_energy</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_energy</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_energy extracts energy components or distance restraint
  data from an energy file. The user is prompted to interactively
@@ -55,8 +55,8 @@ the energies must both be calculated from the same trajectory.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]fee</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Do a free energy estimate </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-fetemp</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>   300</tt> </TD><TD> Reference temperature for free energy calculation </TD></TD>
diff --git a/share/html/online/g_gyrate.html b/share/html/online/g_gyrate.html

index 0b6a0ac4c20fdb57a10068cd46c436d6e8ffdcd5..e58f2caaeb41bdc4930dfdbf5d5033d640c34446 100644 (file)
--- a/share/html/online/g_gyrate.html
+++ b/share/html/online/g_gyrate.html
@@ -1,13 +1,13 @@
-<TITLE>g_gyrate</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_gyrate</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_gyrate</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_gyrate</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_gyrate computes the radius of gyration of a group of atoms
  and the radii of gyration about the x, y and z axes,as a function of time. The atoms are explicitly mass weighted.
@@ -27,9 +27,9 @@ and the radii of gyration about the x, y and z axes,as a function of time. The a
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]q</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use absolute value of the charge of an atom as weighting factor instead of mass </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]p</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Calculate the radii of gyration about the principal axes. </TD></TD>
diff --git a/share/html/online/g_h2order.html b/share/html/online/g_h2order.html

index 748d03daa71325ec63ab75789cebe273c7c4537c..e05d4de67cb94de76d31c8eed7d3084478eb35dd 100644 (file)
--- a/share/html/online/g_h2order.html
+++ b/share/html/online/g_h2order.html
@@ -1,13 +1,13 @@
-<TITLE>g_h2order</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_h2order</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_h2order</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_h2order</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  Compute the orientation of water molecules with respect to the normal
  of the box. The program determines the average cosine of the angle
@@ -34,9 +34,9 @@ instead of the angle between the dipole and a box axis.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-d</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt>Z</tt> </TD><TD> Take the normal on the membrane in direction X, Y or Z. </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-sl</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Calculate order parameter as function of boxlength, dividing the box in #nr slices. </TD></TD>
diff --git a/share/html/online/g_hbond.html b/share/html/online/g_hbond.html

index 74721e17c8919d92b9b6d816e416f01930b9273e..27bdec1eb20bfda5e6028645e7128ea1cb650e84 100644 (file)
--- a/share/html/online/g_hbond.html
+++ b/share/html/online/g_hbond.html
@@ -1,13 +1,13 @@
-<TITLE>g_hbond</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_hbond</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_hbond</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_hbond</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_hbond computes and analyzes hydrogen bonds. Hydrogen bonds are
  determined based on cutoffs for the angle Donor - Hydrogen - Acceptor
@@ -81,9 +81,9 @@ each timeframe. This is especially usefull when using <tt>-shell</tt>.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]ins</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Analyze solvent insertion </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-a</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>    60</tt> </TD><TD> Cutoff angle (degrees, Donor - Hydrogen - Acceptor) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-r</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>  0.25</tt> </TD><TD> Cutoff radius (nm, Hydrogen - Acceptor) </TD></TD>
diff --git a/share/html/online/g_helix.html b/share/html/online/g_helix.html

index b8eb56e99958253ae768957a21b54b75bcdcabdd..410912d25a504cc25622330c1e3f92c129e38cab 100644 (file)
--- a/share/html/online/g_helix.html
+++ b/share/html/online/g_helix.html
@@ -1,13 +1,13 @@
-<TITLE>g_helix</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_helix</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_helix</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_helix</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_helix computes all kind of helix properties. First, the peptide
  is checked to find the longest helical part. This is determined by
@@ -57,9 +57,9 @@ atoms only (file rms-ahx.<a href="xvg.html">xvg</a>).<br>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-r0</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>1</tt> </TD><TD> The first residue number in the sequence </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]q</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Check at every step which part of the sequence is helical </TD></TD>
diff --git a/share/html/online/g_lie.html b/share/html/online/g_lie.html

index 715a6a2590085a1489cd25bdf465c87e9b57c947..407952308de635374350e0dfead6324837c938b9 100644 (file)
--- a/share/html/online/g_lie.html
+++ b/share/html/online/g_lie.html
@@ -1,13 +1,13 @@
-<TITLE>g_lie</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_lie</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_lie</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_lie</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_lie computes a free energy estimate based on an energy analysis
  from. One needs an energy file with the following components:
diff --git a/share/html/online/g_mdmat.html b/share/html/online/g_mdmat.html

index 211a47a734a7f849a5b85ba26fefd7656e80929e..dbba65169b11a1493bf1d45f685f12f500710215 100644 (file)
--- a/share/html/online/g_mdmat.html
+++ b/share/html/online/g_mdmat.html
@@ -1,13 +1,13 @@
-<TITLE>g_mdmat</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_mdmat</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_mdmat</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_mdmat</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_mdmat makes distance matrices consisting of the smallest distance
  between residue pairs. With -frames these distance matrices can be
@@ -37,9 +37,9 @@ The output can be processed with <a href="xpm2ps.html">xpm2ps</a> to make a Post
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-t</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>   1.5</tt> </TD><TD> trunc distance </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nlevels</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>40</tt> </TD><TD> Discretize distance in # levels </TD></TD>
  </TABLE>
diff --git a/share/html/online/g_mindist.html b/share/html/online/g_mindist.html

index 3171b31e806f1d68761322b4a4a6582fdf40bd3a..75764edc842cc35721bd5b4c4900b8ad2254f69a 100644 (file)
--- a/share/html/online/g_mindist.html
+++ b/share/html/online/g_mindist.html
@@ -1,13 +1,13 @@
-<TITLE>g_mindist</TITLE>
+<HTML>/n<HEAD>/n<TITLE>g_mindist</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_mindist</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_mindist</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_mindist computes the distance between one group and a number of
  other groups.
@@ -38,9 +38,9 @@ This option is very slow.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]matrix</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Calculate half a matrix of group-group distances </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-d</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>   0.6</tt> </TD><TD> Distance for contacts </TD></TD>
diff --git a/share/html/online/g_morph.html b/share/html/online/g_morph.html

index 9f40afccff97bdc219851d10739d2d7d26a3201c..de229ecfa782c7b595d4a7368a9861141a9b81d1 100644 (file)
--- a/share/html/online/g_morph.html
+++ b/share/html/online/g_morph.html
@@ -1,13 +1,13 @@
-<TITLE>g_morph</TITLE>
+<HTML><HEAD><TITLE>g_morph</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_morph</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_morph</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_morph does a linear interpolation of conformations in order to
  create intermediates. Of course these are completely unphysical, but
diff --git a/share/html/online/g_msd.html b/share/html/online/g_msd.html

index 5e9a904a5b372741b78868529476544246458a33..d94ba84cc3cac72902d4b7477275fac11991e2c0 100644 (file)
--- a/share/html/online/g_msd.html
+++ b/share/html/online/g_msd.html
@@ -1,13 +1,13 @@
-<TITLE>g_msd</TITLE>
+<HTML><HEAD><TITLE>g_msd</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_msd</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_msd</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_msd computes the mean square displacement (MSD) of atoms from
  their initial positions. This provides an easy way to compute
diff --git a/share/html/online/g_nmeig.html b/share/html/online/g_nmeig.html

index 17b1d93dfb6c102bc9c5b0fa2053414faa5b63a4..99aed1ac79bb9b496659501eaa0bd0814c1b259f 100644 (file)
--- a/share/html/online/g_nmeig.html
+++ b/share/html/online/g_nmeig.html
@@ -1,13 +1,13 @@
-<TITLE>g_nmeig</TITLE>
+<HTML><HEAD><TITLE>g_nmeig</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_nmeig</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_nmeig</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_nmeig calculates the eigenvectors/values of a (Hessian) matrix,
  which can be calculated with <tt><a href="nmrun.html">nmrun</a></tt>.
diff --git a/share/html/online/g_nmens.html b/share/html/online/g_nmens.html

index 7dcc2bb59f4c41923580dbc668e309a8418d7748..ef8910323d1c5f32abc6ab60dc25e0adaecca578 100644 (file)
--- a/share/html/online/g_nmens.html
+++ b/share/html/online/g_nmens.html
@@ -1,13 +1,13 @@
-<TITLE>g_nmens</TITLE>
+<HTML><HEAD><TITLE>g_nmens</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_nmens</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_nmens</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  <tt>g_nmens</tt> generates an ensemble around an average structure
  in a subspace which is defined by a set of normal modes (eigenvectors).
diff --git a/share/html/online/g_order.html b/share/html/online/g_order.html

index 6c5bf8e549404cb2c75ad790ff1de20095ef98b6..f20d0eaab4b40e2a4151610be0cc20ed7475f634 100644 (file)
--- a/share/html/online/g_order.html
+++ b/share/html/online/g_order.html
@@ -1,13 +1,13 @@
-<TITLE>g_order</TITLE>
+<HTML><HEAD><TITLE>g_order</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_order</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_order</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  Compute the order parameter per atom for carbon tails. For atom i the
  vector i-1, i+1 is used together with an axis. The index file has to contain
@@ -37,9 +37,9 @@ given.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-d</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>z</tt> </TD><TD> Direction of the normal on the membrane: z, x or y </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-sl</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>1</tt> </TD><TD> Calculate order parameter as function of boxlength, dividing the box in #nr slices. </TD></TD>
diff --git a/share/html/online/g_potential.html b/share/html/online/g_potential.html

index 82c18ac25abd4db93d26ca1d443e9b353bedef2d..964a2027cee987893e74bf534fa97ccb7ad9c241 100644 (file)
--- a/share/html/online/g_potential.html
+++ b/share/html/online/g_potential.html
@@ -1,13 +1,13 @@
-<TITLE>g_potential</TITLE>
+<HTML><HEAD><TITLE>g_potential</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_potential</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_potential</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  Compute the electrostatic potential across the box. The potential is calculated by first summing the charges per slice and then integrating twice over this charge distribution. Periodic boundaries are not taken into account. The reference of the potential is taken to be the left side of the box. It's also possible to calculate the potential in spherical coordinates as function of r by calculating a charge distribution in spherical slices and twice integrating them. epsilon_r is taken as 1, 2 is more appropriate in many cases
  <P>
@@ -28,9 +28,9 @@ Compute the electrostatical potential across the box. The potential iscalculated
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-d</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt>Z</tt> </TD><TD> Take the normal on the membrane in direction X, Y or Z. </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-sl</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>10</tt> </TD><TD> Calculate potential as function of boxlength, dividing the box in #nr slices. </TD></TD>
diff --git a/share/html/online/g_rama.html b/share/html/online/g_rama.html

index 22e8f4a44ed7e637f2db79422ed98581ccda5012..ac30ee62267bdd32f7b2191941fb21c9306c27a7 100644 (file)
--- a/share/html/online/g_rama.html
+++ b/share/html/online/g_rama.html
@@ -1,13 +1,13 @@
-<TITLE>g_rama</TITLE>
+<HTML><HEAD><TITLE>g_rama</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rama</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rama</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_rama selects the Phi/Psi dihedral combinations from your topology file
  and computes these as a function of time.
@@ -28,9 +28,9 @@ specific residues.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  </TABLE>
  <P>
diff --git a/share/html/online/g_rdf.html b/share/html/online/g_rdf.html

index 5aad8e45564cf72495e5007ac82b45fd67e47a87..b1fa9e0e3865c74771834328bc12849232711dba 100644 (file)
--- a/share/html/online/g_rdf.html
+++ b/share/html/online/g_rdf.html
@@ -1,13 +1,13 @@
-<TITLE>g_rdf</TITLE>
+<HTML><HEAD><TITLE>g_rdf</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rdf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rdf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  The structure of liquids can be studied by either neutron or X-ray
  scattering. The most common way to describe liquid structure is by a
@@ -47,9 +47,9 @@ be computed (option <tt>-sq</tt>). The algorithm uses FFT, the gridspacing of wh
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-bin</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt> 0.001</tt> </TD><TD> Binwidth (nm) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]com</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> RDF with respect to the center of mass of first group </TD></TD>
diff --git a/share/html/online/g_rms.html b/share/html/online/g_rms.html

index d95c3c64bf4b2bf461c88b569c1a686b7c8c77cb..f5a2b8403fe7942f1ee08bb303c3fbb55983743d 100644 (file)
--- a/share/html/online/g_rms.html
+++ b/share/html/online/g_rms.html
@@ -1,13 +1,13 @@
-<TITLE>g_rms</TITLE>
+<HTML><HEAD><TITLE>g_rms</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rms</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rms</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_rms compares two structures by computing the root mean square
  deviation (RMSD), the size-independent 'rho' similarity parameter
diff --git a/share/html/online/g_rmsdist.html b/share/html/online/g_rmsdist.html

index ca3bad9afafa7ead0fba8a1979aa3fafdb2b45f5..0e9e5b04f9268cfb7946687a31068000b5331c35 100644 (file)
--- a/share/html/online/g_rmsdist.html
+++ b/share/html/online/g_rmsdist.html
@@ -1,13 +1,13 @@
-<TITLE>g_rmsdist</TITLE>
+<HTML><HEAD><TITLE>g_rmsdist</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rmsdist</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rmsdist</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_rmsdist computes the root mean square deviation of atom distances,
  which has the advantage that no fit is needed like in standard RMS
@@ -52,9 +52,9 @@ file, including case. Specifying non-sequential atoms is undefined.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nlevels</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>40</tt> </TD><TD> Discretize rms in # levels </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-max</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Maximum level in matrices </TD></TD>
diff --git a/share/html/online/g_rmsf.html b/share/html/online/g_rmsf.html

index 3af5cdce32b87a7e721af6a5fcb7f4215c3d0224..7bc6ad99f8f715371c5d5ce81d1fbe8a2e042530 100644 (file)
--- a/share/html/online/g_rmsf.html
+++ b/share/html/online/g_rmsf.html
@@ -1,13 +1,13 @@
-<TITLE>g_rmsf</TITLE>
+<HTML><HEAD><TITLE>g_rmsf</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rmsf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rmsf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_rmsf computes the root mean square fluctuation (RMSF, i.e. standard 
  deviation) of atomic positions 
@@ -54,9 +54,9 @@ the least.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]res</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Calculate averages for each residue </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]aniso</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Compute anisotropic temperature factors </TD></TD>
diff --git a/share/html/online/g_rotacf.html b/share/html/online/g_rotacf.html

index 87d52101b64c7a6e9ac2ccbe21fa65473b123efb..dc447a25998fe2cfe5d052d39a8862e0dd1192c9 100644 (file)
--- a/share/html/online/g_rotacf.html
+++ b/share/html/online/g_rotacf.html
@@ -1,13 +1,13 @@
-<TITLE>g_rotacf</TITLE>
+<HTML><HEAD><TITLE>g_rotacf</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_rotacf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_rotacf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_rotacf calculates the rotational correlation function
  for molecules. Three atoms (i,j,k) must be given in the index
@@ -43,9 +43,9 @@ to a two parameter exponential
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]d</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use index doublets (vectors) for correlation function instead of triplets (planes) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]aver</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>   yes</tt> </TD><TD> Average over molecules </TD></TD>
diff --git a/share/html/online/g_saltbr.html b/share/html/online/g_saltbr.html

index c0ad20357386b10d2d3164f650e15f85723fbe25..55ef6f63b68cbb84abf39c363def4afd3a94bbc6 100644 (file)
--- a/share/html/online/g_saltbr.html
+++ b/share/html/online/g_saltbr.html
@@ -1,13 +1,13 @@
-<TITLE>g_saltbr</TITLE>
+<HTML><HEAD><TITLE>g_saltbr</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_saltbr</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_saltbr</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_saltbr plots the difference between all combinations of charged groups
  as a function of time. The groups are combined in different ways. A minimum distance can be given, (e.g. the cut-off), then groups
@@ -28,9 +28,9 @@ and plus-plus.<a href="xvg.html">xvg</a>, or files for every individual ion-pair
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-t</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>  1000</tt> </TD><TD> trunc distance </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]sep</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use separate files for each interaction (may be MANY) </TD></TD>
  </TABLE>
diff --git a/share/html/online/g_sas.html b/share/html/online/g_sas.html

index b3b0adc178d8be4a58d0590006ae79fd6c13553a..4d6faa7f08babc77a2379ffefb4bc8eff12674fa 100644 (file)
--- a/share/html/online/g_sas.html
+++ b/share/html/online/g_sas.html
@@ -1,13 +1,13 @@
-<TITLE>g_sas</TITLE>
+<HTML><HEAD><TITLE>g_sas</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_sas</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_sas</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_sas computes hydrophobic and total solvent accessible surface area.
  As a side effect the Connolly surface can be generated as well in
@@ -36,9 +36,9 @@ which can be used to restrain surface atoms.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-solsize</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>  0.14</tt> </TD><TD> Radius of the solvent probe (nm) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-ndots</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>24</tt> </TD><TD> Number of dots per sphere, more dots means more accuracy </TD></TD>
diff --git a/share/html/online/g_sgangle.html b/share/html/online/g_sgangle.html

index abb7ef45694294fd11d392379cd46fa9223d99a1..f384dde4542b7518cbe325ab7528aa18c1798c43 100644 (file)
--- a/share/html/online/g_sgangle.html
+++ b/share/html/online/g_sgangle.html
@@ -1,13 +1,13 @@
-<TITLE>g_sgangle</TITLE>
+<HTML><HEAD><TITLE>g_sgangle</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_sgangle</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_sgangle</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  Compute the angle and distance between two groups. 
  The groups are defined by a number of atoms given in an index file and
@@ -43,9 +43,9 @@ Here is what some of the file options do:<br>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  </TABLE>
  <P>
diff --git a/share/html/online/g_sorient.html b/share/html/online/g_sorient.html

index 8ca5688bb1344ab53e2bd4410c8dbfbbd35ac19a..4393b6e2502b5e3b51a3362b259b363e1a058810 100644 (file)
--- a/share/html/online/g_sorient.html
+++ b/share/html/online/g_sorient.html
@@ -1,13 +1,13 @@
-<TITLE>g_sorient</TITLE>
+<HTML><HEAD><TITLE>g_sorient</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_sorient</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_sorient</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_sorient analyzes solvent orientation around solutes.
  It calculates two angles between the vector from one or more
@@ -45,9 +45,9 @@ of cos(theta1) and 3cos^2(theta2)-1 as a function of r.<p>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]com</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use the center of mass as the reference position </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-rmin</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>     0</tt> </TD><TD> Minimum distance </TD></TD>
diff --git a/share/html/online/g_tcaf.html b/share/html/online/g_tcaf.html

index c0083e1c71285204a370f8b18725725ad1fdb9a9..e2d84545a13fb9dc7dc1ce42950ad227795839b0 100644 (file)
--- a/share/html/online/g_tcaf.html
+++ b/share/html/online/g_tcaf.html
@@ -1,13 +1,13 @@
-<TITLE>g_tcaf</TITLE>
+<HTML><HEAD><TITLE>g_tcaf</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_tcaf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_tcaf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_tcaf computes transverse current autocorrelations.
  These are used to estimate the shear viscosity eta.
@@ -61,9 +61,9 @@ is very important for obtaining a good fit.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]mol</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Calculate tcaf of molecules </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]k34</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Also use k=(3,0,0) and k=(4,0,0) </TD></TD>
diff --git a/share/html/online/g_traj.html b/share/html/online/g_traj.html

index c64dded05f31e0fb8c5bd04ccf3c683fbf9b4193..6b88ad16ee538100df4b4f7f6d4cb99c4a1ef3ef 100644 (file)
--- a/share/html/online/g_traj.html
+++ b/share/html/online/g_traj.html
@@ -1,13 +1,13 @@
-<TITLE>g_traj</TITLE>
+<HTML><HEAD><TITLE>g_traj</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_traj</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_traj</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_traj plots coordinates, velocities, forces and/or the box.
  With <tt>-com</tt> the coordinates, velocities and forces are
@@ -40,9 +40,9 @@ This implies <tt>-com</tt>.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]com</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Plot data for the com of each group </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]mol</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Index contains molecule numbers iso atom numbers </TD></TD>
diff --git a/share/html/online/g_velacc.html b/share/html/online/g_velacc.html

index c47bb1dc2ecd863e98d81fcba1c18dd6604c65d2..fa3ad7702fd6d80a2c473bde8b939992baef2b0c 100644 (file)
--- a/share/html/online/g_velacc.html
+++ b/share/html/online/g_velacc.html
@@ -1,13 +1,13 @@
-<TITLE>g_velacc</TITLE>
+<HTML><HEAD><TITLE>g_velacc</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>g_velacc</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>g_velacc</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  g_velacc computes the velocity autocorrelation function.
  When the <tt>-s</tt> option is used, the momentum autocorrelation
@@ -31,9 +31,9 @@ of molecule numbers instead of atom numbers.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]mol</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Calculate vac of molecules </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-acflen</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>-1</tt> </TD><TD> Length of the ACF, default is half the number of frames </TD></TD>
diff --git a/share/html/online/genbox.html b/share/html/online/genbox.html

index e3de69ceb6467e93952f040ee37689eaa2ea4212..e0b4f4c66683721669d2526b33cbde1c885a750c 100644 (file)
--- a/share/html/online/genbox.html
+++ b/share/html/online/genbox.html
@@ -1,13 +1,13 @@
-<TITLE>genbox</TITLE>
+<HTML><HEAD><TITLE>genbox</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>genbox</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>genbox</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  Genbox can do one of 3 things:<p>
  1) Generate a box of solvent. Specify -cs and -box. Or specify -cs and
diff --git a/share/html/online/genconf.html b/share/html/online/genconf.html

index 54af370fcf167784e46738667e1314b6e5e8448f..d12ffec0415402fafcd11ede1d701430dc623451 100644 (file)
--- a/share/html/online/genconf.html
+++ b/share/html/online/genconf.html
@@ -1,13 +1,13 @@
-<TITLE>genconf</TITLE>
+<HTML><HEAD><TITLE>genconf</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>genconf</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>genconf</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  genconf multiplies a given coordinate file by simply stacking them
  on <a href="top.html">top</a> of each other, like a small child playing with wooden blocks.
diff --git a/share/html/online/genion.html b/share/html/online/genion.html

index 6705f2669d02d9b6a739f72673093c2db4bd0ffb..cc92f2c7fbdab1a03bd13eed0a04a9ba771455f6 100644 (file)
--- a/share/html/online/genion.html
+++ b/share/html/online/genion.html
@@ -1,13 +1,13 @@
-<TITLE>genion</TITLE>
+<HTML><HEAD><TITLE>genion</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>genion</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>genion</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  genion replaces solvent molecules by monoatomic ions at
  the position of the first atoms with the most favorable electrostatic
diff --git a/share/html/online/genpr.html b/share/html/online/genpr.html

index 6cfad9ae9f498d3fb91638c213a9e6ad3b9ac9ac..e29381fcf162e7b88979b0a725028edc2da229e2 100644 (file)
--- a/share/html/online/genpr.html
+++ b/share/html/online/genpr.html
@@ -1,13 +1,13 @@
-<TITLE>genpr</TITLE>
+<HTML><HEAD><TITLE>genpr</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>genpr</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>genpr</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  genpr produces an include file for a topology containing
  a list of atom numbers and three force constants for the
diff --git a/share/html/online/gmxcheck.html b/share/html/online/gmxcheck.html

index cbdc291ef7276fa9c6d18753bd902b1afbeb1d97..e7fef1a97a9df2ab89c1fcf56a42d4ee328b8bbe 100644 (file)
--- a/share/html/online/gmxcheck.html
+++ b/share/html/online/gmxcheck.html
@@ -1,13 +1,13 @@
-<TITLE>gmxcheck</TITLE>
+<HTML><HEAD><TITLE>gmxcheck</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>gmxcheck</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>gmxcheck</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  gmxcheck reads a trajectory (<tt>.<a href="trj.html">trj</a></tt>, <tt>.<a href="trr.html">trr</a></tt> or 
  <tt>.<a href="xtc.html">xtc</a></tt>) or an energy file (<tt>.<a href="ene.html">ene</a></tt> or <tt>.<a href="edr.html">edr</a></tt>)
diff --git a/share/html/online/gmxdump.html b/share/html/online/gmxdump.html

index da2d0f2900b0fdc0c8d23573b46bfa246a563b15..5a7ea720e932a63007b38c2b66c441c92c248b41 100644 (file)
--- a/share/html/online/gmxdump.html
+++ b/share/html/online/gmxdump.html
@@ -1,13 +1,13 @@
-<TITLE>gmxdump</TITLE>
+<HTML><HEAD><TITLE>gmxdump</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>gmxdump</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>gmxdump</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  gmxdump reads a run input file (<tt>.<a href="tpa.html">tpa</a></tt>/<tt>.<a href="tpr.html">tpr</a></tt>/<tt>.<a href="tpb.html">tpb</a></tt>),
  a trajectory (<tt>.<a href="trj.html">trj</a></tt>/<tt>.<a href="trr.html">trr</a></tt>/<tt>.<a href="xtc.html">xtc</a></tt>) or an energy
diff --git a/share/html/online/grompp.html b/share/html/online/grompp.html

index 4f3f87f424781be1358e77cd9894248185fc140d..630cf0ee73888a6a21c59fed7d35a58f8977a0a0 100644 (file)
--- a/share/html/online/grompp.html
+++ b/share/html/online/grompp.html
@@ -1,13 +1,13 @@
-<TITLE>grompp</TITLE>
+<HTML><HEAD><TITLE>grompp</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>grompp</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>grompp</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  The gromacs preprocessor
  reads a molecular topology file, checks the validity of the
diff --git a/share/html/online/highway.html b/share/html/online/highway.html

index a1ca34f14c0b7b7d8a654e88ccf06f87b041e59e..37ea28ce7c6bcf17f3a53aff3b065963de2cd539 100644 (file)
--- a/share/html/online/highway.html
+++ b/share/html/online/highway.html
@@ -1,13 +1,13 @@
-<TITLE>highway</TITLE>
+<HTML><HEAD><TITLE>highway</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>highway</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>highway</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  highway is the gromacs highway simulator. It is an X-windows
  gadget that shows a (periodic) autobahn with a user defined
@@ -27,9 +27,9 @@ number of crashes. Nice for a background CPU-eater
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  </TABLE>
  <P>
  <hr>
diff --git a/share/html/online/make_ndx.html b/share/html/online/make_ndx.html

index e8c6ac12de45bc0b35f88766877419968dd07563..2cc18d8b79d32eabf8f1cdd19121bdcd9aa666f2 100644 (file)
--- a/share/html/online/make_ndx.html
+++ b/share/html/online/make_ndx.html
@@ -1,13 +1,13 @@
-<TITLE>make_ndx</TITLE>
+<HTML><HEAD><TITLE>make_ndx</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>make_ndx</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>make_ndx</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  Index groups are necessary for almost every gromacs program.
  All these programs can generate default index groups. You ONLY
diff --git a/share/html/online/mdrun.html b/share/html/online/mdrun.html

index 0204bd988cd010c56f53786cc544bc8bf1bb716a..dc5a5cb98021ddf0010f5ffc79c4c334f83e1f73 100644 (file)
--- a/share/html/online/mdrun.html
+++ b/share/html/online/mdrun.html
@@ -1,13 +1,13 @@
-<TITLE>mdrun</TITLE>
+<HTML><HEAD><TITLE>mdrun</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>mdrun</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>mdrun</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  The mdrun program performs Molecular Dynamics simulations.
  It reads the run input file (<tt>-s</tt>) and distributes the
diff --git a/share/html/online/mk_angndx.html b/share/html/online/mk_angndx.html

index 1ce813bf9ef0c885a33bf9c688eea48deae18ae0..29773ca13321761f02ee3acb46c7e6439a043656 100644 (file)
--- a/share/html/online/mk_angndx.html
+++ b/share/html/online/mk_angndx.html
@@ -1,13 +1,13 @@
-<TITLE>mk_angndx</TITLE>
+<HTML><HEAD><TITLE>mk_angndx</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>mk_angndx</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>mk_angndx</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  mk_angndx makes an index file for calculation of
  angle distributions etc. It uses a run input file (<tt>.tpx</tt>) for the
diff --git a/share/html/online/ngmx.html b/share/html/online/ngmx.html

index 5307a6f40357b04ae2e406274035dc7c51a4ed74..676f83021e242eb80ce8383394b2d772afa40b1f 100644 (file)
--- a/share/html/online/ngmx.html
+++ b/share/html/online/ngmx.html
@@ -1,13 +1,13 @@
-<TITLE>ngmx</TITLE>
+<HTML><HEAD><TITLE>ngmx</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>ngmx</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>ngmx</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  ngmx is the Gromacs trajectory viewer. This program reads a
  trajectory file, a run input file and an index file and plots a
@@ -38,9 +38,9 @@ Some of the more common X command line options can be used:<br>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  </TABLE>
  <P>
  <H3>Diagnostics</H3>
diff --git a/share/html/online/nmrun.html b/share/html/online/nmrun.html

index 8094a545f7b3fbe0b7021079ee74e126b8e4db37..22723cf97b4637243c62491552ba967d1086ed81 100644 (file)
--- a/share/html/online/nmrun.html
+++ b/share/html/online/nmrun.html
@@ -1,13 +1,13 @@
-<TITLE>nmrun</TITLE>
+<HTML><HEAD><TITLE>nmrun</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>nmrun</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>nmrun</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  nmrun builds a Hessian matrix from single conformation.
  For usual Normal Modes-like calculations, make sure that
diff --git a/share/html/online/pdb2gmx.html b/share/html/online/pdb2gmx.html

index 12742d43eb662602523d2c0abd3f251fd90e2d49..be27342703508b07c1189b73581c76cbfced3714 100644 (file)
--- a/share/html/online/pdb2gmx.html
+++ b/share/html/online/pdb2gmx.html
@@ -1,13 +1,13 @@
-<TITLE>pdb2gmx</TITLE>
+<HTML><HEAD><TITLE>pdb2gmx</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>pdb2gmx</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>pdb2gmx</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  This program reads a <a href="pdb.html">pdb</a> file, lets you choose a forcefield, reads
  some database files, adds hydrogens to the molecules and generates
diff --git a/share/html/online/protonate.html b/share/html/online/protonate.html

index c4ed4cea257948bdf0a1ef02ab292ce1b8179cb7..abaec4c4627a89503fa6bb15285ef49c3b328cc7 100644 (file)
--- a/share/html/online/protonate.html
+++ b/share/html/online/protonate.html
@@ -1,13 +1,13 @@
-<TITLE>protonate</TITLE>
+<HTML><HEAD><TITLE>protonate</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>protonate</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>protonate</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  <tt>protonate</tt> reads (a) conformation(s) and adds all missing
  hydrogens as defined in <tt>ffgmx2.<a href="hdb.html">hdb</a></tt>. If only <tt>-s</tt> is
@@ -37,9 +37,9 @@ should correspond to the <b>protonated</b> state.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  </TABLE>
  <P>
  <hr>
diff --git a/share/html/online/style.css b/share/html/online/style.css

index e7d76894b02b00c04a63c898ca01d0cb76e4b461..532769837995bf2d2df9363d4c504669ffcbdb58 100644 (file)
--- a/share/html/online/style.css
+++ b/share/html/online/style.css
@@ -1,10 +1,21 @@
-P          { text-indent: 0em; font-family: helvetica,verdana,arial,sans-serif }
+th         { font-family: arial,helvetica,verdana,sans-serif }
+P          { text-indent: 0em; font-family: arial,helvetica,verdana,sans-serif }
  H1         { text-indent: 0em; font-size: 24pt; font-family: serif }
-H2         { text-indent: 0em; font-size: 24pt; font-weight: bold; font-family: helvetica,verdana,arial,sans-serif }
-H3         { text-indent: 0em; font-size: 18pt; font-weight: bold; font-family: helvetica,verdana,arial,sans-serif }
-A:link     { text-decoration: none; font-family: helvetica,verdana,arial,sans-serif  }
-A:active   { text-decoration: none; font-family: helvetica,verdana,arial,sans-serif  }
-A:visited  { text-decoration: none; font-family: helvetica,verdana,arial,sans-serif  } 
-body       { text-indent: 0em; font-family: helvetica,verdana,arial,sans-serif }
-td         { font-family: helvetica,verdana,arial,sans-serif }
-th         { font-family: helvetica,verdana,arial,sans-serif }
+H2         { text-indent: 0em; font-size: 24pt; font-weight: bold; font-family: arial,helvetica,verdana,sans-serif }
+H3         { text-indent: 0em; font-size: 18pt; font-weight: bold; font-family: arial,helvetica,verdana,sans-serif }
+A:link     { text-decoration: none; font-family: arial,helvetica,verdana,sans-serif  }
+A:active   { text-decoration: none; font-family: arial,helvetica,verdana,sans-serif  }
+A:visited  { text-decoration: none; font-family: arial,helvetica,verdana,sans-serif  } 
+body       { text-indent: 0em; font-family: arial,helvetica,verdana,sans-serif }
+td         { font-family: arial,helvetica,verdana,sans-serif }
+th         { font-family: arial,helvetica,verdana,sans-serif }
+li         { font-family: arial,helvetica,verdana,sans-serif }
+ul         { font-family: arial,helvetica,verdana,sans-serif }
+tt         { font-family: courier,"lucida console",serif }
+
+
+
+
+
+
+
diff --git a/share/html/online/tpbconv.html b/share/html/online/tpbconv.html

index bedfcd20f3b9abdfaa6ff41011570c15017e11cb..9fd9d5e4402afbe34c50a734773c3970150fee04 100644 (file)
--- a/share/html/online/tpbconv.html
+++ b/share/html/online/tpbconv.html
@@ -1,13 +1,13 @@
-<TITLE>tpbconv</TITLE>
+<HTML><HEAD><TITLE>tpbconv</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>tpbconv</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>tpbconv</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  tpbconv can edit run input files in two ways.<p><b>1st.</b> by creating a run input file
  for a continuation run when your simulation has crashed due to e.g.
diff --git a/share/html/online/trjcat.html b/share/html/online/trjcat.html

index 36a08a0101a4179b6568fbcc121ae3dbf9417cd6..8647e467ef2f88dafaeebadd73fea4f9843d4e0b 100644 (file)
--- a/share/html/online/trjcat.html
+++ b/share/html/online/trjcat.html
@@ -1,13 +1,13 @@
-<TITLE>trjcat</TITLE>
+<HTML><HEAD><TITLE>trjcat</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>trjcat</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>trjcat</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  trjcat concatenates several input trajectory files in sorted order. 
  In case of double time frames the one in the later file is used. 
diff --git a/share/html/online/trjconv.html b/share/html/online/trjconv.html

index ff471438f39d9d589a4862dabe6664a31d1cfb1b..c1f6149869b9119ffc5619c58d484b949dbd9206 100644 (file)
--- a/share/html/online/trjconv.html
+++ b/share/html/online/trjconv.html
@@ -1,13 +1,13 @@
-<TITLE>trjconv</TITLE>
+<HTML><HEAD><TITLE>trjconv</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>trjconv</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>trjconv</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  trjconv can convert trajectory files in many ways:<br>
  <b>1.</b> from one format to another<br>
@@ -134,6 +134,7 @@ one specific time from your trajectory.
  <TR><TD ALIGN=RIGHT> <b><tt>-trunc</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Truncate input <a href="trj.html">trj</a> file after this time (ps) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-exec</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt></tt> </TD><TD> Execute command for every output frame with the frame number as argument </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]app</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Append output </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-split</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>     0</tt> </TD><TD> Start writing new file when t MOD split = first time (ps) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]sep</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Write each frame to a separate .<a href="gro.html">gro</a> or .<a href="pdb.html">pdb</a> file </TD></TD>
  </TABLE>
  <P>
diff --git a/share/html/online/trjorder.html b/share/html/online/trjorder.html

index 0a5b4c58e32088e666b706560d09159960a4b36f..01c9e6533b55179ba1544839af1c20dbb8c665d7 100644 (file)
--- a/share/html/online/trjorder.html
+++ b/share/html/online/trjorder.html
@@ -1,13 +1,13 @@
-<TITLE>trjorder</TITLE>
+<HTML><HEAD><TITLE>trjorder</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>trjorder</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>trjorder</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  trjorder orders molecules according to the smallest distance
  to atoms in a reference group. It will ask for a group of reference
@@ -38,9 +38,9 @@ with any Gromacs program to analyze the n closest waters.
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-na</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>3</tt> </TD><TD> Number of atoms in a molecule </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-da</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>1</tt> </TD><TD> Atom used for the distance calculation </TD></TD>
  </TABLE>
diff --git a/share/html/online/wheel.html b/share/html/online/wheel.html

index 412dc5be0c5bbaebd99ee70d586922e6edb62f69..03c5d20b5a18b4d465ee32512c534d6bc499df2f 100644 (file)
--- a/share/html/online/wheel.html
+++ b/share/html/online/wheel.html
@@ -1,13 +1,13 @@
-<TITLE>wheel</TITLE>
+<HTML><HEAD><TITLE>wheel</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>wheel</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>wheel</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  wheel plots a helical wheel representation of your sequence.The input sequence is in the .<a href="dat.html">dat</a> file where the first line contains
  the number of residues and each consecutive line contains a residuename.
diff --git a/share/html/online/x2top.html b/share/html/online/x2top.html

index 7c6970f0e6c1e1115a6e0fc664f9890c77e8b24f..9b4efe6bb959963ba0f4848544941016676ec486 100644 (file)
--- a/share/html/online/x2top.html
+++ b/share/html/online/x2top.html
@@ -1,13 +1,13 @@
-<TITLE>x2top</TITLE>
+<HTML><HEAD><TITLE>x2top</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>x2top</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>x2top</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  x2top generates a primitive topology from a coordinate file.
  The program assumes all hydrogens are present when defining
diff --git a/share/html/online/xmdrun.html b/share/html/online/xmdrun.html

new file mode 100644 (file)

index 0000000..fd902ab
--- /dev/null
+++ b/share/html/online/xmdrun.html
@@ -0,0 +1,111 @@
+<HTML><HEAD><TITLE>xmdrun</TITLE>
+<LINK rel=stylesheet href="style.css" type="text/css">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
+<TR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>xmdrun</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
+<H3>Description</H3>
+xmdrun is the experimental MD program. New features are tested in this
+program before being implemented in the default <a href="mdrun.html">mdrun</a>. Currently under
+investigation are: polarizability, glass simulations, 
+Free energy perturbation, X-Ray bombardments
+and parallel independent simulations. It reads the run input file (<tt>-s</tt>) and distributes the
+topology over nodes if needed. The coordinates are passed
+around, so that computations can begin.
+First a neighborlist is made, then the forces are computed.
+The forces are globally summed, and the velocities and
+positions are updated. If necessary shake is performed to constrain
+bond lengths and/or bond angles.
+Temperature and Pressure can be controlled using weak coupling to a
+bath.<p>
+<a href="mdrun.html">mdrun</a> produces at least three output files, plus one log file
+(<tt>-g</tt>) per node.
+The trajectory file (<tt>-o</tt>), contains coordinates, velocities and
+optionally forces.
+The structure file (<tt>-c</tt>) contains the coordinates and
+velocities of the last step.
+The energy file (<tt>-e</tt>) contains energies, the temperature,
+pressure, etc, a lot of these things are also printed in the log file
+of node 0.
+Optionally coordinates can be written to a compressed trajectory file
+(<tt>-x</tt>).<p>
+When running in parallel with PVM or an old version of MPI the
+<tt>-np</tt> option must be given to indicate the number of
+nodes.<p>
+The option <tt>-dgdl</tt> is only used when free energy perturbation is
+turned on.<p>
+With <tt>-rerun</tt> an input trajectory can be given for which 
+forces and energies will be (re)calculated. Neighbor searching will be
+performed for every frame, unless <tt>nstlist</tt> is zero
+(see the <tt>.<a href="mdp.html">mdp</a></tt> file).<p>
+ED (essential dynamics) sampling is switched on by using the <tt>-ei</tt>
+flag followed by an <tt>.<a href="edi.html">edi</a></tt> file.
+The <tt>.<a href="edi.html">edi</a></tt> file can be produced using options in the essdyn
+menu of the WHAT IF program. <a href="mdrun.html">mdrun</a> produces a <tt>.<a href="edo.html">edo</a></tt> file that
+contains projections of positions, velocities and forces onto selected
+eigenvectors.<p>
+The -table option can be used to pass <a href="mdrun.html">mdrun</a> a formatted table with
+user-defined potential functions. The file is read from either the
+current directory or from the GMXLIB directory. A number of preformatted
+tables are presented in the GMXLIB dir, for 6-8, 6-9, 6-10, 6-11, 6-12
+Lennard Jones potentials with normal Coulomb.<p>
+The options <tt>-pi</tt>, <tt>-po</tt>, <tt>-pd</tt>, <tt>-pn</tt> are used
+for potential of mean force calculations and umbrella sampling.
+See manual.<p>
+When <a href="mdrun.html">mdrun</a> receives a TERM signal, it will set nsteps to the current
+step plus one. When <a href="mdrun.html">mdrun</a> receives a USR1 signal, it will set nsteps
+to the next multiple of nstxout after the current step.
+In both cases all the usual output will be written to file.
+When running with MPI, a signal to one of the <a href="mdrun.html">mdrun</a> processes
+is sufficient, this signal should not be sent to mpirun or
+the <a href="mdrun.html">mdrun</a> process that is the parent of the others.
+<P>
+<H3>Files</H3>
+<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
+<TR><TH>option</TH><TH>filename</TH><TH>type</TH><TH>description</TH></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-s</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html">   topol.tpr</a></tt> </TD><TD> Input </TD><TD> Generic run input: <a href="tpr.html">tpr</a> <a href="tpb.html">tpb</a> <a href="tpa.html">tpa</a> </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-o</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html">    traj.trr</a></tt> </TD><TD> Output </TD><TD> Full precision trajectory: <a href="trr.html">trr</a> <a href="trj.html">trj</a> </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-x</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xtc.html">    traj.xtc</a></tt> </TD><TD> Output, Opt. </TD><TD> Compressed trajectory (portable xdr format) </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-c</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html"> confout.gro</a></tt> </TD><TD> Output </TD><TD> Generic structure: <a href="gro.html">gro</a> <a href="g96.html">g96</a> <a href="pdb.html">pdb</a> </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html">    ener.edr</a></tt> </TD><TD> Output </TD><TD> Generic energy: <a href="edr.html">edr</a> <a href="ene.html">ene</a> </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-g</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="log.html">      md.log</a></tt> </TD><TD> Output </TD><TD> Log file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-dgdl</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xvg.html">    dgdl.xvg</a></tt> </TD><TD> Output, Opt. </TD><TD> xvgr/xmgr file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-table</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xvg.html">   table.xvg</a></tt> </TD><TD> Input, Opt. </TD><TD> xvgr/xmgr file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-rerun</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="files.html">   rerun.xtc</a></tt> </TD><TD> Input, Opt. </TD><TD> Generic trajectory: <a href="xtc.html">xtc</a> <a href="trr.html">trr</a> <a href="trj.html">trj</a> <a href="gro.html">gro</a> <a href="g96.html">g96</a> <a href="pdb.html">pdb</a> </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-ei</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="edi.html">     sam.edi</a></tt> </TD><TD> Input, Opt. </TD><TD> ED sampling input </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-eo</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="edo.html">     sam.edo</a></tt> </TD><TD> Output, Opt. </TD><TD> ED sampling output </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-j</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="gct.html">    wham.gct</a></tt> </TD><TD> Input, Opt. </TD><TD> General coupling stuff </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-jo</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="gct.html">     bam.gct</a></tt> </TD><TD> Input, Opt. </TD><TD> General coupling stuff </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-ffout</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xvg.html">     gct.xvg</a></tt> </TD><TD> Output, Opt. </TD><TD> xvgr/xmgr file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-devout</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xvg.html">deviatie.xvg</a></tt> </TD><TD> Output, Opt. </TD><TD> xvgr/xmgr file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-runav</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="xvg.html"> runaver.xvg</a></tt> </TD><TD> Output, Opt. </TD><TD> xvgr/xmgr file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-pi</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="ppa.html">    pull.ppa</a></tt> </TD><TD> Input, Opt. </TD><TD> Pull parameters </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-po</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="ppa.html"> pullout.ppa</a></tt> </TD><TD> Output, Opt. </TD><TD> Pull parameters </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-pd</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="pdo.html">    pull.pdo</a></tt> </TD><TD> Output, Opt. </TD><TD> Pull data output </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-pn</tt></b> </TD><TD ALIGN=RIGHT> <tt><a href="ndx.html">    pull.ndx</a></tt> </TD><TD> Input, Opt. </TD><TD> Index file </TD></TR>
+</TABLE>
+<P>
+<H3>Other options</H3>
+<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>
+<TR><TH>option</TH><TH>type</TH><TH>default</TH><TH>description</TH></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>19</tt> </TD><TD> Set the nicelevel </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-deffnm</tt></b> </TD><TD ALIGN=RIGHT> string </TD><TD ALIGN=RIGHT> <tt></tt> </TD><TD> Set the default filename for all file options </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]v</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Be loud and noisy </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]compact</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>   yes</tt> </TD><TD> Write a compact log file </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]multi</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Do multiple simulations in parallel (only with -np &gt; 1) </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]glas</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Do glass simulation with special long range corrections </TD></TR>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]ionize</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Do a simulation including the effect of an X-Ray bombardment on your system </TD></TR>
+</TABLE>
+<P>
+<hr>
+<div ALIGN=RIGHT>
+<font size="-1"><a href="http://www.gromacs.org">http://www.gromacs.org</a></font><br>
+<font size="-1"><a href="mailto:gromacs@gromacs.org">gromacs@gromacs.org</a></font><br>
+</div>
+</BODY>
diff --git a/share/html/online/xpm2ps.html b/share/html/online/xpm2ps.html

index cc6aa3f71b4de2ffb0c2978095f4edf56ed6e378..f3df001857c770a9505ab152fa099b2c36b1ceb3 100644 (file)
--- a/share/html/online/xpm2ps.html
+++ b/share/html/online/xpm2ps.html
@@ -1,13 +1,13 @@
-<TITLE>xpm2ps</TITLE>
+<HTML><HEAD><TITLE>xpm2ps</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>xpm2ps</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>xpm2ps</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  xpm2ps makes a beautiful color plot of an XPixelMap file.
  Labels and axis can be displayed, when they are supplied
@@ -26,7 +26,10 @@ first one (<tt>-f</tt>) is plotted together with the lower right
  half of the second one (<tt>-f2</tt>). The diagonal will contain
  values from the matrix file selected with <tt>-diag</tt>.
  Plotting of the diagonal values can be suppressed altogether by
-setting <tt>-diag</tt> to <tt>none</tt>.<p>
+setting <tt>-diag</tt> to <tt>none</tt>. With 
+<tt>-combine</tt> an alternative operation can be selected to combine
+the matrices. In this case, a new color map will be generated with
+a red gradient for negative numbers and a blue for positive.<p>
  If the color coding and legend labels of both matrices are identical,
  only one legend will be displayed, else two separate legends are
  displayed.<p>
@@ -57,12 +60,15 @@ the <tt>-<a href="xpm.html">xpm</a></tt> option.
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Set the nicelevel </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]w</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> View output <a href="xvg.html">xvg</a>, <a href="xpm.html">xpm</a>, <a href="eps.html">eps</a> and <a href="pdb.html">pdb</a> files </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]frame</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>   yes</tt> </TD><TD> Display frame, ticks, labels, title and legend </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-title</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>top</tt> </TD><TD> Show title at: <a href="top.html">top</a>, ylabel or none </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-title</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>top</tt> </TD><TD> Show title at: <a href="top.html">top</a>, once, ylabel or none </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-[no]yonce</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Show y-label only once </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-legend</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>both</tt> </TD><TD> Show legend: both, first, second or none </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-diag</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>first</tt> </TD><TD> Diagonal: first, second or none </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-combine</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>halves</tt> </TD><TD> Combine two matrices: halves, add, sub, mult or div </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-bx</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>     0</tt> </TD><TD> Box x-size (also y-size when -by is not set) </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-by</tt></b> </TD><TD ALIGN=RIGHT> real </TD><TD ALIGN=RIGHT> <tt>     0</tt> </TD><TD> Box y-size </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-rainbow</tt></b> </TD><TD ALIGN=RIGHT> enum </TD><TD ALIGN=RIGHT> <tt>no</tt> </TD><TD> Rainbow colors, convert white to: no, blue or red </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-gradient</tt></b> </TD><TD ALIGN=RIGHT> vector </TD><TD ALIGN=RIGHT> <tt>0 0 0</tt> </TD><TD> Re-scale colormap to a smooth gradient from white {1,1,1} to {r,g,b} </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-skip</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>1</tt> </TD><TD> only write out every nr-th row and column </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]zeroline</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> insert line in <a href="xpm.html">xpm</a> matrix where axis label is zero </TD></TD>
  </TABLE>
diff --git a/share/html/online/xrama.html b/share/html/online/xrama.html

index ae41d2d7eef43baddbedd007261ac4361178f179..9964630adf39e5d71b12db3f409b34b7eba82abf 100644 (file)
--- a/share/html/online/xrama.html
+++ b/share/html/online/xrama.html
@@ -1,13 +1,13 @@
-<TITLE>xrama</TITLE>
+<HTML><HEAD><TITLE>xrama</TITLE>
  <LINK rel=stylesheet href="style.css" type="text/css">
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>xrama</H2>
-<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH="98%">
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+<table WIDTH="800" NOBORDER >
  <TR>
-<TD><font size=-1><A HREF="../online.html">Main Table of Contents</A></font></TD>
-<TD ALIGN=RIGHT><B>VERSION 3.0</B></TR>
-<TR><TD><font size=-1><A HREF="http://www.gromacs.org">GROMACS homepage</A></font></TD>
-<TD ALIGN=RIGHT><B>Tue 15 May 2001</B></TR></TABLE></CENTER><HR>
+<td WIDTH="120" HEIGHT="133">
+<a href="http://www.gromacs.org/"><img SRC="../gif/gmxlogo_small.jpg"BORDER=0 height=133 width=116></a></td><td ALIGN=LEFT VALIGN=TOP WIDTH=480><br><br><h2>Online Reference:<br>xrama</h2><font size=-1><A HREF="../online.html">Main Table of Contents</A></font><br><br></td>
+<TD ALIGN=RIGHT VALIGN=BOTTOM><B>VERSION 3.0<br>
+Mon 11 Jun 2001</B></td></tr></TABLE>
+<HR>
  <H3>Description</H3>
  xrama shows a Ramachandran movie, that is, it shows
  the Phi/Psi angles as a function of time in an X-Window.<p>Static Phi/Psi plots for printing can be made with <a href="g_rama.html">g_rama</a>.<p>
@@ -27,9 +27,9 @@ Some of the more common X command line options can be used:<br>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]h</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Print help info and quit </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-[no]X</tt></b> </TD><TD ALIGN=RIGHT> bool </TD><TD ALIGN=RIGHT> <tt>    no</tt> </TD><TD> Use dialog box GUI to edit command line options </TD></TD>
  <TR><TD ALIGN=RIGHT> <b><tt>-nice</tt></b> </TD><TD ALIGN=RIGHT> int </TD><TD ALIGN=RIGHT> <tt>0</tt> </TD><TD> Set the nicelevel </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (ps) to read from trajectory </TD></TD>
-<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (ps) </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-b</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> First frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-e</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Last frame (fs) to read from trajectory </TD></TD>
+<TR><TD ALIGN=RIGHT> <b><tt>-dt</tt></b> </TD><TD ALIGN=RIGHT> time </TD><TD ALIGN=RIGHT> <tt>    -1</tt> </TD><TD> Only use frame when t MOD dt = first time (fs) </TD></TD>
  </TABLE>
  <P>
  <hr>
diff --git a/src/Makefile.inc b/src/Makefile.inc

index a4d293d15c7864bc8b4311957ad21abb8427dad9..67808edda2ec186c82afb4d2b0b6f143dcb7fe40 100644 (file)
--- a/src/Makefile.inc
+++ b/src/Makefile.inc
@@ -3,21 +3,6 @@
  # Don't edit - this file is generated automatically from Makefile.am
  #
  
-## Let's rule ;-)
-
-# The asm suffix is for intel syntax assembly, and
-# the s suffix for at & t syntax.
-# S suffix files will be preprocessed by cpp, nasm
-# on the other hand can do this directly.
-
-SUFFIXES = .asm .S .F
-.asm.o:
-       $(NASM) $(NASMFLAGS) $< -o $@
-.S.s:
-       $(CPP) $< > $@
-.F.f:  
-       $(CPP) $< > $@
-
  #################
  # We need a second compile command producing executables 
  # that can be run on the local host to make the innerloops.
@@ -25,12 +10,11 @@ SUFFIXES = .asm .S .F
  # for cross-compilation. We also need it on parallel machines 
  # where the MPI executables cannot be run outside a batch queue.
  
+BUILD_COMPILE = $(BUILD_CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+
+#################
  # This might be bad - but I don't know any other way to enable
  # us to type make <progname> in subdirs right now
  #AM_CPPFLAGS = -DHAVE_CONFIG_H
  
-BUILD_COMPILE = $(BUILD_CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-
-
-
  
diff --git a/src/contrib/Makefile.am b/src/contrib/Makefile.am

index 2c3a467693d61ea6dcc506a53551e8c6c0c7034e..d3acac6ad5572eea70ab2d927f13a9f0daf53455 100644 (file)
--- a/src/contrib/Makefile.am
+++ b/src/contrib/Makefile.am
@@ -10,7 +10,9 @@ include $(srcdir)/../Makefile.inc
  # but it can probably be done in a nicer way...
  INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
  LDFLAGS = @LDFLAGS@ -L${top_builddir}/src/gmxlib -L${top_builddir}/src/mdlib 
-LDADD = -lmdXXX_SUFFIX_XXX -lgmxXXX_SUFFIX_XXX
+LDADD = ../gmxlib/libgmxXXX_SUFFIX_XXX.la ../mdlib/libmdXXX_SUFFIX_XXX.la
+
+EXTRA_DIST              = README  programs.txt  gromacs-3.0.spec
  
  # NB: The programs in contrib do not get double/mpi suffixes automatically,
  # to make it easier for new developers to understand this file. If you want
diff --git a/src/contrib/gromacs-3.0.spec b/src/contrib/gromacs-3.0.spec

new file mode 100644 (file)

index 0000000..65aa7da
--- /dev/null
+++ b/src/contrib/gromacs-3.0.spec
@@ -0,0 +1,702 @@
+#
+# RPM specification file to make gromacs packages, version 3.0
+# Presently, you cannot relocate from /usr/local/gromacs.
+#
+# Usage:
+#
+# 1. Start from a gromacs distribution tarball, made
+#    with "make dist". Put it in the RPM
+#    source directory (usually /usr/src/redhat/SOURCES).
+# 2. Edit the version and release info below (bump the
+#    release every time you release a new rpm, restore it
+#    to 1 for each new version.)
+# 3. Edit the files tags IF YOU MOVE OR ADD ANY FILES
+#    (also if you change lib versions)
+# 4. This file assumes an i686-pc-linux-gnu configuration -
+#    you will have to change that for a different host,
+#    since it enters in the directory names gromacs creates.
+# 5. cd to /usr/src/redhat/SPECS and issue 
+#    rpm -ba gromacs-3.0.spec
+#
+#    That's it - you should have both binary and source rpms now.
+#
+
+#
+# Main package - only dynamic libs, and no header files
+#
+Summary: A package for molecular dynamics simulation 
+Name: gromacs
+Version: 3.0
+Release: 1
+Copyright: GPL
+Group: Applications/Science
+Source: http://www.gromacs.org/download/gromacs_source/gromacs-3.0.tar.gz
+URL: http://www.gromacs.org
+Packager: Erik Lindahl <lindahl@gromacs.org>
+%description
+GROMACS is a versatile and extremely well optimized package
+to perform molecular dynamics computer simulations and
+subsequent trajectory analysis. It is developed for
+biomolecules like proteins, but the extremely high 
+performance means it is also used in several other fields
+like polymer chemistry and solid state physics. This
+version has the dynamic libs and executables; to hack new
+utility programs you also need the headers and static
+libs in gromacs-devel. Linux kernel 2.4 is STRONGLY
+recommended on Pentium III and later processors since
+GROMACS can then use assembly loops with SSE instructions.
+#
+# The header files and static libraries go into gromacs-devel...
+#
+%package devel
+Summary: Header files and static libraries for GROMACS
+Group: Applications/Science
+Requires: gromacs = %{version}-%{release}
+%description devel
+This package contains header files, static libraries,
+and a program example for the GROMACS molecular
+dynamics software. You need it if you want to write your
+own analysis programs.
+
+
+%prep
+%setup
+
+%build
+./configure
+
+%install
+make install
+make links
+
+%post
+# Add our library dir to /etc/ld.so.conf if it is not already there.
+# grep -qs only reports via its exit status (and stays quiet if the file
+# is missing); echo appends the path itself as a new line.
+if ! grep -qs /usr/local/gromacs/lib/i686-pc-linux-gnu /etc/ld.so.conf; then
+     echo /usr/local/gromacs/lib/i686-pc-linux-gnu >> /etc/ld.so.conf
+fi
+
+# run ldconfig to update the runtime linker database with the new libraries
+# (make sure /sbin is in the $PATH)
+PATH="/sbin:$PATH" ldconfig
+
+%postun
+# Remove the gromacs lib dir from /etc/ld.so.conf, but only on a real erase
+# ($1 = 0); on an upgrade the entry written by the new package must stay.
+if test "$1" = 0; then
+  grep -v /usr/local/gromacs/lib/i686-pc-linux-gnu /etc/ld.so.conf > /etc/ld.so.conf.gmx
+  mv -f /etc/ld.so.conf.gmx /etc/ld.so.conf
+fi
+
+# after uninstall, run ldconfig to remove the libs from the linker database
+PATH="/sbin:$PATH" ldconfig
+
+
+
+%files
+# binaries
+/usr/local/gromacs/bin/i686-pc-linux-gnu/average    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_confrms     
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_gyrate  
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_order      
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_order
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_order
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_sorient  
+/usr/local/gromacs/bin/i686-pc-linux-gnu/highway    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/trjconv
+/usr/local/gromacs/bin/i686-pc-linux-gnu/do_dssp    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_covar       
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_h2order  
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_potential  
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_tcaf     
+/usr/local/gromacs/bin/i686-pc-linux-gnu/luck       
+/usr/local/gromacs/bin/i686-pc-linux-gnu/trjorder
+/usr/local/gromacs/bin/i686-pc-linux-gnu/editconf   
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_density     
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_hbond    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rama       
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_traj     
+/usr/local/gromacs/bin/i686-pc-linux-gnu/make_ndx   
+/usr/local/gromacs/bin/i686-pc-linux-gnu/wheel
+/usr/local/gromacs/bin/i686-pc-linux-gnu/eneconv    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_dielectric  
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_helix    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rdf        
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_velacc   
+/usr/local/gromacs/bin/i686-pc-linux-gnu/mdrun      
+/usr/local/gromacs/bin/i686-pc-linux-gnu/x2top
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_anaeig   
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_dih         
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_lie      
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rms        
+/usr/local/gromacs/bin/i686-pc-linux-gnu/genbox     
+/usr/local/gromacs/bin/i686-pc-linux-gnu/mk_angndx  
+/usr/local/gromacs/bin/i686-pc-linux-gnu/xmdrun
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_analyze  
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_dipoles     
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_mdmat    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rmsdist    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/genconf    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/ngmx       
+/usr/local/gromacs/bin/i686-pc-linux-gnu/xpm2ps
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_angle    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_disre       
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_mindist  
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rmsf       
+/usr/local/gromacs/bin/i686-pc-linux-gnu/genion     
+/usr/local/gromacs/bin/i686-pc-linux-gnu/nmrun      
+/usr/local/gromacs/bin/i686-pc-linux-gnu/xrama
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_bond     
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_dist        
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_morph    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_rotacf     
+/usr/local/gromacs/bin/i686-pc-linux-gnu/genpr      
+/usr/local/gromacs/bin/i686-pc-linux-gnu/pdb2gmx
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_bundle   
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_dyndom      
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_msd      
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_saltbr     
+/usr/local/gromacs/bin/i686-pc-linux-gnu/gmxcheck   
+/usr/local/gromacs/bin/i686-pc-linux-gnu/protonate
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_chi      
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_enemat      
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_nmeig    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_sas        
+/usr/local/gromacs/bin/i686-pc-linux-gnu/gmxdump    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/tpbconv
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_cluster  
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_energy      
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_nmens    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/g_sgangle    
+/usr/local/gromacs/bin/i686-pc-linux-gnu/grompp     
+/usr/local/gromacs/bin/i686-pc-linux-gnu/trjcat
+#links to /usr/local/bin
+/usr/local/bin/average    
+/usr/local/bin/g_confrms     
+/usr/local/bin/g_gyrate  
+/usr/local/bin/g_order      
+/usr/local/bin/g_order
+/usr/local/bin/g_order
+/usr/local/bin/g_sorient  
+/usr/local/bin/highway    
+/usr/local/bin/trjconv
+/usr/local/bin/do_dssp    
+/usr/local/bin/g_covar       
+/usr/local/bin/g_h2order  
+/usr/local/bin/g_potential  
+/usr/local/bin/g_tcaf     
+/usr/local/bin/luck       
+/usr/local/bin/trjorder
+/usr/local/bin/editconf   
+/usr/local/bin/g_density     
+/usr/local/bin/g_hbond    
+/usr/local/bin/g_rama       
+/usr/local/bin/g_traj     
+/usr/local/bin/make_ndx   
+/usr/local/bin/wheel
+/usr/local/bin/eneconv    
+/usr/local/bin/g_dielectric  
+/usr/local/bin/g_helix    
+/usr/local/bin/g_rdf        
+/usr/local/bin/g_velacc   
+/usr/local/bin/mdrun      
+/usr/local/bin/x2top
+/usr/local/bin/g_anaeig   
+/usr/local/bin/g_dih         
+/usr/local/bin/g_lie      
+/usr/local/bin/g_rms        
+/usr/local/bin/genbox     
+/usr/local/bin/mk_angndx  
+/usr/local/bin/xmdrun
+/usr/local/bin/g_analyze  
+/usr/local/bin/g_dipoles     
+/usr/local/bin/g_mdmat    
+/usr/local/bin/g_rmsdist    
+/usr/local/bin/genconf    
+/usr/local/bin/ngmx       
+/usr/local/bin/xpm2ps
+/usr/local/bin/g_angle    
+/usr/local/bin/g_disre       
+/usr/local/bin/g_mindist  
+/usr/local/bin/g_rmsf       
+/usr/local/bin/genion     
+/usr/local/bin/nmrun      
+/usr/local/bin/xrama
+/usr/local/bin/g_bond     
+/usr/local/bin/g_dist        
+/usr/local/bin/g_morph    
+/usr/local/bin/g_rotacf     
+/usr/local/bin/genpr      
+/usr/local/bin/pdb2gmx
+/usr/local/bin/g_bundle   
+/usr/local/bin/g_dyndom      
+/usr/local/bin/g_msd      
+/usr/local/bin/g_saltbr     
+/usr/local/bin/gmxcheck   
+/usr/local/bin/protonate
+/usr/local/bin/g_chi      
+/usr/local/bin/g_enemat      
+/usr/local/bin/g_nmeig    
+/usr/local/bin/g_sas        
+/usr/local/bin/gmxdump    
+/usr/local/bin/tpbconv
+/usr/local/bin/g_cluster  
+/usr/local/bin/g_energy      
+/usr/local/bin/g_nmens    
+/usr/local/bin/g_sgangle    
+/usr/local/bin/grompp     
+/usr/local/bin/trjcat
+# the topology library
+/usr/local/gromacs/top/
+/usr/local/gromacs/top/FF.dat
+/usr/local/gromacs/top/ffgmx.itp
+/usr/local/gromacs/top/ffgmxnb.itp
+/usr/local/gromacs/top/ffgmxbon.itp
+/usr/local/gromacs/top/ffgmx.atp
+/usr/local/gromacs/top/ffgmx.hdb
+/usr/local/gromacs/top/ffgmx.n2t
+/usr/local/gromacs/top/ffgmx.rtp
+/usr/local/gromacs/top/ffgmx-c.tdb
+/usr/local/gromacs/top/ffgmx-n.tdb
+/usr/local/gromacs/top/ffgmx2.itp
+/usr/local/gromacs/top/ffgmx2nb.itp
+/usr/local/gromacs/top/ffgmx2bon.itp
+/usr/local/gromacs/top/ffgmx2.atp
+/usr/local/gromacs/top/ffgmx2.hdb
+/usr/local/gromacs/top/ffgmx2.rtp
+/usr/local/gromacs/top/ffgmx2-c.tdb
+/usr/local/gromacs/top/ffgmx2-n.tdb
+/usr/local/gromacs/top/ffG43a1.itp
+/usr/local/gromacs/top/ffG43a1nb.itp
+/usr/local/gromacs/top/ffG43a1bon.itp
+/usr/local/gromacs/top/ffG43a1.atp
+/usr/local/gromacs/top/ffG43a1.hdb
+/usr/local/gromacs/top/ffG43a1.rtp
+/usr/local/gromacs/top/ffG43a1-c.tdb
+/usr/local/gromacs/top/ffG43a1-n.tdb
+/usr/local/gromacs/top/ffG43a2.itp
+/usr/local/gromacs/top/ffG43a2nb.itp
+/usr/local/gromacs/top/ffG43a2bon.itp
+/usr/local/gromacs/top/ffG43a2.atp
+/usr/local/gromacs/top/ffG43a2.hdb
+/usr/local/gromacs/top/ffG43a2.rtp
+/usr/local/gromacs/top/ffG43a2-c.tdb
+/usr/local/gromacs/top/ffG43a2-n.tdb
+/usr/local/gromacs/top/ffG43b1.itp
+/usr/local/gromacs/top/ffG43b1nb.itp
+/usr/local/gromacs/top/ffG43b1bon.itp
+/usr/local/gromacs/top/ffG43b1.atp
+/usr/local/gromacs/top/ffG43b1.hdb
+/usr/local/gromacs/top/ffG43b1.rtp
+/usr/local/gromacs/top/ffG43b1-c.tdb
+/usr/local/gromacs/top/ffG43b1-n.tdb
+/usr/local/gromacs/top/1mlg.itp
+/usr/local/gromacs/top/2mlg.itp
+/usr/local/gromacs/top/benzamide.itp
+/usr/local/gromacs/top/bondadd.itp
+/usr/local/gromacs/top/buck.itp
+/usr/local/gromacs/top/decane.itp
+/usr/local/gromacs/top/dlg.itp
+/usr/local/gromacs/top/dmso.itp
+/usr/local/gromacs/top/fa.itp
+/usr/local/gromacs/top/ff_dum.itp
+/usr/local/gromacs/top/flexspc.itp
+/usr/local/gromacs/top/flexspce.itp
+/usr/local/gromacs/top/flexwat-ferguson.itp
+/usr/local/gromacs/top/h2p4o13.itp
+/usr/local/gromacs/top/h2p8o25.itp
+/usr/local/gromacs/top/h2po4.itp
+/usr/local/gromacs/top/ions.itp
+/usr/local/gromacs/top/methanol.itp
+/usr/local/gromacs/top/spc.itp
+/usr/local/gromacs/top/spce.itp
+/usr/local/gromacs/top/tfe.itp
+/usr/local/gromacs/top/tip3pgmx.itp
+/usr/local/gromacs/top/tip4pgmx.itp
+/usr/local/gromacs/top/urea.itp
+/usr/local/gromacs/top/dec50.gro
+/usr/local/gromacs/top/dmso.gro
+/usr/local/gromacs/top/spc216.gro
+/usr/local/gromacs/top/tip4p.gro
+/usr/local/gromacs/top/urea+h2o.gro
+/usr/local/gromacs/top/aminoacids.dat
+/usr/local/gromacs/top/atommass.dat
+/usr/local/gromacs/top/bromacs.dat
+/usr/local/gromacs/top/ca-shift.dat
+/usr/local/gromacs/top/cb-shift.dat
+/usr/local/gromacs/top/co-shift.dat
+/usr/local/gromacs/top/edissoc.dat
+/usr/local/gromacs/top/gurgle.dat
+/usr/local/gromacs/top/ha-shift.dat
+/usr/local/gromacs/top/links.dat
+/usr/local/gromacs/top/phbres.dat
+/usr/local/gromacs/top/random.dat
+/usr/local/gromacs/top/refi_aa.dat
+/usr/local/gromacs/top/specbond.dat
+/usr/local/gromacs/top/surface.dat
+/usr/local/gromacs/top/vdwradii.dat
+/usr/local/gromacs/top/xlateat.dat
+/usr/local/gromacs/top/export.dlg
+/usr/local/gromacs/top/bonds.dlg
+/usr/local/gromacs/top/ss.map
+/usr/local/gromacs/top/ps.m2p
+/usr/local/gromacs/top/table6-10.xvg
+/usr/local/gromacs/top/table6-11.xvg
+/usr/local/gromacs/top/table6-12.xvg
+/usr/local/gromacs/top/table6-8.xvg
+/usr/local/gromacs/top/table6-9.xvg
+# examples
+/usr/local/gromacs/share/tutor/cleanit
+/usr/local/gromacs/share/tutor/gmxdemo/cpeptide.pdb
+/usr/local/gromacs/share/tutor/gmxdemo/demo
+/usr/local/gromacs/share/tutor/gmxdemo/demo
+/usr/local/gromacs/share/tutor/nmr1/conf.gro
+/usr/local/gromacs/share/tutor/nmr1/grompp.mdp
+/usr/local/gromacs/share/tutor/nmr1/pep.pdb
+/usr/local/gromacs/share/tutor/nmr1/topol.top
+/usr/local/gromacs/share/tutor/nmr2/conf.gro
+/usr/local/gromacs/share/tutor/nmr2/grompp.mdp
+/usr/local/gromacs/share/tutor/nmr2/pep.pdb
+/usr/local/gromacs/share/tutor/nmr2/topol.top
+/usr/local/gromacs/share/tutor/nmr2/genconf.gcp
+/usr/local/gromacs/share/tutor/water/water.top
+/usr/local/gromacs/share/tutor/water/water.mdp
+/usr/local/gromacs/share/tutor/water/spc216.gro
+/usr/local/gromacs/share/tutor/water/spc216.pdb
+/usr/local/gromacs/share/tutor/water/oxygen.ndx
+/usr/local/gromacs/share/tutor/speptide/speptide.pdb
+/usr/local/gromacs/share/tutor/speptide/pr.mdp
+/usr/local/gromacs/share/tutor/speptide/em.mdp
+/usr/local/gromacs/share/tutor/speptide/full.mdp
+# manual pages
+/usr/local/gromacs/man/
+/usr/local/gromacs/man/man1/
+/usr/local/gromacs/man/man1/g_dih.1
+/usr/local/gromacs/man/man1/g_msd.1
+/usr/local/gromacs/man/man1/g_tcaf.1
+/usr/local/gromacs/man/man1/nmrun.1
+/usr/local/gromacs/man/man1/do_dssp.1
+/usr/local/gromacs/man/man1/g_dipoles.1
+/usr/local/gromacs/man/man1/g_nmeig.1
+/usr/local/gromacs/man/man1/g_traj.1
+/usr/local/gromacs/man/man1/pdb2gmx.1
+/usr/local/gromacs/man/man1/editconf.1
+/usr/local/gromacs/man/man1/g_disre.1
+/usr/local/gromacs/man/man1/g_nmens.1
+/usr/local/gromacs/man/man1/g_velacc.1
+/usr/local/gromacs/man/man1/protonate.1
+/usr/local/gromacs/man/man1/eneconv.1
+/usr/local/gromacs/man/man1/g_dist.1
+/usr/local/gromacs/man/man1/g_order.1
+/usr/local/gromacs/man/man1/genbox.1
+/usr/local/gromacs/man/man1/tpbconv.1
+/usr/local/gromacs/man/man1/g_anaeig.1
+/usr/local/gromacs/man/man1/g_dyndom.1
+/usr/local/gromacs/man/man1/g_potential.1
+/usr/local/gromacs/man/man1/genconf.1
+/usr/local/gromacs/man/man1/trjcat.1
+/usr/local/gromacs/man/man1/g_analyze.1
+/usr/local/gromacs/man/man1/g_enemat.1
+/usr/local/gromacs/man/man1/g_rama.1
+/usr/local/gromacs/man/man1/genion.1
+/usr/local/gromacs/man/man1/trjconv.1
+/usr/local/gromacs/man/man1/g_angle.1
+/usr/local/gromacs/man/man1/g_energy.1
+/usr/local/gromacs/man/man1/g_rdf.1
+/usr/local/gromacs/man/man1/genpr.1
+/usr/local/gromacs/man/man1/trjorder.1
+/usr/local/gromacs/man/man1/g_bond.1
+/usr/local/gromacs/man/man1/g_gyrate.1
+/usr/local/gromacs/man/man1/g_rms.1
+/usr/local/gromacs/man/man1/gmxcheck.1
+/usr/local/gromacs/man/man1/wheel.1
+/usr/local/gromacs/man/man1/g_bundle.1
+/usr/local/gromacs/man/man1/g_h2order.1
+/usr/local/gromacs/man/man1/g_rmsdist.1
+/usr/local/gromacs/man/man1/gmxdump.1
+/usr/local/gromacs/man/man1/x2top.1
+/usr/local/gromacs/man/man1/g_chi.1
+/usr/local/gromacs/man/man1/g_hbond.1
+/usr/local/gromacs/man/man1/g_rmsf.1
+/usr/local/gromacs/man/man1/grompp.1
+/usr/local/gromacs/man/man1/xpm2ps.1
+/usr/local/gromacs/man/man1/g_cluster.1
+/usr/local/gromacs/man/man1/g_helix.1
+/usr/local/gromacs/man/man1/g_rotacf.1
+/usr/local/gromacs/man/man1/highway.1
+/usr/local/gromacs/man/man1/xrama.1
+/usr/local/gromacs/man/man1/g_confrms.1
+/usr/local/gromacs/man/man1/g_lie.1
+/usr/local/gromacs/man/man1/g_saltbr.1
+/usr/local/gromacs/man/man1/make_ndx.1
+/usr/local/gromacs/man/man1/g_covar.1
+/usr/local/gromacs/man/man1/g_mdmat.1
+/usr/local/gromacs/man/man1/g_sas.1
+/usr/local/gromacs/man/man1/mdrun.1
+/usr/local/gromacs/man/man1/g_density.1
+/usr/local/gromacs/man/man1/g_mindist.1
+/usr/local/gromacs/man/man1/g_sgangle.1
+/usr/local/gromacs/man/man1/mk_angndx.1
+/usr/local/gromacs/man/man1/g_morph.1
+/usr/local/gromacs/man/man1/g_sorient.1
+/usr/local/gromacs/man/man1/ngmx.1
+/usr/local/gromacs/man/man1/g_dielectric.1
+# html pages
+/usr/local/gromacs/html/
+/usr/local/gromacs/html/gmxfaq.html
+/usr/local/gromacs/html/online.html
+/usr/local/gromacs/html/gif/
+/usr/local/gromacs/html/gif/annealdn.gif
+/usr/local/gromacs/html/gif/features.gif
+/usr/local/gromacs/html/gif/flow_leftrightup.gif
+/usr/local/gromacs/html/gif/flow_vrule.gif
+/usr/local/gromacs/html/gif/annealup.gif
+/usr/local/gromacs/html/gif/flow_down.gif
+/usr/local/gromacs/html/gif/flow_leftup.gif
+/usr/local/gromacs/html/gif/links.gif
+/usr/local/gromacs/html/gif/articles.gif
+/usr/local/gromacs/html/gif/flow_downleft.gif
+/usr/local/gromacs/html/gif/flow_right+left.gif
+/usr/local/gromacs/html/gif/mail.gif
+/usr/local/gromacs/html/gif/bench.gif
+/usr/local/gromacs/html/gif/flow_hline.gif
+/usr/local/gromacs/html/gif/flow_right.gif
+/usr/local/gromacs/html/gif/manual.gif
+/usr/local/gromacs/html/gif/charts_down.gif
+/usr/local/gromacs/html/gif/flow_left.gif
+/usr/local/gromacs/html/gif/flow_rightleftdown.gif
+/usr/local/gromacs/html/gif/rainbow.gif
+/usr/local/gromacs/html/gif/charts_up.gif
+/usr/local/gromacs/html/gif/flow_leftright.gif
+/usr/local/gromacs/html/gif/flow_uprightleft.gif
+/usr/local/gromacs/html/gif/software.gif
+/usr/local/gromacs/html/gif/faq.gif
+/usr/local/gromacs/html/gif/flow_leftrightdown.gif
+/usr/local/gromacs/html/gif/flow_vline.gif
+/usr/local/gromacs/html/gif/topologies.gif
+/usr/local/gromacs/html/gif/plotje.gif
+/usr/local/gromacs/html/gif/xvgr.gif
+/usr/local/gromacs/html/online/
+/usr/local/gromacs/html/online/edo.html
+/usr/local/gromacs/html/online/g96.html
+/usr/local/gromacs/html/online/log.html
+/usr/local/gromacs/html/online/options.html
+/usr/local/gromacs/html/online/tpa.html
+/usr/local/gromacs/html/online/xvg.html
+/usr/local/gromacs/html/online/edr.html
+/usr/local/gromacs/html/online/m2p.html
+/usr/local/gromacs/html/online/getting_started.html
+/usr/local/gromacs/html/online/out.html
+/usr/local/gromacs/html/online/tpb.html
+/usr/local/gromacs/html/online/ene.html
+/usr/local/gromacs/html/online/gro.html
+/usr/local/gromacs/html/online/map.html
+/usr/local/gromacs/html/online/tpr.html
+/usr/local/gromacs/html/online/eps.html
+/usr/local/gromacs/html/online/hat.html
+/usr/local/gromacs/html/online/mdp.html
+/usr/local/gromacs/html/online/xtc.html
+/usr/local/gromacs/html/online/top.html
+/usr/local/gromacs/html/online/pdb.html
+/usr/local/gromacs/html/online/trj.html
+/usr/local/gromacs/html/online/dat.html
+/usr/local/gromacs/html/online/files.html
+/usr/local/gromacs/html/online/mdp_opt.html
+/usr/local/gromacs/html/online/rtp.html
+/usr/local/gromacs/html/online/include_bot.html
+/usr/local/gromacs/html/online/trr.html
+/usr/local/gromacs/html/online/dlg.html
+/usr/local/gromacs/html/online/flow.html
+/usr/local/gromacs/html/online/mtx.html
+/usr/local/gromacs/html/online/tex.html
+/usr/local/gromacs/html/online/include_top.html
+/usr/local/gromacs/html/online/xpm.html
+/usr/local/gromacs/html/online/edi.html
+/usr/local/gromacs/html/online/g87.html
+/usr/local/gromacs/html/online/itp.html
+/usr/local/gromacs/html/online/ndx.html
+/usr/local/gromacs/html/online/style.css
+/usr/local/gromacs/html/style.css
+# dynamic libraries
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libgmx.so.1.0.0
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libmd.so.1.0.0
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libgmx.so.1
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libmd.so.1
+
+#
+# The header files and static libraries go into gromacs-devel
+# (the name must match the "package devel" declaration above).
+
+%files devel
+# include headers
+/usr/local/gromacs/include/
+/usr/local/gromacs/include/3dview.h
+/usr/local/gromacs/include/do_md.h
+/usr/local/gromacs/include/invblock.h
+/usr/local/gromacs/include/nrjac.h
+/usr/local/gromacs/include/rwtop.h
+/usr/local/gromacs/include/tpxio.h
+/usr/local/gromacs/include/assert.h
+/usr/local/gromacs/include/do_nm.h
+/usr/local/gromacs/include/javaio.h
+/usr/local/gromacs/include/nrnb.h
+/usr/local/gromacs/include/sheader.h
+/usr/local/gromacs/include/transfer.h
+/usr/local/gromacs/include/atomprop.h
+/usr/local/gromacs/include/dummies.h
+/usr/local/gromacs/include/list.h
+/usr/local/gromacs/include/ns.h
+/usr/local/gromacs/include/shift.h
+/usr/local/gromacs/include/trnio.h
+/usr/local/gromacs/include/axp_asm.h
+/usr/local/gromacs/include/ebin.h
+/usr/local/gromacs/include/macros.h
+/usr/local/gromacs/include/nsb.h
+/usr/local/gromacs/include/shift_util.h
+/usr/local/gromacs/include/txtdump.h
+/usr/local/gromacs/include/binio.h
+/usr/local/gromacs/include/edsam.h
+/usr/local/gromacs/include/magic.h
+/usr/local/gromacs/include/nsgrid.h
+/usr/local/gromacs/include/sim_util.h
+/usr/local/gromacs/include/typedefs.h
+/usr/local/gromacs/include/block_tx.h
+/usr/local/gromacs/include/enxio.h
+/usr/local/gromacs/include/main.h
+/usr/local/gromacs/include/pbc.h
+/usr/local/gromacs/include/smalloc.h
+/usr/local/gromacs/include/update.h
+/usr/local/gromacs/include/bondf.h
+/usr/local/gromacs/include/ewald.h
+/usr/local/gromacs/include/maths.h
+/usr/local/gromacs/include/pdbio.h
+/usr/local/gromacs/include/sortwater.h
+/usr/local/gromacs/include/utils.h
+/usr/local/gromacs/include/buffer.h
+/usr/local/gromacs/include/ewald_util.h
+/usr/local/gromacs/include/matio.h
+/usr/local/gromacs/include/pdebug.h
+/usr/local/gromacs/include/split.h
+/usr/local/gromacs/include/vcm.h
+/usr/local/gromacs/include/calcgrid.h
+/usr/local/gromacs/include/fatal.h
+/usr/local/gromacs/include/mdatoms.h
+/usr/local/gromacs/include/physics.h
+/usr/local/gromacs/include/vec.h
+/usr/local/gromacs/include/calch.h
+/usr/local/gromacs/include/ffscanf.h
+/usr/local/gromacs/include/mdebin.h
+/usr/local/gromacs/include/pme.h
+/usr/local/gromacs/include/statusio.h
+/usr/local/gromacs/include/viewit.h
+/usr/local/gromacs/include/calcmu.h
+/usr/local/gromacs/include/fftgrid.h
+/usr/local/gromacs/include/mdrun.h
+/usr/local/gromacs/include/pppm.h
+/usr/local/gromacs/include/statutil.h
+/usr/local/gromacs/include/vveclib.h
+/usr/local/gromacs/include/callf77.h
+/usr/local/gromacs/include/fftw_wrapper.h
+/usr/local/gromacs/include/memdump.h
+/usr/local/gromacs/include/princ.h
+/usr/local/gromacs/include/steep.h
+/usr/local/gromacs/include/wgms.h
+/usr/local/gromacs/include/filenm.h
+/usr/local/gromacs/include/memtab.h
+/usr/local/gromacs/include/pull.h
+/usr/local/gromacs/include/strdb.h
+/usr/local/gromacs/include/wman.h
+/usr/local/gromacs/include/comlib.h
+/usr/local/gromacs/include/force.h
+/usr/local/gromacs/include/memtest.h
+/usr/local/gromacs/include/string2.h
+/usr/local/gromacs/include/writeps.h
+/usr/local/gromacs/include/complex.h
+/usr/local/gromacs/include/futil.h
+/usr/local/gromacs/include/metacode.h
+/usr/local/gromacs/include/random.h
+/usr/local/gromacs/include/struc2.h
+/usr/local/gromacs/include/x86_3dnow.h
+/usr/local/gromacs/include/comtest.h
+/usr/local/gromacs/include/gbutil.h
+/usr/local/gromacs/include/mpiio.h
+/usr/local/gromacs/include/rbin.h
+/usr/local/gromacs/include/superb.h
+/usr/local/gromacs/include/x86_cpu.h
+/usr/local/gromacs/include/tgroup.h
+/usr/local/gromacs/include/general.h
+/usr/local/gromacs/include/mshift.h
+/usr/local/gromacs/include/rdgroup.h
+/usr/local/gromacs/include/symtab.h
+/usr/local/gromacs/include/x86_sse.h
+/usr/local/gromacs/include/confio.h
+/usr/local/gromacs/include/gmxfio.h
+/usr/local/gromacs/include/mvdata.h
+/usr/local/gromacs/include/rdklib.h
+/usr/local/gromacs/include/sync.h
+/usr/local/gromacs/include/xdrf.h
+/usr/local/gromacs/include/constr.h
+/usr/local/gromacs/include/grompp.h
+/usr/local/gromacs/include/names.h
+/usr/local/gromacs/include/readcomp.h
+/usr/local/gromacs/include/synclib.h
+/usr/local/gromacs/include/xtcio.h
+/usr/local/gromacs/include/copyrite.h
+/usr/local/gromacs/include/gstat.h
+/usr/local/gromacs/include/network.h
+/usr/local/gromacs/include/readinp.h
+/usr/local/gromacs/include/sysstuff.h
+/usr/local/gromacs/include/xvgr.h
+/usr/local/gromacs/include/delay.h
+/usr/local/gromacs/include/index.h
+/usr/local/gromacs/include/nhash.h
+/usr/local/gromacs/include/renum.h
+/usr/local/gromacs/include/systest.h
+/usr/local/gromacs/include/disre.h
+/usr/local/gromacs/include/init.h
+/usr/local/gromacs/include/nr.h
+/usr/local/gromacs/include/reorder.h
+/usr/local/gromacs/include/tags.h
+/usr/local/gromacs/include/do_fit.h
+/usr/local/gromacs/include/nrama.h
+/usr/local/gromacs/include/rmpbc.h
+/usr/local/gromacs/include/types/
+/usr/local/gromacs/include/types/atoms.h
+/usr/local/gromacs/include/types/edsams.h
+/usr/local/gromacs/include/types/forcerec.h
+/usr/local/gromacs/include/types/ifunc.h
+/usr/local/gromacs/include/types/mdatom.h
+/usr/local/gromacs/include/types/nsborder.h
+/usr/local/gromacs/include/types/simple.h
+/usr/local/gromacs/include/types/block.h
+/usr/local/gromacs/include/types/energy.h
+/usr/local/gromacs/include/types/graph.h
+/usr/local/gromacs/include/types/inputrec.h
+/usr/local/gromacs/include/types/nblist.h
+/usr/local/gromacs/include/types/nsgrid.h
+/usr/local/gromacs/include/types/symtab.h
+/usr/local/gromacs/include/types/commrec.h
+/usr/local/gromacs/include/types/enums.h
+/usr/local/gromacs/include/types/group.h
+/usr/local/gromacs/include/types/ishift.h
+/usr/local/gromacs/include/types/nbslist.h
+/usr/local/gromacs/include/types/parm.h
+/usr/local/gromacs/include/types/topology.h
+/usr/local/gromacs/include/types/drblock.h
+/usr/local/gromacs/include/types/filenm.h
+/usr/local/gromacs/include/types/idef.h
+/usr/local/gromacs/include/types/matrix.h
+/usr/local/gromacs/include/types/nrnb.h
+/usr/local/gromacs/include/types/pulls.h
+/usr/local/gromacs/include/types/trx.h
+/usr/local/gromacs/share/template/template.c
+/usr/local/gromacs/share/template/README
+/usr/local/gromacs/share/template/Makefile
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libgmx.a
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libmd.a
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libgmx.la
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libmd.la
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libgmx.so
+/usr/local/gromacs/lib/i686-pc-linux-gnu/libmd.so
+
diff --git a/src/contrib/programs.txt b/src/contrib/programs.txt

index 0921d5d2dd33e04e6d63ed49b9d27dd0ccf3e8f6..c02582ca073725393a80f4c1095e3ca072168a78 100644 (file)
--- a/src/contrib/programs.txt
+++ b/src/contrib/programs.txt
@@ -13,6 +13,7 @@ HEAD|Running a simulation
  grompp|makes a run input file
  tpbconv|makes a run input file for restarting a crashed run
  mdrun|performs a simulation
+xmdrun|performs simulations with extra experimental features
  END
  
  HEAD|Viewing trajectories
diff --git a/src/contrib/scripts/Makefile.am b/src/contrib/scripts/Makefile.am

index 84c368ba8b6c839f8e909f02c7cc40194850b70e..bad7242a3105ad3335c926dfe6075c466c932d35 100644 (file)
--- a/src/contrib/scripts/Makefile.am
+++ b/src/contrib/scripts/Makefile.am
@@ -7,6 +7,7 @@
  
  EXTRA_DIST =   grompplog2top   make_gromos_nb.pl       make_gromos_rtp.py  \
                 mkhtml          mkonline                make_gromos_bon.pl  \
-               mkcompl         mknroff                 make_gromos_rtp.pl  
+               mkcompl         mknroff                 make_gromos_rtp.pl  \
+               mktex           GMXRC                   NOGMX
  
  
diff --git a/src/contrib/scripts/mkhtml b/src/contrib/scripts/mkhtml

index 036bb40c3afabfdce728a4e1c701957a1c5026b1..8510757f380910a2c94a2688f0f2b6047857200c 100755 (executable)
--- a/src/contrib/scripts/mkhtml
+++ b/src/contrib/scripts/mkhtml
@@ -35,12 +35,31 @@ touch $HTMLIDX
  
  cat > $HTMLIDX << EOD
  <HTML>
+<HEAD>
  <TITLE>GROMACS $VER Online Reference </TITLE>
  <LINK rel=stylesheet href="online/style.css" type="text/css">
+</HEAD>
-<BODY text="#000000" bgcolor="#FFFFFF" link="#0000EF" vlink="#650065" alink="#FF0000">
-<H2>GROMACS $VER Online Reference</H2>
-<HR>
-<P>
+<BODY text="#000000" bgcolor="#FFFFFF" link="#0000FF" vlink="#990000" alink="#FF0000">
+
+<table WIDTH="800" NOSAVE NOBORDER >
+<tr NOSAVE>
+<td WIDTH="120" HEIGHT="140" NOSAVE><a href="http://www.gromacs.org/">
+<img SRC="gif/gmxlogo_small.jpg" BORDER=0 height=133 width=116></a></td>
+
+<td ALIGN=LEFT VALIGN=TOP WIDTH=480 NOSAVE>
+<br><br>
+<h2>
+GROMACS $VER<br>
+Online Reference</h2>
+</td>
+<td ALIGN=RIGHT VALIGN=BOTTOM WIDTH=200 NOSAVE>
+<B>VERSION $VER<br>
+Tue 15 May 2001</B></td>
+</tr>
+</table>
+
+<hr>
+
  <TABLE BORDER=0 CELLSPACING=0 CELLPADDING=10>
  <TR>
  <TD VALIGN=top WIDTH="25%">
@@ -55,7 +74,6 @@ end
  cat >> $HTMLIDX <<EOD
  <A HREF="gmxfaq.html">FAQ</a>
  <br>
-<br><A HREF="http://www.gromacs.org">GROMACS homepage</A>
  </TD>
  <TD VALIGN=top WIDTH=75%>
  <h3>Programs</h3>
diff --git a/src/gmxlib/Makefile.am b/src/gmxlib/Makefile.am

index 126fbdf9119c25cdc6b7de0804ec2cad22c9ac28..d4e6b54224ce7c12f1362d272fda5046852560a4 100644 (file)
--- a/src/gmxlib/Makefile.am
+++ b/src/gmxlib/Makefile.am
@@ -1,20 +1,20 @@
  ## Process this file with automake to produce Makefile.in
  #
-# Don't edit - this file is generated automatically from Makefile.am
-#
+# Note that Makefile is generated automatically from Makefile.in,
+# which is automatically generated from Makefile.am
  
  include $(srcdir)/../Makefile.inc
  # 
  # This is necessary for VPATH builds (and thus distcheck) to work, 
-# but it can probably included in a nicer way...
+# but it can probably be included in a nicer way...
  INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
  
  # produce the gmx library
  #
  
-lib_LIBRARIES = libgmxXXX_SUFFIX_XXX.a
+lib_LTLIBRARIES = libgmxXXX_SUFFIX_XXX.la
  
-libgmxXXX_SUFFIX_XXX_a_SOURCES = \
+libgmxXXX_SUFFIX_XXX_la_SOURCES = \
         3dview.c        atomprop.c      block_tx.c      bondfree.c      \
         buffer.c        calcgrid.c      calch.c         inner.h         \
         confio.c        copyrite.c      disre.c         do_fit.c        \
@@ -35,11 +35,11 @@ libgmxXXX_SUFFIX_XXX_a_SOURCES = \
         xdrd.c          xtcio.c         xvgr.c          replace.h       \
         x86_cpu.c       
  
-EXTRA_libgmxXXX_SUFFIX_XXX_a_SOURCES = \
-       mpiio.c         libnet.c        x86_3dnow.asm   f77_wrappers.c  \
+EXTRA_libgmxXXX_SUFFIX_XXX_la_SOURCES = \
+       mpiio.c         libnet.c        x86_3dnow.S     f77_wrappers.c  \
         libxdrf.c       ftocstr.c       dumxdrf.c       mgmx.c          \
-       widget.c        widget.h        x86_cpuid.asm   axp_asm.S       \
-       x86_sse.asm     innerc.c        innerf.f
+       widget.c        widget.h        x86_cpuid.S     axp_asm.s       \
+       x86_sse.S       
  
  if USE_FORTRAN
  BUILT_SOURCES = innerf.f
@@ -47,27 +47,14 @@ else
  BUILT_SOURCES = innerc.c
  endif
  
-mpi_obj        = mpiio.o
-libnet_obj     = libnet.o
-inner_f77_obj  = innerf.o      f77_wrappers.o
-inner_c_obj    = innerc.o  
-xdr_obj                = libxdrf.o ftocstr.o
-noxdr_obj      = dumxdrf.o
-motif_obj       = mgmx.o widget.o
-x86_asm_obj     = x86_cpuid.o
-sse_obj         = x86_sse.o
-tdn_obj         = x86_3dnow.o 
-axp_asm_obj     = axp_asm.o
-
-libgmxXXX_SUFFIX_XXX_a_LIBADD = \
-       @PAR_OBJ@       @INNER_F77_OBJ@ @INNER_C_OBJ@   \
-       @AXP_ASM_OBJ@   @X86_ASM_OBJ@   @SSE_OBJ@       \
-       @TDN_OBJ@       @MOTIF_OBJ@     @XDR_OBJ@       
-
-libgmxXXX_SUFFIX_XXX_a_DEPENDENCIES = \
-       @PAR_OBJ@       @INNER_F77_OBJ@ @INNER_C_OBJ@   \
-       @AXP_ASM_OBJ@   @X86_ASM_OBJ@   @SSE_OBJ@       \
-       @TDN_OBJ@       @MOTIF_OBJ@     @XDR_OBJ@       
+#
+# NB: The contents of GMXLIB_COND_OBJ are defined in
+#     the main configure.in script file
+#
+
+libgmxXXX_SUFFIX_XXX_la_LIBADD = @GMXLIB_COND_OBJ@     
+
+libgmxXXX_SUFFIX_XXX_la_DEPENDENCIES = @GMXLIB_COND_OBJ@       
  
  # The inner loops
  innerf.f:      mkinl 
@@ -76,9 +63,21 @@ innerf.f:    mkinl
  innerc.c:      mkinl
                 ./mkinl c
  
-noinst_PROGRAMS  = mkinl
+# The compaq compiler is really stupid and thinks a .S file is some
+# kind of object. Instead it preprocesses the .s files.
+# To make this work with both gcc and the compaq compilers, we
+# do a workaround to make gcc preprocess the .s file instead:
+
+if GNU_CC
+axp_asm.lo:    axp_asm.s
+       $(COMPILE) -x assembler-with-cpp -c $(srcdir)/axp_asm.s
+endif
+
+# Mkinl is special - we can't use a noinst_PROGRAMS target, since it
+# might have to be compiled with a special non-MPI compiler whose files
+# can be executed on the build machine (i.e., not for the target host):
  
-mkinl_SOURCES   = \
+EXTRA_DIST      = \
         mkinl.c                 mkinl_declarations.c    mkinl_outerloop.c  \
         mkinl_innerloop.c       mkinl_calcdist.c        mkinl_invsqrt.c    \
         mkinl_recip.c           mkinl_interactions.c    metacode.c         \
@@ -112,5 +111,5 @@ mkinl:              $(MKINL_OBJ)
                 $(BUILD_COMPILE) -o $@ $(MKINL_OBJ)
  
  # clean things explicitly, since the target names might have changed
-CLEANFILES     = ${lib_LIBRARIES} *_d.a *_mpi.a *~ \\\#* innerc.c innerf.f
+CLEANFILES     = ${lib_LTLIBRARIES} *_d.la *_mpi.la *~ \\\#* innerc.c innerf.f mkinl
  
diff --git a/src/gmxlib/axp_asm.S b/src/gmxlib/axp_asm.s

similarity index 100%

rename from src/gmxlib/axp_asm.S

rename to src/gmxlib/axp_asm.s
diff --git a/src/gmxlib/bfunc.h b/src/gmxlib/bfunc.h

index 8657893ce2362ce849e89f025658d269d1e3daa2..e73c85e164a97bb90758974f5bbe0d456d1a5ca1 100644 (file)
--- a/src/gmxlib/bfunc.h
+++ b/src/gmxlib/bfunc.h
@@ -40,6 +40,21 @@ static char *SRCID_bfunc_h = "$Id$";
   *     Bcopy/Memcpy patch.
   *
  $Log$
+Revision 1.7  2001/06/20 10:34:01  lindahl
+
+Converted assembly to use gcc instead of nasm, updated html man
+pages.
+The x86 assembly loops is now a single option to configure,
+and the single/double prec. is controlled with --enable-float
+(default is yes), to be consistent with fftw.
+Removed the less common options from the summary printed by
+configure, but they are still available.
+Introduced libtool to create both static and dynamic libraries -
+you can control it with configure options. --disable-shared might
+be suitable for development work.
+To avoid compiling both PIC and non-PIC code you can try --with-pic,
+but the default is both.
+
  Revision 1.6  2001/05/14 17:58:06  lindahl
  
  Tagged files with gromacs 3.0 header
@@ -65,6 +80,9 @@ For instance the shared libraries do not work any longer...
   *
   */
  
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
  
  #if defined(SYSVBFUNC)
  #include <memory.h>
diff --git a/src/gmxlib/copyrite.c b/src/gmxlib/copyrite.c

index efcd5fa4a0c651570d1fd284f42c77f125f7353b..a8d65529435872841cc64baff37ea154c5fb0498 100644 (file)
--- a/src/gmxlib/copyrite.c
+++ b/src/gmxlib/copyrite.c
@@ -212,7 +212,8 @@ void CopyRight(FILE *out,char *szProgram)
    fprintf(out,"\n");
    
    for(i=0; (i<NCR); i++) 
-    sp_print(out,CopyrightText[i]);
+    fprintf(out,"  %s\n",CopyrightText[i]);
+
    
    sprintf(buf,"%s",szProgram);
  #ifdef DOUBLE
@@ -319,7 +320,7 @@ void please_cite(FILE *fp,char *key)
      { "Lindahl2001a",
        "E. Lindahl and B. Hess and D. van der Spoel",
        "GROMACS 3.0: A package for molecular simulation and trajectory analysis",
-      "Submitted",
+      "To appear in J. Mol. Mod.",
        0, 2001, 0, 0 }
    };
  #define NSTR (int)asize(citedb)
diff --git a/src/gmxlib/fnbf.c b/src/gmxlib/fnbf.c

index 60264d4756bbefd061e1fa653ba041ccdecfe8bd..59c657a2f45b92b51238374f36d9f5a92a3e8ee9 100644 (file)
--- a/src/gmxlib/fnbf.c
+++ b/src/gmxlib/fnbf.c
@@ -211,10 +211,12 @@ static real *_buf2=NULL;
  #endif
  
  #if (defined USE_SSE || defined USE_3DNOW)
+ 
    if(cpu_capabilities==UNKNOWN_CPU) 
      cpu_capabilities=check_x86cpu(log);
+ 
  #endif
-
+  
    if (eNL >= 0) {
      i0 = eNL;
      i1 = i0+1;
diff --git a/src/gmxlib/mkinl.h b/src/gmxlib/mkinl.h

index ebd3ee345f8c87692b37cdd8533276ab0afef381..14dbbc2d5db862b368b8a1c065b3c84233e82ab9 100644 (file)
--- a/src/gmxlib/mkinl.h
+++ b/src/gmxlib/mkinl.h
@@ -38,7 +38,11 @@
  #define _mkinl_h
  
  static char *SRCID_mkinl_h = "$Id$";
+
+#ifdef HAVE_CONFIG_H
  #include <config.h>
+#endif
+
  #include <types/simple.h>
  #include <metacode.h>
  
diff --git a/src/gmxlib/wman.c b/src/gmxlib/wman.c

index 7f12609583ee57e6bd3d8407aa04726233f849ec..2a663d7a5caa1e400e655a6e7a82e273106ac8f8 100644 (file)
--- a/src/gmxlib/wman.c
+++ b/src/gmxlib/wman.c
@@ -448,18 +448,22 @@ static void write_htmlman(FILE *out,
    
  #define NSR(s) check_html(s,program)
    
-  fprintf(out,"<TITLE>%s</TITLE>\n",program);
+  fprintf(out,"<HTML>\n<HEAD>\n<TITLE>%s</TITLE>\n",program);
    fprintf(out,"<LINK rel=stylesheet href=\"style.css\" type=\"text/css\">\n");
-  fprintf(out,"<BODY text=\"#000000\" bgcolor=\"#FFFFFF\" link=\"#0000EF\" vlink=\"#650065\" alink=\"#FF0000\">\n");
-  fprintf(out,"<H2>%s</H2>\n",program);
-  fprintf(out,"<CENTER><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0 COLS=2 WIDTH=\"98%%\">\n");
-  fprintf(out,"<TR>\n<TD><font size=-1><A HREF=\"../online.html\">Main Table of Contents</A></font></TD>\n");
-  fprintf(out,"<TD ALIGN=RIGHT><B>%s</B></TR>\n",GromacsVersion());
-  fprintf(out,"<TR><TD><font size=-1><A HREF=\"http://www.gromacs.org\">GROMACS homepage</A></font></TD>\n");
-  fprintf(out,"<TD ALIGN=RIGHT><B>%s</B></TR></TABLE></CENTER><HR>\n",mydate());
+  fprintf(out,"<BODY text=\"#000000\" bgcolor=\"#FFFFFF\" link=\"#0000FF\" vlink=\"#990000\" alink=\"#FF0000\">\n");
+  fprintf(out,"<table WIDTH=\"800\" NOBORDER >\n<TR>\n");
+  fprintf(out,"<td WIDTH=\"120\" HEIGHT=\"133\">\n"
+         "<a href=\"http://www.gromacs.org/\">"
+         "<img SRC=\"../gif/gmxlogo_small.jpg\""
+         "BORDER=0 height=133 width=116></a></td>");
+  fprintf(out,"<td ALIGN=LEFT VALIGN=TOP WIDTH=480>"
+         "<br><br><h2>GROMACS Online Reference:<br>%s</h2>",program);
+  fprintf(out,"<font size=-1><A HREF=\"../online.html\">Main Table of Contents</A></font><br>");
+  fprintf(out,"<br></td>\n<TD ALIGN=RIGHT VALIGN=BOTTOM><B>%s<br>\n",GromacsVersion());
+  fprintf(out,"%s</B></td></tr></TABLE>\n<HR>\n",mydate());
    
    if (nldesc > 0) {
-    fprintf(out,"<H3>Description</H3>\n");
+    fprintf(out,"<H3>Description</H3>\n<p>\n");
      for(i=0; (i<nldesc); i++) 
        fprintf(out,"%s\n",NSR(desc[i]));
    }
diff --git a/src/gmxlib/x86_3dnow.S b/src/gmxlib/x86_3dnow.S

new file mode 100644 (file)

index 0000000..5fab0de
--- /dev/null
+++ b/src/gmxlib/x86_3dnow.S
@@ -0,0 +1,16004 @@
+/*
+ *                This source code is part of
+ * 
+ *                 G   R   O   M   A   C   S
+ * 
+ *          GROningen MAchine for Chemical Simulations 
+ *
+ *                        VERSION 3.0
+ * 
+ * Copyright (c) 1991-2001
+ * BIOSON Research Institute, Dept. of Biophysical Chemistry
+ * University of Groningen, The Netherlands
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * 
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ * 
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ * 
+ * Do check out http://www.gromacs.org , or mail us at gromacs@gromacs.org 
+ * 
+ * And Hey:
+ * GROup of MAchos and Cynical Suckers
+ */    
+
+/* This file contains a subset of the gromacs innerloops
+ * manually written in assembly to optimize performance
+ * on AMD extended 3DNow-enabled processors like Athlon 
+ * and later generations. 
+ * Erik Lindahl, 2000-2001, erik@theophys.kth.se
+ *
+ * We use intel syntax for portability. There are probably some GNU-specific
+ * things here, but they are easy to fix.
+ */
+       
+.intel_syntax noprefix
+
+.text
+       
+/* Packed single-precision constants used by the 3DNow loops.
+ * Each 32-bit IEEE-754 pattern is stored twice so that a single 64-bit
+ * movq loads the same value into both halves of an MMX/3DNow register.
+ */
+mm_two:        
+       /* 0x40000000 = 2.0f, duplicated */
+       .long 0x40000000
+       .long 0x40000000
+mm_six:        
+       /* 0x40c00000 = 6.0f, duplicated */
+       .long 0x40c00000
+       .long 0x40c00000
+mm_twelve:     
+       /* 0x41400000 = 12.0f, duplicated */
+       .long 0x41400000
+       .long 0x41400000
+
+       /* align the code that follows the constant table */
+       .align 4
+
+/* check3dnow - probe for Extended 3DNow support by executing pswapd.
+ * Takes no arguments and sets no return value; it simply returns if the
+ * instruction executes.
+ * NOTE(review): presumably the caller installs an illegal-instruction
+ * handler before calling this on CPUs that may lack pswapd - confirm
+ * against the CPU detection code.
+ */
+.globl check3dnow  /* try to issue an Extended 3DNow instruction */
+       .type check3dnow,@function
+check3dnow:    
+       femms                   /* enter MMX/3DNow state */
+       pswapd mm0,mm0          /* the probe instruction itself */
+       femms                   /* leave MMX/3DNow state again */
+       ret
+
+                       
+/* vecrecip_3dnow - elementwise reciprocal of an array of 4-byte floats.
+ *
+ * Stack arguments (read relative to ebp after the frame is set up):
+ *   [ebp +  8]  pointer to the input array  (read through eax)
+ *   [ebp + 12]  pointer to the output array (written through ebx)
+ *   [ebp + 16]  number of elements
+ *
+ * The main loop handles four elements per iteration (count >> 2), the
+ * tail loop handles the remaining (count & 3) elements one at a time.
+ * Each result is a pfrcp seed refined with pfrcpit1/pfrcpit2.
+ * eax, ebx, ecx and edx are saved on entry and restored before return,
+ * so nothing is returned in a register.
+ */
+.globl vecrecip_3dnow
+       .type vecrecip_3dnow,@function
+vecrecip_3dnow:        
+       push ebp
+       mov ebp,esp     
+       push eax
+       push ebx
+       push ecx
+       push edx
+
+       mov eax, [ebp + 8]      /* eax = input pointer */
+       mov ebx, [ebp + 12]     /* ebx = output pointer */
+       mov ecx, [ebp + 16]     /* ecx = element count */
+        mov edx, ecx           /* keep the full count for the tail */
+        shr ecx, 2             /* ecx = number of 4-element iterations */
+        jecxz .vecrecip_tail
+        emms   
+.vecrecip_mainloop:    
+       /* Two 64-bit loads give four floats in mm0 and mm4. pfrcp only
+        * produces a seed for the low element, so pswapd swaps the halves
+        * to get the seed for the high element (and swaps back afterwards);
+        * punpckldq then merges both seeds into one register.
+        */
+        movq mm0,[eax]
+       add eax,  8
+        pfrcp mm1,mm0
+       movq mm4,[eax]
+       pswapd mm0,mm0
+       add eax,  8 
+        pfrcp mm2,mm0
+       pswapd mm0,mm0
+        pfrcp mm5,mm4
+       pswapd mm4,mm4  
+       punpckldq mm1,mm2
+       pfrcp mm6,mm4
+       pswapd mm4,mm4
+       pfrcpit1 mm0,mm1        /* refinement step 1, first pair */
+       punpckldq mm5,mm6       
+       pfrcpit2 mm0,mm1        /* refinement step 2, first pair */
+        movq [ebx],mm0
+       pfrcpit1 mm4,mm5        /* refinement step 1, second pair */
+       add ebx,  8
+       pfrcpit2 mm4,mm5        /* refinement step 2, second pair */
+        movq [ebx],mm4
+       add ebx,  8     
+        dec ecx
+        jecxz .vecrecip_tail
+        jmp short .vecrecip_mainloop
+.vecrecip_tail:
+        mov ecx,edx
+        and ecx,3              /* ecx = leftover elements (0..3) */
+        jecxz .vecrecip_end
+.vecrecip_tailloop:    
+       /* one element per iteration */
+        movd mm0,[eax]
+       add eax,  4
+        pfrcp mm1,mm0
+        pfrcpit1 mm0,mm1
+        pfrcpit2 mm0,mm1
+        movd [ebx],mm0 
+       add ebx,  4
+       dec ecx
+       jecxz .vecrecip_end
+       jmp short .vecrecip_tailloop
+.vecrecip_end: 
+       emms                    /* leave MMX state before returning */
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+       
+
+/* vecinvsqrt_3dnow - elementwise 1/sqrt(x) of an array of 4-byte floats.
+ *
+ * Stack arguments (read relative to ebp after the frame is set up):
+ *   [ebp +  8]  pointer to the input array  (read through eax)
+ *   [ebp + 12]  pointer to the output array (written through ebx)
+ *   [ebp + 16]  number of elements
+ *
+ * The main loop handles four elements per iteration (count >> 2), the
+ * tail loop handles the remaining (count & 3) elements one at a time.
+ * Each result is a pfrsqrt seed refined with the sequence
+ * pfmul (seed*seed) / pfrsqit1 / pfrcpit2.
+ * eax, ebx, ecx and edx are saved on entry and restored before return,
+ * so nothing is returned in a register.
+ */
+.globl vecinvsqrt_3dnow
+       .type vecinvsqrt_3dnow,@function
+vecinvsqrt_3dnow:      
+       push ebp
+       mov ebp,esp     
+       push eax
+       push ebx
+       push ecx
+       push edx
+
+       mov eax, [ebp + 8]      /* eax = input pointer */
+       mov ebx, [ebp + 12]     /* ebx = output pointer */
+       mov ecx, [ebp + 16]     /* ecx = element count */
+        mov edx, ecx           /* keep the full count for the tail */
+        shr ecx, 2             /* ecx = number of 4-element iterations */
+        jecxz .vecinvsqrt_tail
+        emms   
+.vecinvsqrt_mainloop:  
+       /* Two 64-bit loads give four floats in mm0 and mm4. pfrsqrt only
+        * produces a seed for the low element, so pswapd swaps the halves
+        * to get the seed for the high element (and swaps back afterwards);
+        * punpckldq merges both seeds. mm3/mm7 keep copies of the seeds
+        * because the refinement needs both seed and seed*seed.
+        */
+        movq mm0,[eax]
+       add eax,  8
+        pfrsqrt mm1,mm0
+       movq mm4,[eax]
+       pswapd mm0,mm0
+       add eax,  8
+        pfrsqrt mm2,mm0
+       pswapd mm0,mm0
+        pfrsqrt mm5,mm4
+       pswapd mm4,mm4  
+       punpckldq mm1,mm2
+       pfrsqrt mm6,mm4
+       movq mm3,mm1            /* mm3 = seeds, first pair */
+       pswapd mm4,mm4
+       pfmul mm1,mm1           /* mm1 = seed*seed */
+       punpckldq mm5,mm6       
+       pfrsqit1 mm1,mm0
+       movq mm7,mm5            /* mm7 = seeds, second pair */
+       pfrcpit2 mm1,mm3
+       pfmul mm5,mm5           /* mm5 = seed*seed */
+        movq [ebx],mm1
+       pfrsqit1 mm5,mm4
+       add ebx,  8
+       pfrcpit2 mm5,mm7        
+        movq [ebx],mm5
+       add ebx,  8     
+        dec ecx
+        jecxz .vecinvsqrt_tail
+        jmp short .vecinvsqrt_mainloop
+.vecinvsqrt_tail:
+        mov ecx,edx
+        and ecx,3              /* ecx = leftover elements (0..3) */
+        jecxz .vecinvsqrt_end
+.vecinvsqrt_tailloop:  
+       /* one element per iteration, same refinement sequence */
+        movd mm0,[eax]
+       add eax,  4
+        pfrsqrt mm1,mm0
+        movq mm2,mm1
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0
+        pfrcpit2 mm1,mm2
+        movd [ebx],mm1         
+       add ebx,  4
+       dec ecx
+       jecxz .vecinvsqrt_end
+       jmp short .vecinvsqrt_tailloop
+.vecinvsqrt_end:       
+       emms                    /* leave MMX state before returning */
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+       
+
+/* inl0100_3dnow - inner loop for a c6/c12 (Lennard-Jones style) interaction.
+ *
+ * For every i particle the routine loops over its j-particle list, two
+ * j particles at a time (with a single-particle remainder step), and
+ * accumulates forces and the potential vnb12 - vnb6.
+ *
+ * Stack arguments (offsets from ebp, named by the .equ list below):
+ *   nri       number of i particles; the loop body runs before the count
+ *             is tested, so at least one is assumed
+ *   iinr      pointer into the i-particle index list (advanced per i)
+ *   jindex    pointer into the j-list bounds; jindex[n]..jindex[n+1]
+ *   jjnr      base of the j-particle index list
+ *   shift     pointer into the shift index list (advanced per i)
+ *   shiftvec  base of the shift vectors (3 floats each)
+ *   fshift    base of the shift forces  (3 floats each, incremented)
+ *   gid       pointer into the group index list (advanced per i)
+ *   pos       base of the coordinates   (3 floats per particle)
+ *   faction   base of the forces        (3 floats per particle, updated)
+ *   type      base of the particle type array
+ *   ntype     number of types
+ *   nbfp      c6/c12 table, indexed with 2*(ntype*type[i] + type[j])
+ *   Vnb       potential array, indexed by group; incremented
+ *
+ * The pointer-valued arguments iinr, jindex, shift and gid, and the
+ * counter nri, are updated in place in the argument area of the stack.
+ * All memory-destination operations with an immediate operand carry an
+ * explicit "dword ptr" so the operand size is never ambiguous.
+ * eax, ebx, ecx, edx, esi and edi are saved on entry and restored before
+ * return.
+ */
+.globl inl0100_3dnow
+       .type inl0100_3dnow,@function
+inl0100_3dnow: 
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           type,           48
+.equ           ntype,          52
+.equ           nbfp,           56
+.equ           Vnb,            60
+       /* stack offsets for local variables */
+.equ           is3,         0
+.equ           ii3,         4
+.equ           ix,          8
+.equ           iy,         12
+.equ           iz,         16
+.equ           vnbtot,     20  /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,         28  /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,        36  /* repeated (64bit) to fill 3dnow reg */
+.equ           six,        44  /* repeated (64bit) to fill 3dnow reg */
+.equ           twelve,     52  /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,       60
+.equ           innerjjnr,  64
+.equ           innerk,     68          
+.equ           fix,        72
+.equ           fiy,        76
+.equ           fiz,        80
+.equ           dx1,        84
+.equ           dy1,        88
+.equ           dz1,        92
+.equ           dx2,        96
+.equ           dy2,       100
+.equ           dz2,       104                                          
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 108            /* local stack space */
+       femms
+       /* move data to local stack */ 
+       movq  mm0, [mm_six]
+       movq  mm1, [mm_twelve]
+       movq  [esp + six ], mm0
+       movq  [esp + twelve ], mm1
+       /* assume we have at least one i particle - start directly */   
+.i0100_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift], 4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1. */
+       movd  mm1, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx         /* ntia = 2*ntype*type[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       
+       pfadd mm0, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm3, [eax + ebx*4 + 8]    /* can't use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx
+       pfadd mm1, mm3
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+                               
+       /* clear total potential and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vnbtot], mm7
+       movq  [esp + fix],    mm7
+       movd  [esp + fiz],    mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /*  pointer to jjnr[nj0] */
+       sub   edx,  2
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i0100_unroll_loop         /* flags are still those of the sub above */
+       jmp   .i0100_finish_inner
+.i0100_unroll_loop:
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   dword ptr [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */  
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */ 
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */                
+       movq mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */        
+       movq mm6,mm5                    
+       punpckldq mm5,mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6,mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrcp mm0, mm4                  /* lookup reciprocal seed */ 
+        pfrcp mm1, mm6
+ 
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs. */
+                                       /* amd 3dnow N-R iteration to get full precision. */
+        pfrcpit1 mm4,mm0                               
+        pfrcpit2 mm4,mm0       
+       /* mm4 now contains invsq,
+        * do potential and fscal
+        */
+       movq  mm0, mm4
+       pfmul mm4, mm0
+       pfmul mm4, mm0                  /* mm4=rinvsix */
+       movq  mm5, mm4  
+       pfmul mm5, mm5                  /* mm5=rinvtwelve */
+
+       pfmul mm5, [esp + c12]
+       pfmul mm4, [esp + c6]   
+       movq mm6, mm5   /* mm6 is vnb12-vnb6 */ 
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       pfsub mm5,mm4
+       pfmul mm0, mm5        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0
+       punpckhdq mm1,mm1
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */ 
+       pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm6       /* store the sum */
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  2
+       jl    .i0100_finish_inner
+       jmp   .i0100_unroll_loop
+.i0100_finish_inner:   
+       /* innerk is negative here; its lowest bit tells whether one
+        * unpaired j particle is left over
+        */
+       and dword ptr [esp + innerk],  1
+       jnz  .i0100_single_inner
+       jmp  .i0100_updateouterdata             
+.i0100_single_inner:
+       /* a single j particle iteration here - compare with the unrolled code for comments */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov esi, [ebp + nbfp]
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]          /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]           /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm4=rsq */
+       
+        pfrcp mm0,mm4
+        pfrcpit1 mm4,mm0                               
+        pfrcpit2 mm4,mm0       /* mm4=invsq */
+       /* calculate potentials and scalar force */
+       movq  mm0, mm4
+
+       pfmul mm4, mm0
+       pfmul mm4, mm0                  /* mm4=rinvsix */
+       movq  mm5, mm4  
+       pfmul mm5, mm5                  /* mm5=rinvtwelve */
+
+       pfmul mm5, [esp + c12]
+       pfmul mm4, [esp + c6]   
+       movq mm6, mm5   /* mm6 is vnb12-vnb6 */
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       pfsub mm5, mm4
+       pfmul mm0, mm5        /* mm0 is total fscal now */
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm6       /* store the sum */  
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i0100_updateouterdata:        
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]          /* 64-bit read; only the low half is stored below */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vnbtot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnb[gid] */
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i0100_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i0100_outer
+.i0100_end:
+       femms
+       add esp, 108
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+       
+
+
+
+               
+               
+.globl inl0110_3dnow
+       .type inl0110_3dnow,@function
+inl0110_3dnow:  
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           type,           48
+.equ           ntype,          52
+.equ           nbfp,           56      
+.equ           Vnb,            60                              
+.equ           nsatoms,        64              
+       /* stack offsets for local variables */
+.equ           is3,            0
+.equ           ii3,            4
+.equ           shX,            8
+.equ           shY,            12 
+.equ           shZ,            16      
+.equ           ix,             20
+.equ           iy,             24
+.equ           iz,             28      
+.equ           vnbtot,         32 /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             40 /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            48 /* repeated (64bit) to fill 3dnow reg */
+.equ           six,            56 /* repeated (64bit) to fill 3dnow reg */
+.equ           twelve,         64 /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,           72      
+.equ           innerjjnr0,     76
+.equ           innerk0,        80              
+.equ           innerjjnr,      84
+.equ           innerk,         88      
+.equ           fix,            92
+.equ           fiy,            96
+.equ           fiz,            100
+.equ           dx1,            104
+.equ           dy1,            108
+.equ           dz1,            112
+.equ           dx2,            116
+.equ           dy2,            120
+.equ           dz2,            124                                     
+.equ           nsvdwc,         128
+.equ           nscoul,         132
+.equ           nsvdw,          136
+.equ           solnr,          140             
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 144            /* local stack space */
+       femms
+       movq  mm0, [mm_six]
+       movq  mm1, [mm_twelve]
+       movq  [esp + six],    mm0
+       movq  [esp + twelve], mm1
+       /* assume we have at least one i particle - start directly */           
+.i0110_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+       movq  [esp + shX], mm0
+       movd  [esp + shZ], mm1
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   eax, [ebp + nsatoms]
+       add   [ebp + nsatoms],  12
+       mov   ecx, [eax]        
+       mov   edx, [eax + 4]
+       mov   eax, [eax + 8]    
+       sub   ecx, eax
+       sub   eax, edx
+       
+       mov   [esp + nsvdwc], edx
+       mov   [esp + nscoul], eax
+       mov   [esp + nsvdw], ecx
+               
+       /* clear potential */
+       pxor  mm7,mm7
+       movq  [esp + vnbtot], mm7
+       mov   [esp + solnr],  ebx
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       
+       mov   ecx, [esp + nsvdwc]
+       cmp   ecx,  0
+       jnz   .i0110_mno_vdwc
+       jmp   .i0110_testvdw
+.i0110_mno_vdwc:
+       mov   ebx, [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx 
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+       
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i0110_unroll_vdwc_loop
+       jmp   .i0110_finish_vdwc_inner
+.i0110_unroll_vdwc_loop:
+       /* Paired inner loop: two j particles (jnr1 in eax, jnr2 in ebx) are
+        * handled per pass, one in each 32-bit half of the MMX/3DNow! registers.
+        * On entry edi is used as the base of the force array and the i particle
+        * data (ix/iy/iz, ntia, fix/fiy/fiz) live in the local stack slots.
+        * Memory-immediate instructions carry an explicit "dword ptr" so the
+        * operand width never depends on an assembler default.
+        */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]             /* eax/ebx = jnr1/jnr2 */
+       add   dword ptr [esp + innerjjnr], 8   /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+
+       mov   ecx, [ebp + type]
+       mov   edx, [ecx + eax*4]         /* type[jnr1] */
+       mov   ecx, [ecx + ebx*4]         /* type[jnr2] */
+
+       mov   esi, [ebp + nbfp]          /* base of nbfp */
+       shl   edx, 1
+       shl   ecx, 1
+       add   edx, [esp + ntia]          /* tja = ntia + 2*type */
+       add   ecx, [esp + ntia]
+
+       movq  mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */
+       movq  mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */
+       movq  mm6, mm5
+       punpckldq mm5, mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6, mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq  [esp + c6], mm5
+       movq  [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 = 3*jnr */
+       lea   ebx, [ebx + ebx*2]
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0                  /* dr = ir - jr */
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4, mm4                   /* square dx,dy,dz */
+       pfmul mm5, mm5
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */
+       movd  mm7, [esi + ebx*4 + 8]
+
+       pfsubr mm6, mm0                  /* dr = ir - jr */
+       pfsubr mm7, mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6, mm6                   /* square dx,dy,dz */
+       pfmul mm7, mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+       pfrcp mm0, mm4                   /* lookup reciprocal seed */
+       pfrcp mm1, mm6
+
+       punpckldq mm0, mm1
+       punpckldq mm4, mm6               /* now mm4 has rsq and mm0 the seed for both pairs */
+       /* amd 3dnow N-R iteration to get full precision */
+       pfrcpit1 mm4, mm0
+       pfrcpit2 mm4, mm0
+       /* mm4 now contains invsq,
+        * do potential and fscal
+        */
+       movq  mm0, mm4
+       pfmul mm4, mm0
+       pfmul mm4, mm0                   /* mm4 = rinvsix */
+       movq  mm5, mm4
+       pfmul mm5, mm5                   /* mm5 = rinvtwelve */
+
+       pfmul mm5, [esp + c12]
+       pfmul mm4, [esp + c6]
+       movq  mm6, mm5                   /* mm6 becomes vnb12-vnb6 */
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       pfsub mm5, mm4
+       pfmul mm0, mm5                   /* mm0 is total fscal now */
+
+       prefetchw [esp + dx1]            /* prefetch the stored dr slots */
+
+       /* spread fscalar to both positions */
+       movq  mm1, mm0
+       punpckldq mm0, mm0               /* mm0 = fscal of pair 1 in both halves */
+       punpckhdq mm1, mm1               /* mm1 = fscal of pair 2 in both halves */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4]          /* prefetch the 1st faction to cache */
+       movq  mm2, [esp + dx1]           /* fetch dr */
+       movd  mm3, [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]        /* add the earlier value */
+       movq  [esp + vnbtot], mm6        /* store the sum */
+
+       prefetchw [edi + ebx*4]          /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0                   /* mult by fs */
+       pfmul mm3, mm0
+
+       movq  mm4, [esp + dx2]           /* fetch dr */
+       movd  mm5, [esp + dz2]
+       pfmul mm4, mm1                   /* mult by fs */
+       pfmul mm5, mm1
+
+       /* update i forces */
+       movq  mm0, [esp + fix]
+       movd  mm1, [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq  [esp + fix], mm0
+       movd  [esp + fiz], mm1
+
+       /* update j forces */
+       movq  mm0, [edi + eax*4]
+       movd  mm1, [edi + eax*4 + 8]
+       movq  mm6, [edi + ebx*4]
+       movd  mm7, [edi + ebx*4 + 8]
+
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+
+       movq  [edi + eax*4], mm0
+       movd  [edi + eax*4 + 8], mm1
+       movq  [edi + ebx*4], mm6
+       movd  [edi + ebx*4 + 8], mm7
+
+       /* should we do one more iteration? (innerk -= 2, loop while >= 0) */
+       sub   dword ptr [esp + innerk], 2
+       jl    .i0110_finish_vdwc_inner
+       jmp   .i0110_unroll_vdwc_loop
+.i0110_finish_vdwc_inner:
+       /* The paired loop is done. Keep only bit 0 of innerk: if it is set,
+        * one unpaired j particle is left and goes through the single-particle
+        * path. The operand size is stated explicitly (32-bit slot).
+        */
+       and   dword ptr [esp + innerk], 1
+       jnz   .i0110_single_vdwc_inner
+       jmp   .i0110_updateouterdata_vdwc
+.i0110_single_vdwc_inner:      
+       /* A single j particle iteration: same arithmetic as the unrolled loop,
+        * but only the low 32-bit half of each 3DNow! register carries a
+        * meaningful value. */
+       mov   eax, [esp + innerjjnr]    /* pointer to the remaining jjnr entry */
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov esi, [ebp + nbfp]            /* base of nbfp */
+       mov ecx, [ebp + type]            /* base of type */
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 (movd clears the upper half) */
+       movq [esp + c6], mm5            /* c6 slot = c6 / 0 */
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */
+       movq [esp + c12], mm5           /* c12 slot = c12 / 0 */
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr = j3 */
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]        /* fetch j coordinates x/y */
+       movd  mm5, [esi + eax*4 + 8]    /* and z */
+       pfsubr mm4, mm0                 /* dr = ir - jr */
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4          /* store dx/dy */
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5          /* store dz */
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm4=rsq (low half) */
+
+        pfrcp mm0,mm4                  /* reciprocal seed */
+        pfrcpit1 mm4,mm0               /* 3DNow! Newton-Raphson refinement */
+        pfrcpit2 mm4,mm0       /* mm4=invsq */ 
+       /* calculate potentials and scalar force */
+       movq  mm0, mm4
+
+       pfmul mm4, mm0
+       pfmul mm4, mm0                  /* mm4=rinvsix */
+       movq  mm5, mm4  
+       pfmul mm5, mm5                  /* mm5=rinvtwelve */
+
+       pfmul mm5, [esp + c12]
+       pfmul mm4, [esp + c6]   
+       movq mm6, mm5   /* mm6 becomes vnb12-vnb6 */ 
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       pfsub mm5, mm4
+       pfmul mm0, mm5        /* mm0 is total fscal now */
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm6       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+       pfmul mm2, mm0          /* force x/y = dr * fscal */
+       pfmul mm3, mm0          /* force z */
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force (the same vector is subtracted from j) */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! execution continues at the label that follows */
+.i0110_updateouterdata_vdwc:
+       /* Flush the accumulated i force (fix/fiy/fiz) into the force array at
+        * ii3 and into fshift at is3, then count down the nsvdwc counter.
+        */
+       mov   ecx, [esp + ii3]           /* index of the i particle force */
+       mov   ebx, [ebp + fshift]        /* base of fshift[] */
+       mov   edx, [esp + is3]           /* index of the shift force */
+
+       /* increment i force */
+       movq  mm6, [edi + ecx*4]
+       movd  mm7, [edi + ecx*4 + 8]
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4], mm6
+       movd  [edi + ecx*4 + 8], mm7
+
+       /* increment fshift force */
+       movq  mm6, [ebx + edx*4]
+       movd  mm7, [ebx + edx*4 + 8]
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4], mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno while the counter is non-zero */
+       dec   dword ptr [esp + nsvdwc]
+       jz    .i0110_testvdw
+       jmp   .i0110_mno_vdwc
+.i0110_testvdw:
+       /* Advance solnr by nscoul, then enter the vdw-only pass unless its
+        * counter (nsvdw) is zero.
+        */
+       mov   ebx, [esp + nscoul]
+       add   [esp + solnr], ebx
+
+       mov   ecx, [esp + nsvdw]
+       test  ecx, ecx                   /* nsvdw == 0 ? */
+       jnz   .i0110_mno_vdw
+       jmp   .i0110_last_mno
+.i0110_mno_vdw:
+       /* Set up one i particle of the vdw pass: take its index from solnr,
+        * derive ntia and ii3, build the shifted i coordinates, clear the force
+        * accumulators and restart the neighbour pointer / counter.
+        */
+       mov   ecx, [esp + innerjjnr0]    /* saved start of the neighbour list */
+       mov   eax, [ebp + pos]           /* eax = base of pos[] */
+       mov   ebx, [esp + solnr]         /* ebx = ii */
+       inc   dword ptr [esp + solnr]
+
+       mov   edx, [ebp + type]
+       mov   edx, [edx + ebx*4]         /* type[ii] */
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx          /* ntia = 2*ntype*type[ii] */
+
+       lea   ebx, [ebx + ebx*2]         /* ebx = 3*ii = ii3 */
+       mov   [esp + ii3], ebx
+
+       movq  mm0, [eax + ebx*4]         /* x/y of the i particle */
+       movd  mm1, [eax + ebx*4 + 8]     /* z of the i particle */
+       pfadd mm0, [esp + shX]           /* add the shift vector */
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0
+       movd  [esp + iz], mm1
+
+       /* clear forces */
+       pxor  mm7, mm7
+       movq  [esp + fix], mm7
+       movd  [esp + fiz], mm7
+
+       mov   [esp + innerjjnr], ecx     /* rewind the neighbour pointer */
+       mov   edx, [esp + innerk0]
+       sub   edx, 2
+       mov   [esp + innerk], edx        /* number of innerloop atoms, minus 2 */
+       jge   .i0110_unroll_vdw_loop     /* at least two neighbours: paired loop */
+       jmp   .i0110_finish_vdw_inner
+.i0110_unroll_vdw_loop:
+       /* Paired inner loop of the vdw pass: two j particles (jnr1 in eax,
+        * jnr2 in ebx) are handled per pass, one in each 32-bit half of the
+        * MMX/3DNow! registers. edi is used as the base of the force array.
+        * Memory-immediate instructions carry an explicit "dword ptr" so the
+        * operand width never depends on an assembler default.
+        */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]             /* eax/ebx = jnr1/jnr2 */
+       add   dword ptr [esp + innerjjnr], 8   /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+
+       mov   ecx, [ebp + type]
+       mov   edx, [ecx + eax*4]         /* type[jnr1] */
+       mov   ecx, [ecx + ebx*4]         /* type[jnr2] */
+
+       mov   esi, [ebp + nbfp]          /* base of nbfp */
+       shl   edx, 1
+       shl   ecx, 1
+       add   edx, [esp + ntia]          /* tja = ntia + 2*type */
+       add   ecx, [esp + ntia]
+
+       movq  mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */
+       movq  mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */
+       movq  mm6, mm5
+       punpckldq mm5, mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6, mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq  [esp + c6], mm5
+       movq  [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 = 3*jnr */
+       lea   ebx, [ebx + ebx*2]
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0                  /* dr = ir - jr */
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4, mm4                   /* square dx,dy,dz */
+       pfmul mm5, mm5
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */
+       movd  mm7, [esi + ebx*4 + 8]
+
+       pfsubr mm6, mm0                  /* dr = ir - jr */
+       pfsubr mm7, mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6, mm6                   /* square dx,dy,dz */
+       pfmul mm7, mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+       pfrcp mm0, mm4                   /* lookup reciprocal seed */
+       pfrcp mm1, mm6
+
+       punpckldq mm0, mm1
+       punpckldq mm4, mm6               /* now mm4 has rsq and mm0 the seed for both pairs */
+       /* amd 3dnow N-R iteration to get full precision */
+       pfrcpit1 mm4, mm0
+       pfrcpit2 mm4, mm0
+       /* mm4 now contains invsq,
+        * do potential and fscal
+        */
+       movq  mm0, mm4
+       pfmul mm4, mm0
+       pfmul mm4, mm0                   /* mm4 = rinvsix */
+       movq  mm5, mm4
+       pfmul mm5, mm5                   /* mm5 = rinvtwelve */
+
+       pfmul mm5, [esp + c12]
+       pfmul mm4, [esp + c6]
+       movq  mm6, mm5                   /* mm6 becomes vnb12-vnb6 */
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       pfsub mm5, mm4
+       pfmul mm0, mm5                   /* mm0 is total fscal now */
+
+       prefetchw [esp + dx1]            /* prefetch the stored dr slots */
+
+       /* spread fscalar to both positions */
+       movq  mm1, mm0
+       punpckldq mm0, mm0               /* mm0 = fscal of pair 1 in both halves */
+       punpckhdq mm1, mm1               /* mm1 = fscal of pair 2 in both halves */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4]          /* prefetch the 1st faction to cache */
+       movq  mm2, [esp + dx1]           /* fetch dr */
+       movd  mm3, [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]        /* add the earlier value */
+       movq  [esp + vnbtot], mm6        /* store the sum */
+
+       prefetchw [edi + ebx*4]          /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0                   /* mult by fs */
+       pfmul mm3, mm0
+
+       movq  mm4, [esp + dx2]           /* fetch dr */
+       movd  mm5, [esp + dz2]
+       pfmul mm4, mm1                   /* mult by fs */
+       pfmul mm5, mm1
+
+       /* update i forces */
+       movq  mm0, [esp + fix]
+       movd  mm1, [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq  [esp + fix], mm0
+       movd  [esp + fiz], mm1
+
+       /* update j forces */
+       movq  mm0, [edi + eax*4]
+       movd  mm1, [edi + eax*4 + 8]
+       movq  mm6, [edi + ebx*4]
+       movd  mm7, [edi + ebx*4 + 8]
+
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+
+       movq  [edi + eax*4], mm0
+       movd  [edi + eax*4 + 8], mm1
+       movq  [edi + ebx*4], mm6
+       movd  [edi + ebx*4 + 8], mm7
+
+       /* should we do one more iteration? (innerk -= 2, loop while >= 0) */
+       sub   dword ptr [esp + innerk], 2
+       jl    .i0110_finish_vdw_inner
+       jmp   .i0110_unroll_vdw_loop
+.i0110_finish_vdw_inner:
+       /* The paired loop is done. Keep only bit 0 of innerk: if it is set,
+        * one unpaired j particle is left and goes through the single-particle
+        * path. The operand size is stated explicitly (32-bit slot).
+        */
+       and   dword ptr [esp + innerk], 1
+       jnz   .i0110_single_vdw_inner
+       jmp   .i0110_updateouterdata_vdw
+.i0110_single_vdw_inner:       
+       /* A single j particle iteration: same arithmetic as the unrolled loop,
+        * but only the low 32-bit half of each 3DNow! register carries a
+        * meaningful value. */
+       mov   eax, [esp + innerjjnr]    /* pointer to the remaining jjnr entry */
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov esi, [ebp + nbfp]            /* base of nbfp */
+       mov ecx, [ebp + type]            /* base of type */
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 (movd clears the upper half) */
+       movq [esp + c6], mm5            /* c6 slot = c6 / 0 */
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */
+       movq [esp + c12], mm5           /* c12 slot = c12 / 0 */
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr = j3 */
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]        /* fetch j coordinates x/y */
+       movd  mm5, [esi + eax*4 + 8]    /* and z */
+       pfsubr mm4, mm0                 /* dr = ir - jr */
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4          /* store dx/dy */
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5          /* store dz */
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm4=rsq (low half) */
+
+        pfrcp mm0,mm4                  /* reciprocal seed */
+        pfrcpit1 mm4,mm0               /* 3DNow! Newton-Raphson refinement */
+        pfrcpit2 mm4,mm0       /* mm4=invsq */ 
+       /* calculate potentials and scalar force */
+       movq  mm0, mm4
+
+       pfmul mm4, mm0
+       pfmul mm4, mm0                  /* mm4=rinvsix */
+       movq  mm5, mm4  
+       pfmul mm5, mm5                  /* mm5=rinvtwelve */
+
+       pfmul mm5, [esp + c12]
+       pfmul mm4, [esp + c6]   
+       movq mm6, mm5   /* mm6 becomes vnb12-vnb6 */ 
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       pfsub mm5, mm4
+       pfmul mm0, mm5        /* mm0 is total fscal now */
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm6       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+       pfmul mm2, mm0          /* force x/y = dr * fscal */
+       pfmul mm3, mm0          /* force z */
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force (the same vector is subtracted from j) */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! execution continues at the label that follows */
+.i0110_updateouterdata_vdw:
+       /* Flush the accumulated i force (fix/fiy/fiz) into the force array at
+        * ii3 and into fshift at is3, then count down the nsvdw counter.
+        */
+       mov   ecx, [esp + ii3]           /* index of the i particle force */
+       mov   ebx, [ebp + fshift]        /* base of fshift[] */
+       mov   edx, [esp + is3]           /* index of the shift force */
+
+       /* increment i force */
+       movq  mm6, [edi + ecx*4]
+       movd  mm7, [edi + ecx*4 + 8]
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4], mm6
+       movd  [edi + ecx*4 + 8], mm7
+
+       /* increment fshift force */
+       movq  mm6, [ebx + edx*4]
+       movd  mm7, [ebx + edx*4 + 8]
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4], mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno while the counter is non-zero */
+       dec   dword ptr [esp + nsvdw]
+       jz    .i0110_last_mno
+       jmp   .i0110_mno_vdw
+       
+.i0110_last_mno:
+       /* Reached once the nsvdw counter is (or has become) zero: add the
+        * accumulated potential to Vnb[gid], then count down nri and either
+        * finish or start the next outer iteration.
+        */
+       mov   edx, [ebp + gid]           /* edx = pointer into gid[] */
+       mov   edx, [edx]                 /* edx = group index for this i particle */
+       add   dword ptr [ebp + gid], 4   /* advance pointer (explicit 32-bit operand) */
+
+       movq  mm7, [esp + vnbtot]
+       pfacc mm7, mm7                   /* sum the two halves of the total potential */
+
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6         /* increment Vnb[gid] */
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec   ecx
+       jecxz .i0110_end
+       /* not last, iterate once more! */
+       mov   [ebp + nri], ecx
+       jmp   .i0110_outer
+.i0110_end:
+       /* Function exit: leave MMX/3DNow! state, release the locals, restore
+        * the saved registers and return. */
+       femms                   /* fast exit from MMX/3DNow! state */
+       add esp, 144            /* drop local stack space - presumably matches the prologue's sub (not visible here) */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax                /* eax is restored too, so no value is returned in it */
+       leave                   /* mov esp, ebp / pop ebp */
+       ret
+
+
+
+/* inl0300_3dnow - entry point, argument/local offsets and prologue.
+ * The code that follows loops over i particles and their j neighbours and
+ * evaluates tabulated dispersion/repulsion terms (VFtab, scaled by tabscale)
+ * with the c6/c12 parameters from nbfp.
+ * NOTE(review): arguments are addressed as [ebp + offset], yet no
+ * "push ebp" / "mov ebp, esp" appears in this prologue - confirm that a
+ * frame pointer is really established before ebp is used.
+ */
+.globl inl0300_3dnow
+       .type inl0300_3dnow,@function
+inl0300_3dnow: 
+       /* offsets of the arguments, used as [ebp + name] */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           type,           48
+.equ           ntype,          52
+.equ           nbfp,           56      
+.equ           Vnb,            60
+.equ           tabscale,       64
+.equ           VFtab,          68
+       /* stack offsets for local variables, used as [esp + name] */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ix,              8
+.equ           iy,             12
+.equ           iz,             16
+.equ           vnbtot,         20 /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             28 /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            36 /* repeated (64bit) to fill 3dnow reg */
+.equ           two,            44 /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,             52 /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,            60 /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,           68
+.equ           innerjjnr,      72
+.equ           innerk,         76              
+.equ           fix,            80
+.equ           fiy,            84
+.equ           fiz,            88
+.equ           dx1,            92
+.equ           dy1,            96
+.equ           dz1,           100
+.equ           dx2,           104
+.equ           dy2,           108
+.equ           dz2,           112                                              
+       /* save the general registers this routine uses */
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 116            /* local stack space: dz2 (112) + 4 bytes */
+       femms                   /* fast entry into MMX/3DNow! state */
+       /* move data to local stack */ 
+       movq  mm0, [mm_two]             /* 64-bit constant defined elsewhere in the file */
+       movd  mm3, [ebp + tabscale]
+       movq  [esp + two],    mm0
+       punpckldq mm3,mm3               /* tabscale in both halves */
+       movq  [esp + tsc], mm3  
+       /* assume we have at least one i particle - start directly */   
+.i0300_outer:
+       /* Outer loop, one pass per i particle: fetch the shift index and the
+        * particle index, build the shifted i coordinates, clear the potential
+        * and force accumulators, and locate this particle's slice of jjnr[].
+        * The pointer arguments (shift, iinr, jindex) are advanced in place;
+        * those memory-immediate adds state their 32-bit size explicitly.
+        */
+       mov   eax, [ebp + shift]         /* eax = pointer into shift[] */
+       mov   ebx, [eax]                 /* ebx = shift[n] */
+       add   dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]         /* ebx = 3*is */
+       mov   [esp + is3], ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]      /* eax = base of shiftvec[] */
+
+       movq  mm0, [eax + ebx*4]         /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]          /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4  /* advance pointer */
+       mov   ebx, [ecx]                 /* ebx = ii */
+
+       mov   edx, [ebp + type]
+       mov   edx, [edx + ebx*4]         /* type[ii] */
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx          /* ntia = 2*ntype*type[ii] */
+
+       lea   ebx, [ebx + ebx*2]         /* ebx = 3*ii = ii3 */
+       mov   eax, [ebp + pos]           /* eax = base of pos[] */
+
+       pfadd mm0, [eax + ebx*4]         /* ix = shX + posX (and iy too) */
+       movd  mm3, [eax + ebx*4 + 8]     /* iz is only 4 bytes, so load it before adding */
+       mov   [esp + ii3], ebx
+       pfadd mm1, mm3
+       movq  [esp + ix], mm0
+       movd  [esp + iz], mm1
+
+       /* clear total potential and i forces */
+       pxor  mm7, mm7
+       movq  [esp + vnbtot], mm7
+       movq  [esp + fix], mm7
+       movd  [esp + fiz], mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]       /* edi = base of the force array */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx, 2
+       mov   [esp + innerk], edx        /* number of innerloop atoms, minus 2 */
+       jge   .i0300_unroll_loop         /* at least two neighbours: paired loop */
+       jmp   .i0300_finish_inner
+.i0300_unroll_loop:
+       /* Paired inner loop: two j particles (jnr1 in eax, jnr2 in ebx) are
+        * handled per pass, one in each 32-bit half of the MMX/3DNow! registers.
+        * For each pair the table index n0 and fraction eps are derived from
+        * r*tabscale, and the dispersion and repulsion entries of VFtab are
+        * combined with c6 and c12. edi is the base of the force array.
+        * Memory-immediate instructions carry an explicit "dword ptr" so the
+        * operand width never depends on an assembler default.
+        */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]             /* eax/ebx = jnr1/jnr2 */
+       add   dword ptr [esp + innerjjnr], 8   /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+
+       mov   ecx, [ebp + type]
+       mov   edx, [ecx + eax*4]         /* type[jnr1] */
+       mov   ecx, [ecx + ebx*4]         /* type[jnr2] */
+
+       mov   esi, [ebp + nbfp]          /* base of nbfp */
+       shl   edx, 1
+       shl   ecx, 1
+       add   edx, [esp + ntia]          /* tja = ntia + 2*type */
+       add   ecx, [esp + ntia]
+
+       movq  mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */
+       movq  mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */
+       movq  mm6, mm5
+       punpckldq mm5, mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6, mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq  [esp + c6], mm5
+       movq  [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 = 3*jnr */
+       lea   ebx, [ebx + ebx*2]
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0                  /* dr = ir - jr */
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4, mm4                   /* square dx,dy,dz */
+       pfmul mm5, mm5
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */
+       movd  mm7, [esi + ebx*4 + 8]
+
+       pfsubr mm6, mm0                  /* dr = ir - jr */
+       pfsubr mm7, mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6, mm6                   /* square dx,dy,dz */
+       pfmul mm7, mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+       pfrsqrt mm0, mm4                 /* lookup inverse square root seed */
+       pfrsqrt mm1, mm6
+
+       punpckldq mm0, mm1
+       punpckldq mm4, mm6               /* now mm4 has rsq and mm0 the seed for both pairs */
+       movq  mm2, mm0                   /* amd 3dnow N-R iteration to get full precision */
+       pfmul mm0, mm0
+       pfrsqit1 mm0, mm4
+       pfrcpit2 mm0, mm2
+       pfmul mm4, mm0                   /* rsq * invsqrt = r */
+       movq  mm1, mm4
+       /* mm0 is invsqrt, and mm1 r */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]           /* mm1 = rt */
+       pf2iw mm4, mm1
+       movq  [esp + n1], mm4            /* integer table indices for both pairs */
+       pi2fd mm4, mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq  mm2, mm1
+       pfmul mm2, mm2                   /* mm1 is eps, mm2 is eps2 */
+
+       mov   edx, [ebp + VFtab]
+       /* dispersion table */
+       mov   ecx, [esp + n1]
+       shl   ecx, 3                     /* 8 floats per table point */
+       /* load all the table values we need */
+       movd  mm4, [edx + ecx*4]
+       movd  mm5, [edx + ecx*4 + 4]
+       movd  mm6, [edx + ecx*4 + 8]
+       movd  mm7, [edx + ecx*4 + 12]
+       mov   ecx, [esp + n1 + 4]        /* index of the second pair */
+       shl   ecx, 3
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+       pfmul mm6, mm1                   /* mm6 = Geps */
+       pfmul mm7, mm2                   /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7                   /* mm5 = Fp */
+       pfmul mm7, [esp + two]           /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5                   /* mm7 = FF */
+       pfmul mm5, mm1                   /* mm5 = eps*Fp */
+       pfadd mm5, mm4                   /* mm5 = VV */
+
+       movq  mm4, [esp + c6]
+       pfmul mm7, mm4                   /* fijD */
+       pfmul mm5, mm4                   /* vnb6 */
+       movq  mm3, mm7                   /* start fscal with fijD */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]        /* add the earlier value */
+       movq  [esp + vnbtot], mm5        /* store the sum */
+
+       /* repulsion table */
+       mov   ecx, [esp + n1]
+       shl   ecx, 3
+       /* load all the table values we need */
+       movd  mm4, [edx + ecx*4 + 16]
+       movd  mm5, [edx + ecx*4 + 20]
+       movd  mm6, [edx + ecx*4 + 24]
+       movd  mm7, [edx + ecx*4 + 28]
+       mov   ecx, [esp + n1 + 4]
+       shl   ecx, 3
+       punpckldq mm4, [edx + ecx*4 + 16]
+       punpckldq mm5, [edx + ecx*4 + 20]
+       punpckldq mm6, [edx + ecx*4 + 24]
+       punpckldq mm7, [edx + ecx*4 + 28]
+
+       pfmul mm6, mm1                   /* mm6 = Geps */
+       pfmul mm7, mm2                   /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7                   /* mm5 = Fp */
+       pfmul mm7, [esp + two]           /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5                   /* mm7 = FF */
+       pfmul mm5, mm1                   /* mm5 = eps*Fp */
+       pfadd mm5, mm4                   /* mm5 = VV */
+
+       movq  mm6, [esp + c12]
+       pfmul mm7, mm6                   /* fijR */
+       pfmul mm5, mm6                   /* vnb12 */
+       pfadd mm3, mm7                   /* total fscal fijD+fijR */
+
+       /* change sign of mm3 */
+       pxor  mm1, mm1
+       pfsub mm1, mm3
+       pfmul mm1, [esp + tsc]
+       pfmul mm0, mm1                   /* mm0 is total fscal now */
+
+       prefetchw [esp + dx1]            /* prefetch the stored dr slots */
+
+       /* spread fscalar to both positions */
+       movq  mm1, mm0
+       punpckldq mm0, mm0               /* mm0 = fscal of pair 1 in both halves */
+       punpckhdq mm1, mm1               /* mm1 = fscal of pair 2 in both halves */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4]          /* prefetch the 1st faction to cache */
+       movq  mm2, [esp + dx1]           /* fetch dr */
+       movd  mm3, [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]        /* add the earlier value */
+       movq  [esp + vnbtot], mm5        /* store the sum */
+
+       prefetchw [edi + ebx*4]          /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0                   /* mult by fs */
+       pfmul mm3, mm0
+
+       movq  mm4, [esp + dx2]           /* fetch dr */
+       movd  mm5, [esp + dz2]
+       pfmul mm4, mm1                   /* mult by fs */
+       pfmul mm5, mm1
+
+       /* update i forces */
+       movq  mm0, [esp + fix]
+       movd  mm1, [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq  [esp + fix], mm0
+       movd  [esp + fiz], mm1
+
+       /* update j forces */
+       movq  mm0, [edi + eax*4]
+       movd  mm1, [edi + eax*4 + 8]
+       movq  mm6, [edi + ebx*4]
+       movd  mm7, [edi + ebx*4 + 8]
+
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+
+       movq  [edi + eax*4], mm0
+       movd  [edi + eax*4 + 8], mm1
+       movq  [edi + ebx*4], mm6
+       movd  [edi + ebx*4 + 8], mm7
+
+       /* should we do one more iteration? (innerk -= 2, loop while >= 0) */
+       sub   dword ptr [esp + innerk], 2
+       jl    .i0300_finish_inner
+       jmp   .i0300_unroll_loop
+.i0300_finish_inner:
+       /* The paired loop is done. Keep only bit 0 of innerk: if it is set,
+        * one unpaired j particle is left and goes through the single-particle
+        * path. The operand size is stated explicitly (32-bit slot).
+        */
+       and   dword ptr [esp + innerk], 1
+       jnz   .i0300_single_inner
+       jmp   .i0300_updateouterdata
+.i0300_single_inner:
+       /* a single j particle iteration here (tabulated dispersion + repulsion) - compare with the unrolled code for comments */
+       mov   eax, [esp + innerjjnr]    /* pointer to the current jjnr entry */
+       mov   eax, [eax]        /* eax=jnr */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5            /* movd cleared the upper half, so c6 slot = (c6, 0) */
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5           /* c12 slot = (c12, 0) */
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]        /* replace jnr with j3 = 3*jnr */
+
+       movq  mm0, [esp + ix]           /* mm0 = ix/iy */
+       movd  mm1, [esp + iz]           /* mm1 = iz */
+       movq  mm4, [esi + eax*4]        /* mm4 = jx/jy */
+       movd  mm5, [esi + eax*4 + 8]    /* mm5 = jz */
+       pfsubr mm4, mm0                 /* dr = ir - jr */
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4          /* store dx/dy */
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5          /* store dz */
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* low half of mm4 = rsq = dx*dx+dy*dy+dz*dz */
+       
+        pfrsqrt mm0,mm4        /* lookup inverse square root seed */
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm0=invsqrt after the N-R refinement */
+       pfmul mm4, mm0          /* r = rsq * invsqrt */
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r */
+
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1           /* integer part of rt = table index */
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 3              /* 8 floats per table point: 4 dispersion + 4 repulsion */
+       /* dispersion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       movq mm3, mm7   /* add to fscal */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijD+fijR (no coulomb term in this loop) */
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm0, [esp + tsc]  /* invsqrt * tabscale */
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+       pfmul mm2, mm0          /* mm2 = fx/fy */
+       pfmul mm3, mm0          /* mm3 = fz */
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i0300_updateouterdata:        /* flush the accumulated i force and potential, then loop or finish */
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]          /* 64-bit load; only the low dword (z) is stored back below */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4  /* advance pointer; size stated explicitly since no register operand implies it */
+
+       movq  mm7, [esp + vnbtot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnb[gid] */
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i0300_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx    /* the nri argument slot doubles as the outer loop counter */
+       jmp .i0300_outer
+.i0300_end:    /* function exit for inl0300_3dnow */
+       femms           /* leave MMX/3DNow! state so x87 code can run again */
+       add esp, 116    /* release the local stack area - presumably matches the sub esp in the prologue (not shown here) */
+       pop edi         /* restore the saved general registers */
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave           /* mov esp,ebp / pop ebp */
+       ret
+
+
+                       
+       
+.globl inl0310_3dnow
+       .type inl0310_3dnow,@function
+inl0310_3dnow: /* tabulated dispersion/repulsion loop; arguments are read from the stack at the ebp offsets below */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           type,           48
+.equ           ntype,          52
+.equ           nbfp,           56      
+.equ           Vnb,            60
+.equ           tabscale,       64
+.equ           VFtab,          68
+.equ           nsatoms,        72              
+       /* stack offsets for local variables (relative to esp after the sub below) */
+.equ           is3,            0
+.equ           ii3,            4
+.equ           shX,            8
+.equ           shY,           12 
+.equ           shZ,           16       
+.equ           ix,            20
+.equ           iy,            24
+.equ           iz,            28       
+.equ           vnbtot,        32 /* 64bit: two partial sums, added together with pfacc at the end */
+.equ           c6,            40 /* 64bit: c6 for 1st / 2nd j particle */
+.equ           c12,           48 /* 64bit: c12 for 1st / 2nd j particle */
+.equ           two,           56 /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,            64 /* 64bit: table index for 1st / 2nd j particle */
+.equ           tsc,           72 /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,          80       
+.equ           innerjjnr0,    84
+.equ           innerk0,       88               
+.equ           innerjjnr,     92
+.equ           innerk,        96       
+.equ           fix,          100
+.equ           fiy,          104
+.equ           fiz,          108
+.equ           dx1,          112
+.equ           dy1,          116
+.equ           dz1,          120
+.equ           dx2,          124
+.equ           dy2,          128
+.equ           dz2,          132                                                               
+.equ           nsvdwc,       136
+.equ           nscoul,       140
+.equ           nsvdw,        144
+.equ           solnr,        148               
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 152            /* local stack space (covers offsets 0..151 above) */
+       femms                   /* fast clear of the MMX/3DNow! state before use */
+       movq  mm0, [mm_two]     /* mm_two is defined elsewhere - presumably 2.0 in both halves */
+       movd  mm3, [ebp + tabscale]
+       movq  [esp + two],    mm0
+       punpckldq mm3,mm3       /* duplicate tabscale into both halves */
+       movq  [esp + tsc], mm3  
+       
+       /* assume we have at least one i particle - start directly */           
+.i0310_outer:  /* per-neighbourlist setup: shift vector, atom counts, j list, then dispatch */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step (size given explicitly) */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+       movq  [esp + shX], mm0
+       movd  [esp + shZ], mm1
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   eax, [ebp + nsatoms]      /* three ints per entry */
+       add   dword ptr [ebp + nsatoms],  12    /* advance to the next triple */
+       mov   ecx, [eax]        
+       mov   edx, [eax + 4]
+       mov   eax, [eax + 8]    
+       sub   ecx, eax                  /* ecx = nsatoms[0] - nsatoms[2] */
+       sub   eax, edx                  /* eax = nsatoms[2] - nsatoms[1] */
+       
+       mov   [esp + nsvdwc], edx
+       mov   [esp + nscoul], eax
+       mov   [esp + nsvdw], ecx
+               
+       /* clear potential */
+       pxor  mm7,mm7
+       movq  [esp + vnbtot], mm7
+       mov   [esp + solnr],  ebx       /* solnr starts at ii and is stepped per atom */
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      /* edi stays the force base for the inner loops */
+       
+       mov   ecx, [esp + nsvdwc]
+       cmp   ecx,  0
+       jnz   .i0310_mno_vdwc
+       jmp   .i0310_testvdw            /* no atoms of the first kind: skip straight to the vdw-only test */
+.i0310_mno_vdwc:       /* set up one i atom (index solnr), then run the j loop over the shared j list */
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]   /* next pass uses the following atom */
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx         /* ntia = 2*ntype*type[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+       
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0           /* ix/iy = position + shift */
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx    /* restart at the first j entry */
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* number of innerloop atoms minus 2; mov leaves the flags from sub intact */
+       jge   .i0310_unroll_vdwc_loop   /* at least two j atoms: take the paired loop */
+       jmp   .i0310_finish_vdwc_inner
+.i0310_unroll_vdwc_loop:       
+       /* paired innerloop starts here: two j particles per pass, one in each half of the mm registers */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   dword ptr [esp + innerjjnr],  8 /* advance pointer (unrolled 2); operand size given explicitly */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+       
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */ 
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */                
+       movq mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */        
+       movq mm6,mm5                    
+       punpckldq mm5,mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6,mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs. */
+        movq mm2,mm0                   /* amd 3dnow N-R iteration to get full precision */
+       pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       
+       pfmul mm4, mm0                  /* r = rsq * invsqrt */
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movq [esp + n1], mm4            /* both table indices, read back as integers below */
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+       
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       /* dispersion table */
+       mov ecx, [esp + n1]
+       shl ecx, 3              /* 8 floats per table point */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]         /* index for the second j particle */
+       shl ecx, 3
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       movq mm3, mm7   /* add to fscal */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       mov ecx, [esp + n1]
+       shl ecx, 3
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+       mov ecx, [esp + n1 + 4]
+       shl ecx, 3
+       punpckldq mm4, [edx + ecx*4 + 16]
+       punpckldq mm5, [edx + ecx*4 + 20]
+       punpckldq mm6, [edx + ecx*4 + 24]
+       punpckldq mm7, [edx + ecx*4 + 28]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijD+fijR */
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm1, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch the stack line holding dx1 (the fix/fiz slots sit 12 bytes below it) */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0       /* mm0 = fscal of 1st pair in both halves */
+       punpckhdq mm1,mm1       /* mm1 = fscal of 2nd pair in both halves */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+               
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  2      /* signed 32-bit counter; the jl below depends on the dword result */
+       jl    .i0310_finish_vdwc_inner
+       jmp   .i0310_unroll_vdwc_loop
+.i0310_finish_vdwc_inner:      /* innerk is negative here; its low bit tells whether one j particle is left */
+       and dword ptr [esp + innerk],  1        /* operand size given explicitly */
+       jnz  .i0310_single_vdwc_inner
+       jmp  .i0310_updateouterdata_vdwc                
+.i0310_single_vdwc_inner:      
+       /* a single j particle iteration here (odd remainder of the paired loop) - compare with the unrolled code for comments */
+       mov   eax, [esp + innerjjnr]    /* pointer to the current jjnr entry */
+       mov   eax, [eax]        /* eax=jnr */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5            /* movd cleared the upper half, so c6 slot = (c6, 0) */
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5           /* c12 slot = (c12, 0) */
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]        /* replace jnr with j3 = 3*jnr */
+
+       movq  mm0, [esp + ix]           /* mm0 = ix/iy */
+       movd  mm1, [esp + iz]           /* mm1 = iz */
+       movq  mm4, [esi + eax*4]        /* mm4 = jx/jy */
+       movd  mm5, [esi + eax*4 + 8]    /* mm5 = jz */
+       pfsubr mm4, mm0                 /* dr = ir - jr */
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4          /* store dx/dy */
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5          /* store dz */
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* low half of mm4 = rsq = dx*dx+dy*dy+dz*dz */
+       
+        pfrsqrt mm0,mm4        /* lookup inverse square root seed */
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm0=invsqrt after the N-R refinement */
+       pfmul mm4, mm0          /* r = rsq * invsqrt */
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r */
+
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1           /* integer part of rt = table index */
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 3              /* 8 floats per table point: 4 dispersion + 4 repulsion */
+       /* dispersion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       movq mm3, mm7   /* add to fscal */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijD+fijR (no coulomb term in this loop) */
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm0, [esp + tsc]  /* invsqrt * tabscale */
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+       pfmul mm2, mm0          /* mm2 = fx/fy */
+       pfmul mm3, mm0          /* mm3 = fz */
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i0310_updateouterdata_vdwc:   /* add this i atom's accumulated force to faction and fshift, then go on to the next atom */
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]          /* 64-bit load; only the low dword (z) is stored back below */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno */
+       dec dword ptr [esp + nsvdwc]    /* one fewer atom of this kind left */
+       jz  .i0310_testvdw
+       jmp .i0310_mno_vdwc
+.i0310_testvdw:        /* skip past the nscoul atoms (no loop is run for them here), then test for vdw-only atoms */
+       mov  ebx,  [esp + nscoul]
+       add  [esp + solnr],  ebx        /* solnr += nscoul */
+
+       mov  ecx, [esp + nsvdw]
+       cmp  ecx,  0
+       jnz  .i0310_mno_vdw
+       jmp  .i0310_last_mno            /* no vdw-only atoms */
+.i0310_mno_vdw:        /* set up one vdw-only i atom (index solnr), then run the j loop over the shared j list */
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]   /* next pass uses the following atom */
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx         /* ntia = 2*ntype*type[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+       
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0           /* ix/iy = position + shift */
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx    /* restart at the first j entry */
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* number of innerloop atoms minus 2; mov leaves the flags from sub intact */
+       jge   .i0310_unroll_vdw_loop    /* at least two j atoms: take the paired loop */
+       jmp   .i0310_finish_vdw_inner
+.i0310_unroll_vdw_loop:        
+       /* paired innerloop starts here: two j particles per pass, one in each half of the mm registers */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   dword ptr [esp + innerjjnr],  8 /* advance pointer (unrolled 2); operand size given explicitly */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+       
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */ 
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */                
+       movq mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */        
+       movq mm6,mm5                    
+       punpckldq mm5,mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6,mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs */
+        movq mm2,mm0                   /* amd 3dnow N-R iteration to get full precision */
+       pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       
+       pfmul mm4, mm0                  /* r = rsq * invsqrt */
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movq [esp + n1], mm4            /* both table indices, read back as integers below */
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+       
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       /* dispersion table */
+       mov ecx, [esp + n1]
+       shl ecx, 3              /* 8 floats per table point */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]         /* index for the second j particle */
+       shl ecx, 3
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       movq mm3, mm7   /* add to fscal */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       mov ecx, [esp + n1]
+       shl ecx, 3
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+       mov ecx, [esp + n1 + 4]
+       shl ecx, 3
+       punpckldq mm4, [edx + ecx*4 + 16]
+       punpckldq mm5, [edx + ecx*4 + 20]
+       punpckldq mm6, [edx + ecx*4 + 24]
+       punpckldq mm7, [edx + ecx*4 + 28]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijD+fijR */
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm1, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch the stack line holding dx1 (the fix/fiz slots sit 12 bytes below it) */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0       /* mm0 = fscal of 1st pair in both halves */
+       punpckhdq mm1,mm1       /* mm1 = fscal of 2nd pair in both halves */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  2      /* signed 32-bit counter; the jl below depends on the dword result */
+       jl    .i0310_finish_vdw_inner
+       jmp   .i0310_unroll_vdw_loop
+.i0310_finish_vdw_inner:       /* innerk is negative here; its low bit tells whether one j particle is left */
+       and dword ptr [esp + innerk],  1        /* operand size given explicitly */
+       jnz  .i0310_single_vdw_inner
+       jmp  .i0310_updateouterdata_vdw         
+.i0310_single_vdw_inner:       
+       /* a single j particle iteration here - compare with the unrolled code for comments */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov esi, [ebp + nbfp]
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm0=rsq */
+       
+        pfrsqrt mm0,mm4
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm1=invsqrt */
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r */
+
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 3
+       /* dispersion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       movq mm3, mm7   /* add to fscal */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijC+fijD+fijR */
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm0, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i0310_updateouterdata_vdw:    
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno */
+       dec dword ptr [esp + nsvdw]
+       jz  .i0310_last_mno
+       jmp .i0310_mno_vdw
+       
+.i0310_last_mno:       
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vnbtot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i0310_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i0310_outer
+.i0310_end:
+       femms
+       add esp, 152
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+/*
+ * inl1000_3dnow - Coulomb-only nonbonded inner loop using AMD 3DNow!.
+ *
+ * For each i particle the routine walks its j neighbours (two per pass,
+ * plus an optional single-particle tail) and for every pair computes
+ *      vcoul = facel*charge[i]*charge[j] * invsqrt(rsq)
+ *      fscal = vcoul * invsq
+ * The pair force dr*fscal is added to the i force accumulator and
+ * subtracted from faction[j]; after the j loop the i force is added to
+ * faction[i] and to fshift[is], and the summed potential to Vc[gid].
+ *
+ * Arguments are read from the stack relative to ebp (offsets below).
+ * The pointer arguments shift, iinr, jindex and gid are advanced in
+ * place in the argument area, and nri is used as the outer-loop counter.
+ * The first outer iteration runs unconditionally, so nri must be >= 1.
+ * eax, ebx, ecx, edx, esi and edi are saved and restored; femms is
+ * issued on entry and before returning.
+ */
+.globl inl1000_3dnow
+       .type inl1000_3dnow,@function
+inl1000_3dnow: 
+       /* stack arguments, offsets relative to ebp */
+.equ           nri,            8       /* number of i particles (outer-loop counter) */
+.equ           iinr,           12      /* pointer into iinr[]: index of each i particle */
+.equ           jindex,         16      /* pointer into jindex[]: start/end positions in jjnr[] */
+.equ           jjnr,           20      /* base of jjnr[]: j particle indices */
+.equ           shift,          24      /* pointer into shift[]: shift index per i particle */
+.equ           shiftvec,       28      /* base of shiftvec[] (3 floats per entry) */
+.equ           fshift,         32      /* base of fshift[] (3 floats per entry) */
+.equ           gid,            36      /* pointer into gid[]: group index per i particle */
+.equ           pos,            40      /* base of pos[] (3 floats per particle) */
+.equ           faction,        44      /* base of faction[] (3 floats per particle) */
+.equ           charge,         48      /* base of charge[] */
+.equ           facel,          52      /* float scale factor applied to charge[ii] */
+.equ           Vc,             56      /* base of Vc[], indexed by group index */
+       /* stack offsets for local variables */
+.equ           is3,             0      /* 3*shift index */
+.equ           ii3,             4      /* 3*i index */
+.equ           ix,              8
+.equ           iy,             12
+.equ           iz,             16
+.equ           iq,             20      /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          28      /* repeated (64bit) to fill 3dnow reg */
+.equ           innerjjnr,      36      /* running pointer into jjnr[] */
+.equ           innerk,         40      /* remaining j count, biased by -2 */
+.equ           fix,            44
+.equ           fiy,            48
+.equ           fiz,            52
+.equ           dx1,            56
+.equ           dy1,            60
+.equ           dz1,            64
+.equ           dx2,            68
+.equ           dy2,            72
+.equ           dz2,            76
+       push ebp
+       mov ebp,esp
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub esp, 80             /* 80 bytes local stack space */
+       femms
+       /* assume we have at least one i particle - start directly */
+.i1000_outer:
+       mov   eax, [ebp + shift]        /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift], 4   /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]     /* eax = base of shiftvec[] */
+
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4    /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       /* 64-bit operand: the high half is multiplied by the dword that
+        * follows facel on the stack, but it is discarded by the
+        * punpckldq below. */
+       pfmul mm2, [ebp + facel]
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]          /* eax = base of pos[] */
+
+       pfadd mm0, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm3, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx
+       pfadd mm1, mm3
+       movq  [esp + ix], mm0
+       movd  [esp + iz], mm1
+
+       /* clear vctot and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       movq  [esp + fix],   mm7        /* clears fix and fiy */
+       movd  [esp + fiz],   mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]           /* esi = pos[] base for the inner loop */
+       mov   edi, [ebp + faction]       /* edi = faction[] base for the inner loop */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  2
+       mov   [esp + innerk], edx        /* number of innerloop atoms (minus 2) */
+       /* mov leaves the flags from "sub edx, 2" intact */
+       jge   .i1000_unroll_loop
+       jmp   .i1000_finish_inner
+.i1000_unroll_loop:
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   dword ptr [esp + innerjjnr], 8  /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+
+       mov ecx, [ebp + charge]          /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+       movd mm7, [ecx + ebx*4]          /* charge[jnr2] */
+       punpckldq mm3,mm7                /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4,mm0                   /* dr = ir - jr */
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */
+       pfmul mm5,mm5
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */
+       movd  mm7, [esi + ebx*4 + 8]
+
+       pfsubr mm6,mm0                   /* dr = ir - jr */
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+       pfrsqrt mm0, mm4                 /* lookup inverse square root seed */
+       pfrsqrt mm1, mm6
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6                /* now mm4 has rsq and mm0 the seed for both pairs */
+       movq mm2,mm0                     /* amd 3dnow N-R iteration to get full precision */
+       pfmul mm0,mm0
+       pfrsqit1 mm0,mm4
+       pfrcpit2 mm0,mm2
+       movq mm1,mm0
+       pfmul mm0,mm0
+       /* mm0 now contains invsq, and mm1 invsqrt
+        * do potential and fscal
+        */
+       prefetchw [esp + dx1]            /* prefetch the stack scratch area at dx1 */
+
+       pfmul mm3,mm1                    /* mm3 has both vcoul */
+       pfmul mm0,mm3                    /* mm0 has both fscal */
+
+       /* update vctot */
+
+       pfadd mm3, [esp + vctot]         /* add the earlier value */
+       movq [esp + vctot], mm3          /* store the sum */
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0                /* mm0 = fscal of pair 1 in both halves */
+       punpckhdq mm1,mm1                /* mm1 = fscal of pair 2 in both halves */
+       /* calc vector force */
+       prefetchw [edi + eax*4]          /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]           /* fetch dr */
+       movd mm3,  [esp + dz1]
+       prefetchw [edi + ebx*4]          /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0                   /* mult by fs */
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]           /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1                   /* mult by fs */
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk], 2
+       jl    .i1000_finish_inner
+       jmp   .i1000_unroll_loop
+.i1000_finish_inner:
+       /* innerk is negative here; its low bit is set when one j particle is left */
+       and   dword ptr [esp + innerk], 1
+       jnz  .i1000_single_inner
+       jmp  .i1000_updateouterdata
+.i1000_single_inner:
+       /* a single j particle iteration here - compare with the unrolled code for comments */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]                 /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm6, [esp + iq]
+       movd mm7, [ecx + eax*4]
+       pfmul mm6, mm7                   /* mm6=qq */
+
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm2, [esi + eax*4]
+       movd  mm3, [esi + eax*4 + 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq  [esp + dx1], mm0
+       pfmul mm0,mm0
+       movd  [esp + dz1], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfacc mm0, mm1                   /* mm0=rsq */
+
+       pfrsqrt mm1,mm0
+       movq mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2                 /* mm1=invsqrt */
+       movq  mm4, mm1
+       pfmul mm4, mm4                   /* mm4=invsq */
+       /* calculate potential and scalar force */
+       pfmul mm6, mm1                   /* mm6=vcoul */
+       pfmul mm4, mm6                   /* mm4=fscalar */
+       /* update vctot */
+       movq mm5, [esp + vctot]
+       pfadd mm5, mm6
+       movq [esp + vctot], mm5
+       /* spread fscalar to both positions */
+       punpckldq mm4,mm4
+       /* calc vectorial force */
+       prefetchw [edi + eax*4]          /* prefetch faction to cache */
+       movq mm0,  [esp + dx1]
+       movd mm1,  [esp + dz1]
+       pfmul mm0, mm4
+       pfmul mm1, mm4
+       /* update i particle force */
+       movq mm2,  [esp + fix]
+       movd mm3,  [esp + fiz]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fix], mm2
+       movd [esp + fiz], mm3
+       /* update j particle force */
+       movq mm2,  [edi + eax*4]
+       movd mm3,  [edi + eax *4+ 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4], mm2
+       movd [edi + eax*4 +8], mm3
+       /* done! */
+.i1000_updateouterdata:
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]         /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]
+       pfadd mm6, [esp + fix]
+       /* 64-bit operand: only the low half (fiz) is stored by the movd below */
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]        /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]
+       movd  mm7, [ebx + edx*4 + 8]
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]           /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid], 4   /* advance pointer */
+
+       movq  mm7, [esp + vctot]
+       pfacc mm7,mm7                    /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6         /* increment Vc[gid] */
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i1000_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i1000_outer
+.i1000_end:
+       femms
+       add esp, 80
+       pop edi
+       pop esi
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+
+
+/*
+ * inl1010_3dnow - Coulomb-only nonbonded inner loop using AMD 3DNow!,
+ * for i entries that cover several consecutive atoms.
+ *
+ * Each outer iteration reads one count (nscoul) from the nsatoms array
+ * and then runs the j loop once for each of nscoul consecutive i atoms,
+ * starting at the index read from iinr[] and reusing the same jjnr[]
+ * range for all of them.  Per pair it computes
+ *      vcoul = facel*charge[i]*charge[j] * invsqrt(rsq)
+ *      fscal = vcoul * invsq
+ * adds dr*fscal to the i force and subtracts it from faction[j].  After
+ * each i atom its force is added to faction[i] and fshift[is]; after the
+ * last one the summed potential is added to Vc[gid].
+ *
+ * Arguments are read from the stack relative to ebp (offsets below).
+ * The pointer arguments shift, iinr, jindex, gid and nsatoms are
+ * advanced in place in the argument area, and nri is used as the
+ * outer-loop counter.  The first outer iteration runs unconditionally,
+ * so nri must be >= 1.
+ * eax, ebx, ecx, edx, esi and edi are saved and restored; femms is
+ * issued on entry and before returning.
+ */
+.globl inl1010_3dnow
+       .type inl1010_3dnow,@function
+inl1010_3dnow: 
+       /* stack arguments, offsets relative to ebp */
+.equ           nri,            8       /* number of outer entries (outer-loop counter) */
+.equ           iinr,           12      /* pointer into iinr[]: first i atom of each entry */
+.equ           jindex,         16      /* pointer into jindex[]: start/end positions in jjnr[] */
+.equ           jjnr,           20      /* base of jjnr[]: j particle indices */
+.equ           shift,          24      /* pointer into shift[]: shift index per entry */
+.equ           shiftvec,       28      /* base of shiftvec[] (3 floats per entry) */
+.equ           fshift,         32      /* base of fshift[] (3 floats per entry) */
+.equ           gid,            36      /* pointer into gid[]: group index per entry */
+.equ           pos,            40      /* base of pos[] (3 floats per particle) */
+.equ           faction,        44      /* base of faction[] (3 floats per particle) */
+.equ           charge,         48      /* base of charge[] */
+.equ           facel,          52      /* float scale factor applied to charge[ii] */
+.equ           Vc,             56      /* base of Vc[], indexed by group index */
+.equ           nsatoms,        60      /* pointer into the per-entry atom-count records */
+       /* stack offsets for local variables */
+.equ           is3,            0       /* 3*shift index */
+.equ           ii3,            4       /* 3*i index of the current i atom */
+.equ           shX,            8
+.equ           shY,            12
+.equ           shZ,            16
+.equ           ix,             20
+.equ           iy,             24
+.equ           iz,             28
+.equ           iq,             32      /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          40      /* repeated (64bit) to fill 3dnow reg */
+.equ           innerjjnr0,     48      /* start of the jjnr[] range for this entry */
+.equ           innerk0,        52      /* number of j atoms for this entry */
+.equ           innerjjnr,      56      /* running pointer into jjnr[] */
+.equ           innerk,         60      /* remaining j count, biased by -2 */
+.equ           fix,            64
+.equ           fiy,            68
+.equ           fiz,            72
+.equ           dx1,            76
+.equ           dy1,            80
+.equ           dz1,            84
+.equ           dx2,            88
+.equ           dy2,            92
+.equ           dz2,            96
+.equ           nscoul,        100      /* i atoms still to process in this entry */
+.equ           solnr,         104      /* index of the next i atom in this entry */
+       push ebp
+       mov ebp,esp
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub esp, 108            /* local stack space */
+       femms
+       /* assume we have at least one i particle - start directly */
+       /* point at the third dword of the first 12-byte nsatoms record */
+       add   dword ptr [ebp + nsatoms], 8
+
+.i1010_outer:
+       mov   eax, [ebp + shift]        /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift], 4   /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]     /* eax = base of shiftvec[] */
+
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+       movq  [esp + shX], mm0
+       movd  [esp + shZ], mm1
+
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4    /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   eax, [ebp + nsatoms]
+       mov   ecx, [eax]                /* count of i atoms for this entry */
+       add   dword ptr [ebp + nsatoms], 12  /* step to the next 12-byte record */
+       mov   [esp + nscoul], ecx
+
+       /* clear potential */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       mov   [esp + solnr], ebx
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax    /* pointer to jjnr[nj0] */
+
+       mov   [esp + innerk0], edx       /* number of innerloop atoms */
+       mov   esi, [ebp + pos]           /* esi = pos[] base for the inner loop */
+       mov   edi, [ebp + faction]       /* edi = faction[] base for the inner loop */
+
+       /* skip the atom loop entirely when the count is zero */
+       mov   ecx, [esp + nscoul]
+       cmp   ecx,  0
+       jnz   .i1010_mno_coul
+       jmp   .i1010_last_mno
+.i1010_mno_coul:
+       mov   ebx,  [esp + solnr]        /* ebx = current i atom */
+       inc   dword ptr [esp + solnr]    /* next pass handles the following atom */
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]         /* mm2=charge[ii] */
+       /* 64-bit operand: the high half is multiplied by the dword that
+        * follows facel on the stack, but it is discarded by the
+        * punpckldq below. */
+       pfmul mm2, [ebp + facel]
+       punpckldq mm2,mm2                /* spread to both halves */
+       movq  [esp + iq], mm2            /* iq =facel*charge[ii] */
+
+       lea   ebx, [ebx + ebx*2]         /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]           /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       /* 64-bit operand: only the low half (shZ) matters, see movd below */
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0
+       movd  [esp + iz], mm1
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7         /* clears fix and fiy */
+       movd  [esp + fiz],   mm7
+
+       /* restart the j list for this i atom */
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+       sub   edx,  2
+       mov   [esp + innerk], edx        /* number of innerloop atoms (minus 2) */
+       /* mov leaves the flags from "sub edx, 2" intact */
+       jge   .i1010_unroll_coul_loop
+       jmp   .i1010_finish_coul_inner
+.i1010_unroll_coul_loop:
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   dword ptr [esp + innerjjnr], 8  /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+
+       mov ecx, [ebp + charge]          /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+       movd mm7, [ecx + ebx*4]          /* charge[jnr2] */
+       punpckldq mm3,mm7                /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4,mm0                   /* dr = ir - jr */
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */
+       pfmul mm5,mm5
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */
+       movd  mm7, [esi + ebx*4 + 8]
+
+       pfsubr mm6,mm0                   /* dr = ir - jr */
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+       pfrsqrt mm0, mm4                 /* lookup inverse square root seed */
+       pfrsqrt mm1, mm6
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6                /* now mm4 has rsq and mm0 the seed for both pairs */
+       movq mm2,mm0                     /* amd 3dnow N-R iteration to get full precision */
+       pfmul mm0,mm0
+       pfrsqit1 mm0,mm4
+       pfrcpit2 mm0,mm2
+       movq mm1,mm0
+       pfmul mm0,mm0
+       /* mm0 now contains invsq, and mm1 invsqrt */
+       /* do potential and fscal */
+       prefetchw [esp + dx1]            /* prefetch the stack scratch area at dx1 */
+
+       pfmul mm3,mm1                    /* mm3 has both vcoul */
+       pfmul mm0,mm3                    /* mm0 has both fscal */
+
+       /* update vctot */
+
+       pfadd mm3, [esp + vctot]         /* add the earlier value */
+       movq [esp + vctot], mm3          /* store the sum */
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0                /* mm0 = fscal of pair 1 in both halves */
+       punpckhdq mm1,mm1                /* mm1 = fscal of pair 2 in both halves */
+       /* calc vector force */
+       prefetchw [edi + eax*4]          /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]           /* fetch dr */
+       movd mm3,  [esp + dz1]
+       prefetchw [edi + ebx*4]          /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0                   /* mult by fs */
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]           /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1                   /* mult by fs */
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk], 2
+       jl    .i1010_finish_coul_inner
+       jmp   .i1010_unroll_coul_loop
+.i1010_finish_coul_inner:
+       /* innerk is negative here; its low bit is set when one j particle is left */
+       and   dword ptr [esp + innerk], 1
+       jnz  .i1010_single_coul_inner
+       jmp  .i1010_updateouterdata_coul
+.i1010_single_coul_inner:
+       /* a single j particle iteration here - compare with the unrolled code for comments */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]                 /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm6, [esp + iq]
+       movd mm7, [ecx + eax*4]
+       pfmul mm6, mm7                   /* mm6=qq */
+
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm2, [esi + eax*4]
+       movd  mm3, [esi + eax*4 + 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq  [esp + dx1], mm0
+       pfmul mm0,mm0
+       movd  [esp + dz1], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfacc mm0, mm1                   /* mm0=rsq */
+
+       pfrsqrt mm1,mm0
+       movq mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2                 /* mm1=invsqrt */
+       movq  mm4, mm1
+       pfmul mm4, mm4                   /* mm4=invsq */
+       /* calculate potential and scalar force */
+       pfmul mm6, mm1                   /* mm6=vcoul */
+       pfmul mm4, mm6                   /* mm4=fscalar */
+       /* update vctot */
+       movq mm5, [esp + vctot]
+       pfadd mm5, mm6
+       movq [esp + vctot], mm5
+       /* spread fscalar to both positions */
+       punpckldq mm4,mm4
+       /* calc vectorial force */
+       prefetchw [edi + eax*4]          /* prefetch faction to cache */
+       movq mm0,  [esp + dx1]
+       movd mm1,  [esp + dz1]
+       pfmul mm0, mm4
+       pfmul mm1, mm4
+       /* update i particle force */
+       movq mm2,  [esp + fix]
+       movd mm3,  [esp + fiz]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fix], mm2
+       movd [esp + fiz], mm3
+       /* update j particle force */
+       movq mm2,  [edi + eax*4]
+       movd mm3,  [edi + eax *4+ 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4], mm2
+       movd [edi + eax*4 +8], mm3
+       /* done! */
+.i1010_updateouterdata_coul:
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]         /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]
+       pfadd mm6, [esp + fix]
+       /* 64-bit operand: only the low half (fiz) is stored by the movd below */
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]        /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]
+       movd  mm7, [ebx + edx*4 + 8]
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno */
+       dec dword ptr [esp + nscoul]
+       jz  .i1010_last_mno
+       jmp .i1010_mno_coul
+.i1010_last_mno:
+       mov   edx, [ebp + gid]           /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid], 4   /* advance pointer */
+
+       movq  mm7, [esp + vctot]
+       pfacc mm7,mm7                    /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6         /* increment Vc[gid] */
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i1010_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i1010_outer
+.i1010_end:
+       femms
+       add esp, 108
+       pop edi
+       pop esi
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+
+                       
+/* ---------------------------------------------------------------------- */
+/* inl1020_3dnow                                                          */
+/*                                                                        */
+/* 3DNow! inner loop: a three-site i molecule (sites called O, H1, H2 in  */
+/* the comments below) against single j particles. For every j entry it  */
+/* forms qq/r (called vcoul below) and the scalar force vcoul/r^2 for     */
+/* each of the three i sites, and accumulates forces and potential.       */
+/*                                                                        */
+/* Arguments are read from the stack relative to ebp (offsets below):     */
+/*   nri       number of outer iterations; counted down in place          */
+/*   iinr, jindex, jjnr, shift, gid   pointers to 32-bit index arrays     */
+/*   shiftvec, fshift, pos, faction, charge, Vc                           */
+/*             pointers to 32-bit float arrays                            */
+/*   facel     32-bit float passed by value                               */
+/* The iinr, jindex, shift and gid argument slots are advanced in place.  */
+/*                                                                        */
+/* eax, ebx, ecx, edx, esi and edi are saved and restored. femms is       */
+/* issued on entry and before returning. pswapd is used in addition to    */
+/* the base 3DNow! instructions.                                          */
+/*                                                                        */
+/* Both loops run their body before testing the counter, so nri and every */
+/* per-i neighbour count must be at least 1.                              */
+/* ---------------------------------------------------------------------- */
+.globl inl1020_3dnow
+       .type inl1020_3dnow,@function
+inl1020_3dnow:
+                       /* argument offsets from ebp */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+                       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ixO,             8
+.equ           iyO,            12
+.equ           izO,            16
+.equ           ixH,            20      /* 8 bytes: H1 low, H2 high */
+.equ           iyH,            28      /* 8 bytes: H1 low, H2 high */
+.equ           izH,            36      /* 8 bytes: H1 low, H2 high */
+.equ           iqO,            44      /* 8 bytes (one 3dnow reg) */
+.equ           iqH,            52      /* 8 bytes (one 3dnow reg) */
+.equ           vctot,          60      /* 8 bytes (one 3dnow reg) */
+.equ           innerjjnr,      68
+.equ           innerk,         72
+.equ           fixO,           76
+.equ           fiyO,           80
+.equ           fizO,           84
+.equ           fixH,           88      /* 8 bytes: H1 low, H2 high */
+.equ           fiyH,           96      /* 8 bytes: H1 low, H2 high */
+.equ           fizH,           104     /* 8 bytes: H1 low, H2 high */
+.equ           dxO,            112
+.equ           dyO,            116
+.equ           dzO,            120
+.equ           dxH,            124     /* 8 bytes: H1 low, H2 high */
+.equ           dyH,            132     /* 8 bytes: H1 low, H2 high */
+.equ           dzH,            140     /* 8 bytes: H1 low, H2 high */
+       push  ebp
+       mov   ebp, esp
+       push  eax
+       push  ebx
+       push  ecx
+       push  edx
+       push  esi
+       push  edi
+       sub   esp, 148                  /* local stack space (dzH + 8) */
+       femms
+       /* assume we have at least one i particle - start directly */
+
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm1, [ebp + facel]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       pfmul mm2, mm1
+       movq  [esp + iqO], mm2          /* iqO = facel*charge[ii], high half is 0 */
+
+       movd  mm2, [edx + ebx*4 + 4]    /* mm2=charge[ii+1] */
+       pfmul mm2, mm1
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iqH], mm2          /* iqH = facel*charge[ii+1] */
+.i1020_outer:
+       mov   eax, [ebp + shift]        /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift], 4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]     /* eax = base of shiftvec[] */
+
+       movq  mm5, [eax + ebx*4]        /* move shX/shY to mm5 and shZ to mm6 */
+       movd  mm6, [eax + ebx*4 + 8]
+       movq  mm0, mm5
+       movq  mm1, mm5
+       movq  mm2, mm6
+       punpckldq mm0,mm0               /* also expand shX,Y,Z in mm0--mm2 */
+       punpckhdq mm1,mm1
+       punpckldq mm2,mm2
+
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4 /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]          /* eax = base of pos[] */
+
+       pfadd mm5, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm7, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx          /* (use mm7 as temp storage for iz) */
+       pfadd mm6, mm7
+       movq  [esp + ixO], mm5
+       /* 8-byte store: also covers the first dword of ixH, which is */
+       /* written with its real value further down. */
+       movq  [esp + izO], mm6
+
+       movd  mm3, [eax + ebx*4 + 12]
+       movd  mm4, [eax + ebx*4 + 16]
+       movd  mm5, [eax + ebx*4 + 20]
+       punpckldq  mm3, [eax + ebx*4 + 24]
+       punpckldq  mm4, [eax + ebx*4 + 28]
+       punpckldq  mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+       pfadd mm0, mm3
+       pfadd mm1, mm4
+       pfadd mm2, mm5
+       movq  [esp + ixH], mm0
+       movq  [esp + iyH], mm1
+       movq  [esp + izH], mm2
+
+       /* clear vctot and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       movq  [esp + fixO],  mm7        /* clears fixO and fiyO */
+       movd  [esp + fizO],  mm7
+       movq  [esp + fixH],  mm7
+       movq  [esp + fiyH],  mm7
+       movq  [esp + fizH],  mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+.i1020_inner_loop:
+       /* a single j particle iteration here - compare with the unrolled code for comments */
+       /* Load the jjnr pointer into ecx so that the prefetch below really */
+       /* addresses the neighbour list; ecx is reloaded with the charge    */
+       /* base right afterwards.                                           */
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]                 /* eax=jnr offset */
+       add   dword ptr [esp + innerjjnr], 4 /* advance pointer */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+
+       mov   ecx, [ebp + charge]
+       movd  mm7, [ecx + eax*4]
+       punpckldq mm7,mm7
+       movq  mm6,mm7
+       pfmul mm6, [esp + iqO]
+       pfmul mm7, [esp + iqH]          /* mm6=qqO, mm7=qqH */
+
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr */
+
+       movq  mm0, [esi + eax*4]
+       movd  mm1, [esi + eax*4 + 8]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1                  /* mm0=rsqO (low half) */
+
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4              /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH]         /* mm2-mm4 is dxH-dzH */
+
+       movq  [esp + dxH], mm2
+       movq  [esp + dyH], mm3
+       movq  [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4                   /* mm3=rsqH */
+
+       /* reciprocal square root for O: seed plus one refinement step */
+       pfrsqrt mm1,mm0
+
+       movq  mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2                /* mm1=invsqrt */
+       movq  mm4, mm1
+       pfmul mm4, mm4                  /* mm4=invsq */
+       /* calculate potential and scalar force */
+       pfmul mm6, mm1                  /* mm6=vcoul */
+       pfmul mm4, mm6                  /* mm4=fscalar */
+
+       /* pfrsqrt only uses the low half, so seed H1 and H2 separately */
+       pfrsqrt mm5, mm3
+       pswapd mm3,mm3
+       pfrsqrt mm2, mm3
+       pswapd mm3,mm3
+       punpckldq mm5,mm2               /* seeds are in mm5 now, and rsq in mm3 */
+
+       movq  mm2, mm5
+       pfmul mm5,mm5
+       pfrsqit1 mm5,mm3
+       pfrcpit2 mm5,mm2                /* mm5=invsqrt */
+       movq  mm3,mm5
+       pfmul mm3,mm3                   /* mm3=invsq */
+       pfmul mm7, mm5                  /* mm7=vcoul */
+       pfmul mm3, mm7                  /* mm3=fscal for the two H's */
+
+       /* update vctot */
+       pfadd mm7, mm6
+       pfadd mm7, [esp + vctot]
+       movq  [esp + vctot], mm7
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm4,mm4
+       /* calc vectorial force for O */
+       prefetchw [edi + eax*4]         /* prefetch faction to cache */
+       movq  mm0,  [esp + dxO]
+       movd  mm1,  [esp + dzO]
+       pfmul mm0, mm4
+       pfmul mm1, mm4
+
+       /* calc vectorial force for H's */
+       movq  mm5, [esp + dxH]
+       movq  mm6, [esp + dyH]
+       movq  mm7, [esp + dzH]
+       pfmul mm5, mm3
+       pfmul mm6, mm3
+       pfmul mm7, mm3
+
+       /* update iO particle force */
+       movq  mm2,  [esp + fixO]
+       movd  mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq  [esp + fixO], mm2
+       movd  [esp + fizO], mm3
+
+       /* update iH forces */
+       movq  mm2, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq  [esp + fixH], mm2
+       movq  [esp + fiyH], mm3
+       movq  [esp + fizH], mm4
+
+       /* pack j forces from H in the same form as the oxygen force */
+       pfacc mm5, mm6                  /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7                  /* mm7(l)=fjz(H1+H2) */
+
+       pfadd mm0, mm5                  /* add up total force on j particle */
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq  mm2,  [edi + eax*4]
+       movd  mm3,  [edi + eax*4 + 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq  [edi + eax*4], mm2
+       movd  [edi + eax*4 +8], mm3
+
+       /*  done  - one more? */
+       dec   dword ptr [esp + innerk]
+       jnz   .i1020_inner_loop
+.i1020_updateouterdata:
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]        /* increment iO force */
+       movd  mm7, [edi + ecx*4 + 8]
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]         /* only the low half is stored below */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       movq  mm0, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm1, [esp + fizH]
+       movq  mm2, mm0
+       punpckldq mm0, mm3              /* mm0(l)=fxH1, mm0(h)=fyH1 */
+       punpckhdq mm2, mm3              /* mm2(l)=fxH2, mm2(h)=fyH2 */
+       movq  mm3, mm1
+       pswapd mm3,mm3
+       /* mm1 is fzH1 */
+       /* mm3 is fzH2 */
+
+       movq  mm6, [edi + ecx*4 + 12]   /* increment iH1 force */
+       movd  mm7, [edi + ecx*4 + 20]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       movq  [edi + ecx*4 + 12],  mm6
+       movd  [edi + ecx*4 + 20],  mm7
+
+       movq  mm6, [edi + ecx*4 + 24]   /* increment iH2 force */
+       movd  mm7, [edi + ecx*4 + 32]
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [edi + ecx*4 + 24],  mm6
+       movd  [edi + ecx*4 + 32],  mm7
+
+
+       mov   ebx, [ebp + fshift]       /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]
+       movd  mm7, [ebx + edx*4 + 8]
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]          /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid], 4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]
+       pfacc mm7,mm7                   /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6        /* increment vc[gid] */
+
+       /* finish if last, otherwise iterate once more */
+       dec   dword ptr [ebp + nri]
+       jnz   .i1020_outer
+.i1020_end:
+       femms
+       add   esp, 148
+       pop   edi
+       pop   esi
+       pop   edx
+       pop   ecx
+       pop   ebx
+       pop   eax
+       leave
+       ret
+
+
+/* ---------------------------------------------------------------------- */
+/* inl1030_3dnow                                                          */
+/*                                                                        */
+/* 3DNow! inner loop: a three-site i molecule (sites called O, H1, H2 in  */
+/* the comments below) against three-site j molecules. Each inner         */
+/* iteration handles the j sites at pos[3*jnr], +12 bytes and +24 bytes   */
+/* in turn; for each of them qq/r (called vcoul below) and the scalar     */
+/* force vcoul/r^2 are formed against all three i sites.                  */
+/*                                                                        */
+/* Arguments are read from the stack relative to ebp (offsets below):     */
+/*   nri       number of outer iterations; counted down in place          */
+/*   iinr, jindex, jjnr, shift, gid   pointers to 32-bit index arrays     */
+/*   shiftvec, fshift, pos, faction, charge, Vc                           */
+/*             pointers to 32-bit float arrays                            */
+/*   facel     32-bit float passed by value                               */
+/* The iinr, jindex, shift and gid argument slots are advanced in place.  */
+/*                                                                        */
+/* The charge products are computed once, from charge[ii] and             */
+/* charge[ii+1] of the first i entry, and reused for every pair.          */
+/*                                                                        */
+/* eax, ebx, ecx, edx, esi and edi are saved and restored. femms is       */
+/* issued on entry and before returning. pswapd is used in addition to    */
+/* the base 3DNow! instructions.                                          */
+/*                                                                        */
+/* Both loops run their body before testing the counter, so nri and every */
+/* per-i neighbour count must be at least 1.                              */
+/* ---------------------------------------------------------------------- */
+.globl inl1030_3dnow
+       .type inl1030_3dnow,@function
+inl1030_3dnow:
+                       /* argument offsets from ebp */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+                       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ixO,             8
+.equ           iyO,            12
+.equ           izO,            16
+.equ           ixH,            20      /* 8 bytes: H1 low, H2 high */
+.equ           iyH,            28      /* 8 bytes: H1 low, H2 high */
+.equ           izH,            36      /* 8 bytes: H1 low, H2 high */
+.equ           qqOO,           44      /* 8 bytes (one 3dnow reg) */
+.equ           qqOH,           52      /* 8 bytes (one 3dnow reg) */
+.equ           qqHH,           60      /* 8 bytes (one 3dnow reg) */
+.equ           vctot,          68      /* 8 bytes (one 3dnow reg) */
+.equ           innerjjnr,      76
+.equ           innerk,         80
+.equ           fixO,           84
+.equ           fiyO,           88
+.equ           fizO,           92
+.equ           fixH,           96      /* 8 bytes: H1 low, H2 high */
+.equ           fiyH,          104      /* 8 bytes: H1 low, H2 high */
+.equ           fizH,          112      /* 8 bytes: H1 low, H2 high */
+.equ           dxO,           120
+.equ           dyO,           124
+.equ           dzO,           128
+.equ           dxH,           132      /* 8 bytes: H1 low, H2 high */
+.equ           dyH,           140      /* 8 bytes: H1 low, H2 high */
+.equ           dzH,           148      /* 8 bytes: H1 low, H2 high */
+       push  ebp
+       mov   ebp, esp
+       push  eax
+       push  ebx
+       push  ecx
+       push  edx
+       push  esi
+       push  edi
+       sub   esp, 156                  /* local stack space (dzH + 8) */
+       femms
+       /* assume we have at least one i particle - start directly */
+
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm1, [ebp + facel]        /* mm1=facel */
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] (O) */
+       movd  mm3, [edx + ebx*4 + 4]    /* mm3=charge[ii+1] (H) */
+       movq  mm4, mm2
+       pfmul mm4, mm1
+       movq  mm6, mm3
+       pfmul mm6, mm1
+       movq  mm5, mm4
+       pfmul mm4, mm2                  /* mm4=qqOO*facel */
+       pfmul mm5, mm3                  /* mm5=qqOH*facel */
+       pfmul mm6, mm3                  /* mm6=qqHH*facel */
+       punpckldq mm5,mm5               /* spread to both halves */
+       punpckldq mm6,mm6               /* spread to both halves */
+       movq  [esp + qqOO], mm4
+       movq  [esp + qqOH], mm5
+       movq  [esp + qqHH], mm6
+.i1030_outer:
+       mov   eax, [ebp + shift]        /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift], 4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]     /* eax = base of shiftvec[] */
+
+       movq  mm5, [eax + ebx*4]        /* move shX/shY to mm5 and shZ to mm6 */
+       movd  mm6, [eax + ebx*4 + 8]
+       movq  mm0, mm5
+       movq  mm1, mm5
+       movq  mm2, mm6
+       punpckldq mm0,mm0               /* also expand shX,Y,Z in mm0--mm2 */
+       punpckhdq mm1,mm1
+       punpckldq mm2,mm2
+
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4 /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]          /* eax = base of pos[] */
+
+       pfadd mm5, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm7, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx          /* (use mm7 as temp storage for iz) */
+       pfadd mm6, mm7
+       movq  [esp + ixO], mm5
+       /* 8-byte store: also covers the first dword of ixH, which is */
+       /* written with its real value further down. */
+       movq  [esp + izO], mm6
+
+       movd  mm3, [eax + ebx*4 + 12]
+       movd  mm4, [eax + ebx*4 + 16]
+       movd  mm5, [eax + ebx*4 + 20]
+       punpckldq  mm3, [eax + ebx*4 + 24]
+       punpckldq  mm4, [eax + ebx*4 + 28]
+       punpckldq  mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+       pfadd mm0, mm3
+       pfadd mm1, mm4
+       pfadd mm2, mm5
+       movq  [esp + ixH], mm0
+       movq  [esp + iyH], mm1
+       movq  [esp + izH], mm2
+
+       /* clear vctot and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       movq  [esp + fixO],  mm7        /* clears fixO and fiyO */
+       movq  [esp + fizO],  mm7        /* clears fizO and the first dword of fixH */
+       movq  [esp + fixH],  mm7
+       movq  [esp + fiyH],  mm7
+       movq  [esp + fizH],  mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+.i1030_inner_loop:
+       /* a single j particle iteration here - compare with the unrolled code for comments */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]                 /* eax=jnr offset */
+       add   dword ptr [esp + innerjjnr], 4 /* advance pointer */
+
+       /* interactions with the first j site (charge products qqOO / qqOH) */
+       movd  mm6, [esp + qqOO]
+       movq  mm7, [esp + qqOH]
+
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr */
+       movq  mm0, [esi + eax*4]
+       movd  mm1, [esi + eax*4 + 8]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm0
+       pfadd mm0, mm1                  /* mm0=rsqO (low half) */
+
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4              /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH]         /* mm2-mm4 is dxH-dzH */
+
+       movq  [esp + dxH], mm2
+       movq  [esp + dyH], mm3
+       movq  [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4                   /* mm3=rsqH */
+
+       /* reciprocal square root for O: seed plus one refinement step */
+       pfrsqrt mm1,mm0
+
+       movq  mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2                /* mm1=invsqrt */
+       movq  mm4, mm1
+       pfmul mm4, mm4                  /* mm4=invsq */
+       /* calculate potential and scalar force */
+       pfmul mm6, mm1                  /* mm6=vcoul */
+       pfmul mm4, mm6                  /* mm4=fscalar */
+
+       /* pfrsqrt only uses the low half, so seed H1 and H2 separately */
+       pfrsqrt mm5, mm3
+       pswapd mm3,mm3
+       pfrsqrt mm2, mm3
+       pswapd mm3,mm3
+       punpckldq mm5,mm2               /* seeds are in mm5 now, and rsq in mm3 */
+
+       movq  mm2, mm5
+       pfmul mm5,mm5
+       pfrsqit1 mm5,mm3
+       pfrcpit2 mm5,mm2                /* mm5=invsqrt */
+       movq  mm3,mm5
+       pfmul mm3,mm3                   /* mm3=invsq */
+       pfmul mm7, mm5                  /* mm7=vcoul */
+       pfmul mm3, mm7                  /* mm3=fscal for the two H's */
+
+       /* update vctot */
+       pfadd mm7, mm6
+       pfadd mm7, [esp + vctot]
+       movq  [esp + vctot], mm7
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm4,mm4
+       /* calc vectorial force for O */
+       movq  mm0,  [esp + dxO]
+       movd  mm1,  [esp + dzO]
+       pfmul mm0, mm4
+       pfmul mm1, mm4
+
+       /* calc vectorial force for H's */
+       movq  mm5, [esp + dxH]
+       movq  mm6, [esp + dyH]
+       movq  mm7, [esp + dzH]
+       pfmul mm5, mm3
+       pfmul mm6, mm3
+       pfmul mm7, mm3
+
+       /* update iO particle force */
+       movq  mm2,  [esp + fixO]
+       movd  mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq  [esp + fixO], mm2
+       movd  [esp + fizO], mm3
+
+       /* update iH forces */
+       movq  mm2, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq  [esp + fixH], mm2
+       movq  [esp + fiyH], mm3
+       movq  [esp + fizH], mm4
+
+       /* pack j forces from H in the same form as the oxygen force */
+       pfacc mm5, mm6                  /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7                  /* mm7(l)=fjz(H1+H2) */
+
+       pfadd mm0, mm5                  /* add up total force on j particle */
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq  mm2,  [edi + eax*4]
+       movd  mm3,  [edi + eax*4 + 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq  [edi + eax*4], mm2
+       movd  [edi + eax*4 +8], mm3
+
+       /* interactions with j H1 */
+       movq  mm0, [esi + eax*4 + 12]
+       movd  mm1, [esi + eax*4 + 20]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       movd  mm6, [esp + qqOH]
+       movq  mm7, [esp + qqHH]
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1                  /* mm0=rsqO (low half) */
+
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4              /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH]         /* mm2-mm4 is dxH-dzH */
+
+       movq  [esp + dxH], mm2
+       movq  [esp + dyH], mm3
+       movq  [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4                   /* mm3=rsqH */
+
+       pfrsqrt mm1,mm0
+
+       movq  mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2                /* mm1=invsqrt */
+       movq  mm4, mm1
+       pfmul mm4, mm4                  /* mm4=invsq */
+       /* calculate potential and scalar force */
+       pfmul mm6, mm1                  /* mm6=vcoul */
+       pfmul mm4, mm6                  /* mm4=fscalar */
+
+       pfrsqrt mm5, mm3
+       pswapd mm3,mm3
+       pfrsqrt mm2, mm3
+       pswapd mm3,mm3
+       punpckldq mm5,mm2               /* seeds are in mm5 now, and rsq in mm3 */
+
+       movq  mm2, mm5
+       pfmul mm5,mm5
+       pfrsqit1 mm5,mm3
+       pfrcpit2 mm5,mm2                /* mm5=invsqrt */
+       movq  mm3,mm5
+       pfmul mm3,mm3                   /* mm3=invsq */
+       pfmul mm7, mm5                  /* mm7=vcoul */
+       pfmul mm3, mm7                  /* mm3=fscal for the two H's */
+
+       /* update vctot */
+       pfadd mm7, mm6
+       pfadd mm7, [esp + vctot]
+       movq  [esp + vctot], mm7
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm4,mm4
+       /* calc vectorial force for O */
+       movq  mm0,  [esp + dxO]
+       movd  mm1,  [esp + dzO]
+       pfmul mm0, mm4
+       pfmul mm1, mm4
+
+       /* calc vectorial force for H's */
+       movq  mm5, [esp + dxH]
+       movq  mm6, [esp + dyH]
+       movq  mm7, [esp + dzH]
+       pfmul mm5, mm3
+       pfmul mm6, mm3
+       pfmul mm7, mm3
+
+       /* update iO particle force */
+       movq  mm2,  [esp + fixO]
+       movd  mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq  [esp + fixO], mm2
+       movd  [esp + fizO], mm3
+
+       /* update iH forces */
+       movq  mm2, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq  [esp + fixH], mm2
+       movq  [esp + fiyH], mm3
+       movq  [esp + fizH], mm4
+
+       /* pack j forces from H in the same form as the oxygen force */
+       pfacc mm5, mm6                  /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7                  /* mm7(l)=fjz(H1+H2) */
+
+       pfadd mm0, mm5                  /* add up total force on j particle */
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq  mm2,  [edi + eax*4 + 12]
+       movd  mm3,  [edi + eax*4 + 20]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq  [edi + eax*4 + 12], mm2
+       movd  [edi + eax*4 + 20], mm3
+
+       /* interactions with j H2 */
+       movq  mm0, [esi + eax*4 + 24]
+       movd  mm1, [esi + eax*4 + 32]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       movd  mm6, [esp + qqOH]
+       movq  mm7, [esp + qqHH]
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1                  /* mm0=rsqO (low half) */
+
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4              /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH]         /* mm2-mm4 is dxH-dzH */
+
+       movq  [esp + dxH], mm2
+       movq  [esp + dyH], mm3
+       movq  [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4                   /* mm3=rsqH */
+
+       pfrsqrt mm1,mm0
+
+       movq  mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2                /* mm1=invsqrt */
+       movq  mm4, mm1
+       pfmul mm4, mm4                  /* mm4=invsq */
+       /* calculate potential and scalar force */
+       pfmul mm6, mm1                  /* mm6=vcoul */
+       pfmul mm4, mm6                  /* mm4=fscalar */
+
+       pfrsqrt mm5, mm3
+       pswapd mm3,mm3
+       pfrsqrt mm2, mm3
+       pswapd mm3,mm3
+       punpckldq mm5,mm2               /* seeds are in mm5 now, and rsq in mm3 */
+
+       movq  mm2, mm5
+       pfmul mm5,mm5
+       pfrsqit1 mm5,mm3
+       pfrcpit2 mm5,mm2                /* mm5=invsqrt */
+       movq  mm3,mm5
+       pfmul mm3,mm3                   /* mm3=invsq */
+       pfmul mm7, mm5                  /* mm7=vcoul */
+       pfmul mm3, mm7                  /* mm3=fscal for the two H's */
+
+       /* update vctot */
+       pfadd mm7, mm6
+       pfadd mm7, [esp + vctot]
+       movq  [esp + vctot], mm7
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm4,mm4
+       /* calc vectorial force for O */
+       movq  mm0,  [esp + dxO]
+       movd  mm1,  [esp + dzO]
+       pfmul mm0, mm4
+       pfmul mm1, mm4
+
+       /* calc vectorial force for H's */
+       movq  mm5, [esp + dxH]
+       movq  mm6, [esp + dyH]
+       movq  mm7, [esp + dzH]
+       pfmul mm5, mm3
+       pfmul mm6, mm3
+       pfmul mm7, mm3
+
+       /* update iO particle force */
+       movq  mm2,  [esp + fixO]
+       movd  mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq  [esp + fixO], mm2
+       movd  [esp + fizO], mm3
+
+       /* update iH forces */
+       movq  mm2, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq  [esp + fixH], mm2
+       movq  [esp + fiyH], mm3
+       movq  [esp + fizH], mm4
+
+       /* pack j forces from H in the same form as the oxygen force */
+       pfacc mm5, mm6                  /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7                  /* mm7(l)=fjz(H1+H2) */
+
+       pfadd mm0, mm5                  /* add up total force on j particle */
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq  mm2,  [edi + eax*4 + 24]
+       movd  mm3,  [edi + eax*4 + 32]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq  [edi + eax*4 + 24], mm2
+       movd  [edi + eax*4 + 32], mm3
+
+       /*  done  - one more? */
+       dec   dword ptr [esp + innerk]
+       jnz   .i1030_inner_loop
+.i1030_updateouterdata:
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]        /* increment iO force */
+       movd  mm7, [edi + ecx*4 + 8]
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]         /* only the low half is stored below */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       movq  mm0, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm1, [esp + fizH]
+       movq  mm2, mm0
+       punpckldq mm0, mm3              /* mm0(l)=fxH1, mm0(h)=fyH1 */
+       punpckhdq mm2, mm3              /* mm2(l)=fxH2, mm2(h)=fyH2 */
+       movq  mm3, mm1
+       pswapd mm3,mm3
+       /* mm1 is fzH1 */
+       /* mm3 is fzH2 */
+
+       movq  mm6, [edi + ecx*4 + 12]   /* increment iH1 force */
+       movd  mm7, [edi + ecx*4 + 20]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       movq  [edi + ecx*4 + 12],  mm6
+       movd  [edi + ecx*4 + 20],  mm7
+
+       movq  mm6, [edi + ecx*4 + 24]   /* increment iH2 force */
+       movd  mm7, [edi + ecx*4 + 32]
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [edi + ecx*4 + 24],  mm6
+       movd  [edi + ecx*4 + 32],  mm7
+
+
+       mov   ebx, [ebp + fshift]       /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]
+       movd  mm7, [ebx + edx*4 + 8]
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]          /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid], 4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]
+       pfacc mm7,mm7                   /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6        /* increment vc[gid] */
+       /* finish if last, otherwise iterate once more */
+       dec   dword ptr [ebp + nri]
+       jnz   .i1030_outer
+.i1030_end:
+       femms
+       add   esp, 156
+       pop   edi
+       pop   esi
+       pop   edx
+       pop   ecx
+       pop   ebx
+       pop   eax
+       leave
+       ret
+
+
+/*
+ * inl1100_3dnow: nonbonded inner loop, Coulomb plus Lennard-Jones,
+ * written with AMD 3DNow packed single-precision instructions
+ * (GNU as, Intel operand order).
+ *
+ * Arguments are read from the stack relative to ebp (offsets below):
+ *   nri       number of outer (i particle) iterations; must be >= 1
+ *             because the body runs once before the count is tested
+ *   iinr      pointer to i particle indices, one per outer iteration
+ *   jindex    pointer to neighbour list bounds; jindex[n] and
+ *             jindex[n+1] delimit the jjnr entries of iteration n
+ *   jjnr      pointer to j particle indices
+ *   shift     pointer to shift indices, one per outer iteration
+ *   shiftvec  pointer to shift vectors, 3 floats each
+ *   fshift    pointer to shift forces, 3 floats each (accumulated)
+ *   gid       pointer to group indices, one per outer iteration
+ *   pos       pointer to coordinates, 3 floats per particle
+ *   faction   pointer to forces, 3 floats per particle (accumulated)
+ *   charge    pointer to per-particle charges (floats)
+ *   facel     float passed by value, multiplied into charge[ii]
+ *   Vc        pointer to Coulomb energies indexed by gid (accumulated)
+ *   type      pointer to per-particle type indices
+ *   ntype     integer passed by value
+ *   nbfp      pointer to c6/c12 float pairs, read at float index
+ *             2*(ntype*type[ii] + type[j])
+ *   Vnb       pointer to LJ energies indexed by gid (accumulated)
+ *
+ * For every i/j pair:
+ *   rinv  = 1/sqrt(dx*dx + dy*dy + dz*dz)
+ *   vcoul = facel*charge[ii]*charge[j]*rinv
+ *   vnb   = c12*rinv^12 - c6*rinv^6
+ *   fscal = (vcoul + 12*c12*rinv^12 - 6*c6*rinv^6)*rinv^2
+ * fscal*dr is added to the i force and subtracted from the j force.
+ *
+ * The argument slots nri, iinr, jindex, shift and gid are updated in
+ * place while iterating.  Every general purpose register that is
+ * pushed in the prologue is restored in the epilogue, and femms is
+ * executed on entry and before returning.
+ */
+.globl inl1100_3dnow
+       .type inl1100_3dnow,@function
+inl1100_3dnow: 
+       /* stack offsets (from ebp) for the arguments */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68
+.equ           Vnb,            72
+       /* stack offsets (from esp) for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ix,              8
+.equ           iy,             12
+.equ           iz,             16
+.equ           iq,             20 /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          28 /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,         36 /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             44 /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            52 /* repeated (64bit) to fill 3dnow reg */
+.equ           six,            60 /* repeated (64bit) to fill 3dnow reg */
+.equ           twelve,         68 /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,           76
+.equ           innerjjnr,      80
+.equ           innerk,         84
+.equ           fix,            88
+.equ           fiy,            92
+.equ           fiz,            96
+.equ           dx1,           100
+.equ           dy1,           104
+.equ           dz1,           108
+.equ           dx2,           112
+.equ           dy2,           116
+.equ           dz2,           120
+       push ebp
+       mov ebp,esp
+
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub esp, 124            /* local stack space (last local dz2 ends at 124) */
+       femms
+       /* copy the packed constants to the local stack */
+       movq  mm0, [mm_six]
+       movq  mm1, [mm_twelve]
+       movq  [esp + six],    mm0
+       movq  [esp + twelve], mm1
+       /* assume we have at least one i particle - start directly */
+.i1100_outer:
+       mov   eax, [ebp + shift]        /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift], 4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]     /* eax = base of shiftvec[] */
+
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4 /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       /* The 64-bit memory operand also covers the argument slot after */
+       /* facel, but only the low product is kept by the punpckldq below. */
+       pfmul mm2, [ebp + facel]
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       mov   edx, [ebp + type]
+       mov   edx, [edx + ebx*4]
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx         /* ntia = 2*ntype*type[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]          /* eax = base of pos[] */
+
+       pfadd mm0, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm3, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx
+       pfadd mm1, mm3
+       movq  [esp + ix], mm0
+       movd  [esp + iz], mm1
+
+       /* clear total potential and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot],  mm7
+       movq  [esp + vnbtot], mm7
+       movq  [esp + fix],    mm7
+       movd  [esp + fiz],    mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  2
+       mov   [esp + innerk], edx        /* innerloop atoms left after one pair */
+       /* mov does not touch the flags, so this still tests the sub above */
+       jl    .i1100_finish_inner        /* fewer than two atoms: skip paired loop */
+.i1100_unroll_loop:
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   dword ptr [esp + innerjjnr], 8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+
+       mov ecx, [ebp + charge]          /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+       punpckldq mm3, [ecx + ebx*4]     /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]            /* base of nbfp */
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]          /* mm5 = 1st c6 / c12 */
+       movq mm7, [esi + ecx*4]          /* mm7 = 2nd c6 / c12 */
+       movq mm6,mm5
+       punpckldq mm5,mm7                /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6,mm7                /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       mov   esi, [ebp + pos]           /* esi back to base of pos[] */
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4,mm0                   /* dr = ir - jr */
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */
+       pfmul mm5,mm5
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */
+       movd  mm7, [esi + ebx*4 + 8]
+
+       pfsubr mm6,mm0                   /* dr = ir - jr */
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+       pfrsqrt mm0, mm4                 /* lookup inverse square root seed */
+       pfrsqrt mm1, mm6
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6                /* now 4 has rsq and 0 the seed for both pairs */
+       movq mm2,mm0                     /* amd 3dnow N-R iteration to get full precision */
+       pfmul mm0,mm0
+       pfrsqit1 mm0,mm4
+       pfrcpit2 mm0,mm2
+       movq mm1,mm0
+       pfmul mm0,mm0
+       /* mm0 now contains invsq, and mm1 invsqrt */
+       /* do potential and fscal */
+       movq mm4, mm0
+       pfmul mm4, mm0
+       pfmul mm4, mm0                   /* mm4=rinvsix */
+       movq  mm5, mm4
+       pfmul mm5, mm5                   /* mm5=rinvtwelve */
+
+       pfmul mm3, mm1                   /* mm3 has vcoul for both interactions */
+       movq  mm7, mm3                   /* use mm7 for sum to make fscal */
+
+       pfmul mm5, [esp + c12]           /* mm5 = vnb12 */
+       pfmul mm4, [esp + c6]            /* mm4 = vnb6 */
+       movq mm6, mm5                    /* mm6 is vnb12-vnb6 */
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]           /* mm4 = 6*vnb6 */
+
+       pfmul mm5, [esp + twelve]        /* mm5 = 12*vnb12 */
+       pfsub mm7,mm4
+       pfadd mm7, mm5                   /* mm7 = vcoul + 12*vnb12 - 6*vnb6 */
+       pfmul mm0, mm7                   /* mm0 is total fscal now */
+
+       prefetchw [esp + dx1]            /* prefetch the local dr area of the stack */
+
+       /* update vctot */
+       pfadd mm3, [esp + vctot]         /* add the earlier value */
+       movq [esp + vctot], mm3          /* store the sum */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0                /* mm0 = fscal of pair 1 in both halves */
+       punpckhdq mm1,mm1                /* mm1 = fscal of pair 2 in both halves */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4]          /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]           /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]        /* add the earlier value */
+       movq [esp + vnbtot], mm6         /* store the sum */
+
+       prefetchw [edi + ebx*4]          /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0                   /* mult by fs */
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]           /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1                   /* mult by fs */
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 + 8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk], 2
+       jge   .i1100_unroll_loop         /* at least two atoms were left: another pair */
+.i1100_finish_inner:
+       /* innerk is -2 or -1 here; the low bit tells whether one atom is left */
+       and   dword ptr [esp + innerk], 1
+       jz    .i1100_updateouterdata
+.i1100_single_inner:
+       /* a single j particle iteration here - compare with the unrolled code for comments */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]                 /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]
+       pfmul mm3, mm5                   /* mm3=qq */
+
+       mov esi, [ebp + nbfp]
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]          /* mm5 = 1st c6 */
+       movq [esp + c6], mm5
+       movd mm5, [esi + edx*4 + 4]      /* mm5 = 1st c12 */
+       movq [esp + c12], mm5
+
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5                   /* lower mm4=rsq */
+
+       pfrsqrt mm0,mm4
+       movq mm2,mm0
+       pfmul mm0,mm0
+       pfrsqit1 mm0,mm4
+       pfrcpit2 mm0,mm2                 /* mm0=invsqrt */
+       movq  mm1, mm0                   /* mm1=invsqrt */
+       pfmul mm0, mm0                   /* mm0=invsq */
+       /* calculate potentials and scalar force */
+       movq mm4, mm0
+       pfmul mm4, mm0
+       pfmul mm4, mm0                   /* mm4=rinvsix */
+       movq  mm5, mm4
+       pfmul mm5, mm5                   /* mm5=rinvtwelve */
+
+       pfmul mm3, mm1                   /* mm3 has vcoul */
+       movq  mm7, mm3                   /* use mm7 for sum to make fscal */
+
+       pfmul mm5, [esp + c12]
+       pfmul mm4, [esp + c6]
+       movq mm6, mm5                    /* mm6 is vnb12-vnb6 */
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       pfsub mm7,mm4
+       pfadd mm7, mm5
+       pfmul mm0, mm7                   /* mm0 is total fscal now */
+
+       /* update vctot */
+       pfadd mm3, [esp + vctot]
+       movq [esp + vctot], mm3
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]        /* add the earlier value */
+       movq [esp + vnbtot], mm6         /* store the sum */
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4]          /* prefetch faction to cache */
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 + 8], mm1
+       /* done! */
+.i1100_updateouterdata:
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]         /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]
+       pfadd mm6, [esp + fix]
+       /* 64-bit operand also reads the slot after fiz; the movd store */
+       /* below writes back only the low (fiz) result. */
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4],     mm6
+       movd  [edi + ecx*4 + 8], mm7
+
+       mov   ebx, [ebp + fshift]        /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]
+       movd  mm7, [ebx + edx*4 + 8]
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]           /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid], 4   /* advance pointer */
+
+       movq  mm7, [esp + vctot]
+       pfacc mm7,mm7                    /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6         /* increment vc[gid] */
+
+       movq  mm7, [esp + vnbtot]
+       pfacc mm7,mm7                    /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6         /* increment vnb[gid] */
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i1100_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i1100_outer
+.i1100_end:
+       femms
+       add esp, 124                     /* release local stack space */
+       pop edi
+       pop esi
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+
+       
+
+
+
+.globl inl1110_3dnow
+       .type inl1110_3dnow,@function
+inl1110_3dnow: 
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72                              
+.equ           nsatoms,        76              
+       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           shX,             8
+.equ           shY,             12 
+.equ           shZ,            16      
+.equ           ix,             20
+.equ           iy,             24
+.equ           iz,             28      
+.equ           iq,             32               /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          40 /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,         48 /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             56 /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            64 /* repeated (64bit) to fill 3dnow reg */
+.equ           six,            72 /* repeated (64bit) to fill 3dnow reg */
+.equ           twelve,         80 /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,           88      
+.equ           innerjjnr0,     92
+.equ           innerk0,        96              
+.equ           innerjjnr,     100
+.equ           innerk,        104      
+.equ           fix,           108
+.equ           fiy,           112
+.equ           fiz,           116
+.equ           dx1,           120
+.equ           dy1,           124
+.equ           dz1,           128
+.equ           dx2,           132
+.equ           dy2,           136
+.equ           dz2,           140                                                              
+.equ           nsvdwc,        144
+.equ           nscoul,        148
+.equ           nsvdw,         152
+.equ           solnr,         156              
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 160            /* local stack space */
+       femms
+       movq  mm0, [mm_six]
+       movq  mm1, [mm_twelve]
+       movq  [esp + six],    mm0
+       movq  [esp + twelve], mm1
+       /* assume we have at least one i particle - start directly */           
+.i1110_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+       movq  [esp + shX], mm0
+       movd  [esp + shZ], mm1
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   eax, [ebp + nsatoms]
+       add   [ebp + nsatoms],  12
+       mov   ecx, [eax]        
+       mov   edx, [eax + 4]
+       mov   eax, [eax + 8]    
+       sub   ecx, eax
+       sub   eax, edx
+       
+       mov   [esp + nsvdwc], edx
+       mov   [esp + nscoul], eax
+       mov   [esp + nsvdw], ecx
+               
+       /* clear potential */
+       pxor  mm7,mm7
+       movq  [esp + vctot],  mm7
+       movq  [esp + vnbtot], mm7
+       mov   [esp + solnr],  ebx
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       
+       mov   ecx, [esp + nsvdwc]
+       cmp   ecx,  0
+       jnz   .i1110_mno_vdwc
+       jmp   .i1110_testcoul
+.i1110_mno_vdwc:
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       pfmul mm2, [ebp + facel]
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx 
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+       
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i1110_unroll_vdwc_loop
+       jmp   .i1110_finish_vdwc_inner
+.i1110_unroll_vdwc_loop:       
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+       
+       mov ecx, [ebp + charge]        /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+        punpckldq mm3, [ecx + ebx*4]     /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */ 
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */                
+       movq mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */        
+       movq mm6,mm5                    
+       punpckldq mm5,mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6,mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs */
+        movq mm2,mm0                   /* amd 3dnow N-R iteration to get full precision */
+       pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       
+       movq mm1,mm0
+       pfmul mm0,mm0
+       /* mm0 now contains invsq, and mm1 invsqrt */
+       /* do potential and fscal */
+       movq mm4, mm0
+       pfmul mm4, mm0
+       pfmul mm4, mm0                  /* mm4=rinvsix */
+       movq  mm5, mm4  
+       pfmul mm5, mm5                  /* mm5=rinvtwelve */
+
+       pfmul mm3, mm1          /* mm3 has vcoul for both interactions */
+       movq  mm7, mm3          /* use mm7 for sum to make fscal */ 
+
+       pfmul mm5, [esp + c12]
+       pfmul mm4, [esp + c6]   
+       movq mm6, mm5   /* mm6 is vnb12-vnb6 */ 
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       pfsub mm7,mm4
+       pfadd mm7, mm5
+       pfmul mm0, mm7        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache */
+
+       /* update vctot */
+       pfadd mm3, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm3       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0
+       punpckhdq mm1,mm1
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm6       /* store the sum */      
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  2
+       jl    .i1110_finish_vdwc_inner
+       jmp   .i1110_unroll_vdwc_loop
+.i1110_finish_vdwc_inner:      
+       and [esp + innerk],  1
+       jnz  .i1110_single_vdwc_inner
+       jmp  .i1110_updateouterdata_vdwc                
+.i1110_single_vdwc_inner:      
+       /* a single j particle iteration here - compare with the unrolled code for comments */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]
+       pfmul mm3, mm5          /* mm3=qq */
+
+       mov esi, [ebp + nbfp]
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5
+
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm0=rsq */
+       
+        pfrsqrt mm0,mm4
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm1=invsqrt */
+       movq  mm1, mm0
+       pfmul mm0, mm0          /* mm0=invsq */
+       /* calculate potentials and scalar force */
+       movq mm4, mm0
+       pfmul mm4, mm0
+       pfmul mm4, mm0                  /* mm4=rinvsix */
+       movq  mm5, mm4  
+       pfmul mm5, mm5                  /* mm5=rinvtwelve */
+
+       pfmul mm3, mm1          /* mm3 has vcoul for both interactions */
+       movq  mm7, mm3          /* use mm7 for sum to make fscal */ 
+
+       pfmul mm5, [esp + c12]
+       pfmul mm4, [esp + c6]   
+       movq mm6, mm5   /* mm6 is vnb12-vnb6 */ 
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       pfsub mm7,mm4
+       pfadd mm7, mm5
+       pfmul mm0, mm7        /* mm0 is total fscal now */
+
+       /* update vctot */
+       pfadd mm3, [esp + vctot]
+       movq [esp + vctot], mm3
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm6       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i1110_updateouterdata_vdwc:   
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno */
+       dec  dword ptr [esp + nsvdwc]
+       jz  .i1110_testcoul
+       jmp .i1110_mno_vdwc
+.i1110_testcoul:       
+       mov  ecx, [esp + nscoul]
+       cmp  ecx,  0
+       jnz  .i1110_mno_coul
+       jmp  .i1110_testvdw
+.i1110_mno_coul:
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       pfmul mm2, [ebp + facel]
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+       
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i1110_unroll_coul_loop
+       jmp   .i1110_finish_coul_inner
+.i1110_unroll_coul_loop:       
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+       
+       mov ecx, [ebp + charge]        /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+       movd mm7, [ecx + ebx*4]          /* charge[jnr2] */
+       punpckldq mm3,mm7                /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs */
+        movq mm2,mm0                   /* amd 3dnow N-R iteration to get full precision */
+       pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       
+       movq mm1,mm0
+       pfmul mm0,mm0
+       /* mm0 now contains invsq, and mm1 invsqrt */
+       /* do potential and fscal */
+       prefetchw [esp + dx1]   /* prefetch i forces to cache */
+       
+       pfmul mm3,mm1           /* 6 has both vcoul */
+       pfmul mm0,mm3           /* 0 has both fscal */
+
+       /* update vctot */
+
+       pfadd mm3, [esp + vctot]      /* add the earlier value */ 
+       movq [esp + vctot], mm3       /* store the sum */
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0
+       punpckhdq mm1,mm1
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  2
+       jl    .i1110_finish_coul_inner
+       jmp   .i1110_unroll_coul_loop
+.i1110_finish_coul_inner:      
+       and [esp + innerk],  1
+       jnz  .i1110_single_coul_inner
+       jmp  .i1110_updateouterdata_coul                
+.i1110_single_coul_inner:      
+       /* a single j particle iteration here - compare with the unrolled code for comments */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm6, [esp + iq]
+       movd mm7, [ecx + eax*4]
+       pfmul mm6, mm7          /* mm6=qq */
+       
+       lea   eax, [eax + eax*2]
+       
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm2, [esi + eax*4]
+       movd  mm3, [esi + eax*4 + 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq  [esp + dx1], mm0
+       pfmul mm0,mm0
+       movd  [esp + dz1], mm1  
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfacc mm0, mm1          /* mm0=rsq */
+       
+        pfrsqrt mm1,mm0
+        movq mm2,mm1
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       movq  mm4, mm1
+       pfmul mm4, mm4          /* mm4=invsq */
+       /* calculate potential and scalar force */
+       pfmul mm6, mm1          /* mm6=vcoul */
+       pfmul mm4, mm6          /* mm4=fscalar */ 
+       /* update vctot */
+       movq mm5, [esp + vctot]
+       pfadd mm5, mm6
+       movq [esp + vctot], mm5
+       /* spread fscalar to both positions */
+       punpckldq mm4,mm4
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm0,  [esp + dx1]
+       movd mm1,  [esp + dz1]
+       pfmul mm0, mm4
+       pfmul mm1, mm4
+       /* update i particle force */
+       movq mm2,  [esp + fix]
+       movd mm3,  [esp + fiz]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fix], mm2
+       movd [esp + fiz], mm3
+       /* update j particle force */
+       movq mm2,  [edi + eax*4]
+       movd mm3,  [edi + eax *4+ 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4], mm2
+       movd [edi + eax*4 +8], mm3
+       /* done! */
+.i1110_updateouterdata_coul:   
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno */
+       dec dword ptr [esp + nscoul]
+       jz  .i1110_testvdw
+       jmp .i1110_mno_coul
+.i1110_testvdw:        
+       mov  ecx, [esp + nsvdw]
+       cmp  ecx,  0
+       jnz  .i1110_mno_vdw
+       jmp  .i1110_last_mno
+.i1110_mno_vdw:
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx 
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+       
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i1110_unroll_vdw_loop
+       jmp   .i1110_finish_vdw_inner
+.i1110_unroll_vdw_loop:        
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */  
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */ 
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */                
+       movq mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */        
+       movq mm6,mm5                    
+       punpckldq mm5,mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6,mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs */
+        movq mm2,mm0                   /* amd 3dnow N-R iteration to get full precision */
+       pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       
+       movq mm1,mm0
+       pfmul mm0,mm0
+       /* mm0 now contains invsq, and mm1 invsqrt */
+       /* do potential and fscal */
+       movq mm4, mm0
+       pfmul mm4, mm0
+       pfmul mm4, mm0                  /* mm4=rinvsix */
+       movq  mm5, mm4  
+       pfmul mm5, mm5                  /* mm5=rinvtwelve */
+
+       pfmul mm5, [esp + c12]
+       pfmul mm4, [esp + c6]   
+       movq mm6, mm5   /* mm6 is vnb12-vnb6 */ 
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       movq  mm7, mm5
+       pfsub mm7,mm4
+       pfmul mm0, mm7        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0
+       punpckhdq mm1,mm1
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm6       /* store the sum */      
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  2
+       jl    .i1110_finish_vdw_inner
+       jmp   .i1110_unroll_vdw_loop
+.i1110_finish_vdw_inner:       
+       and [esp + innerk],  1
+       jnz  .i1110_single_vdw_inner
+       jmp  .i1110_updateouterdata_vdw         
+.i1110_single_vdw_inner:       
+       /* a single j particle iteration here - compare with the unrolled code for comments */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov esi, [ebp + nbfp]
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5
+
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm0=rsq */
+       
+        pfrsqrt mm0,mm4
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm1=invsqrt */
+       movq  mm1, mm0
+       pfmul mm0, mm0          /* mm0=invsq */
+       /* calculate potentials and scalar force */
+       movq mm4, mm0
+       pfmul mm4, mm0
+       pfmul mm4, mm0                  /* mm4=rinvsix */
+       movq  mm5, mm4  
+       pfmul mm5, mm5                  /* mm5=rinvtwelve */
+
+       pfmul mm5, [esp + c12]
+       pfmul mm4, [esp + c6]   
+       movq mm6, mm5   /* mm6 is vnb12-vnb6 */ 
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       movq  mm7, mm5
+       pfsub mm7,mm4
+       pfmul mm0, mm7        /* mm0 is total fscal now */
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm6       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i1110_updateouterdata_vdw:    
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno */
+       dec dword ptr [esp + nsvdw]
+       jz  .i1110_last_mno
+       jmp .i1110_mno_vdw
+       
+.i1110_last_mno:       
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       movq  mm7, [esp + vnbtot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i1110_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i1110_outer
+.i1110_end:
+       femms
+       add esp, 160
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+/*
+ * inl1120_3dnow
+ *
+ * Nonbonded inner loop (Coulomb + Lennard-Jones) using 3DNow! packed
+ * single-precision instructions. 32-bit x86, GNU as Intel syntax.
+ *
+ * Arguments: all are read from the caller's stack through ebp, at the
+ *            offsets given by the first .equ group below (nri ... Vnb).
+ *
+ * Work done: every outer iteration handles one i entry consisting of three
+ *            consecutive atoms ii, ii+1, ii+2 (named O, H1, H2 by the local
+ *            variables - presumably a water molecule, confirm with caller).
+ *            Every inner iteration handles one j atom taken from jjnr[].
+ *            Coulomb is evaluated for all three i atoms, Lennard-Jones only
+ *            for the first one. Forces are accumulated into faction[] and
+ *            fshift[], energies into Vc[gid] and Vnb[gid].
+ *
+ * Side effects on the argument slots: nri, iinr, shift, jindex and gid are
+ *            used as loop cursors and modified in place on the stack.
+ *
+ * Registers: eax, ebx, ecx, edx, esi, edi are pushed on entry and popped on
+ *            exit; femms is issued on entry and before returning.
+ *
+ * Assumptions visible in the code:
+ *   - nri >= 1 (the outer loop body runs before nri is tested).
+ *   - every i entry has at least one j atom (the inner loop body runs
+ *     before innerk is decremented and tested with jz, so a count of 0
+ *     would wrap around instead of terminating).
+ *   - charges and the LJ type row are fetched once from the first i entry
+ *     before the outer loop, so they are shared by all i entries.
+ *   - pswapd is used below; NOTE(review): this belongs to the extended
+ *     3DNow! set - confirm the minimum CPU the caller dispatches on.
+ */
+.globl inl1120_3dnow
+       .type inl1120_3dnow,@function
+inl1120_3dnow:
+                       /* ebp offsets of the function arguments */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68
+.equ           Vnb,            72
+                       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ixO,             8
+.equ           iyO,            12
+.equ           izO,            16
+.equ           ixH,            20  /* repeated (64bit) to fill 3dnow reg */
+.equ           iyH,            28  /* repeated (64bit) to fill 3dnow reg */
+.equ           izH,            36  /* repeated (64bit) to fill 3dnow reg */
+.equ           iqO,            44  /* repeated (64bit) to fill 3dnow reg */
+.equ           iqH,            52  /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          60  /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,         68  /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             76  /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            84  /* repeated (64bit) to fill 3dnow reg */
+.equ           six,            92  /* repeated (64bit) to fill 3dnow reg */
+.equ           twelve,         100 /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,           108 /* repeated (64bit) to fill 3dnow reg */
+.equ           innerjjnr,      116
+.equ           innerk,         120
+.equ           fixO,           124
+.equ           fiyO,           128
+.equ           fizO,           132
+.equ           fixH,           136  /* repeated (64bit) to fill 3dnow reg */
+.equ           fiyH,           144  /* repeated (64bit) to fill 3dnow reg */
+.equ           fizH,           152  /* repeated (64bit) to fill 3dnow reg */
+.equ           dxO,            160
+.equ           dyO,            164
+.equ           dzO,            168
+.equ           dxH,            172  /* repeated (64bit) to fill 3dnow reg */
+.equ           dyH,            180  /* repeated (64bit) to fill 3dnow reg */
+.equ           dzH,            188  /* repeated (64bit) to fill 3dnow reg */
+       push ebp
+       mov ebp,esp
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub esp, 196            /* local stack space (dzH + 8 bytes); undone in .i1120_end */
+       femms
+       /* assume we have at least one i particle - start directly */
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm1, [ebp + facel]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[i.i0] */
+       pfmul mm2, mm1
+       movq  [esp + iqO], mm2          /* iqO = facel*charge[ii], upper half is zero (movd load) */
+
+       movd  mm2, [edx + ebx*4 + 4]    /* mm2=charge[i.i0+1] */
+       pfmul mm2, mm1
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iqH], mm2          /* iqH = facel*charge[i.i0+1] */
+
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]
+       shl   ecx, 1
+       imul  ecx, [ebp + ntype]      /* ecx = ntia = 2*ntype*type[i.i0] */
+       mov   [esp + ntia], ecx
+
+       movq  mm3, [mm_six]             /* constants defined elsewhere in this file */
+       movq  mm4, [mm_twelve]
+       movq  [esp + six],    mm3
+       movq  [esp + twelve], mm4
+.i1120_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movq  mm5, [eax + ebx*4]        /* move shX/shY to mm5 and shZ to mm6. */
+       movd  mm6, [eax + ebx*4 + 8]
+       movq  mm0, mm5
+       movq  mm1, mm5
+       movq  mm2, mm6
+       punpckldq mm0,mm0               /* also expand shX,Y,Z in mm0--mm2. */
+       punpckhdq mm1,mm1
+       punpckldq mm2,mm2
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+
+       pfadd mm5, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm7, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx          /* (use mm7 as temp. storage for iz.) */
+       pfadd mm6, mm7
+       movq  [esp + ixO], mm5
+       movq  [esp + izO], mm6          /* 8-byte store also touches the ixH slot, which is rewritten below */
+
+       movd  mm3, [eax + ebx*4 + 12]
+       movd  mm4, [eax + ebx*4 + 16]
+       movd  mm5, [eax + ebx*4 + 20]
+       punpckldq  mm3, [eax + ebx*4 + 24]
+       punpckldq  mm4, [eax + ebx*4 + 28]
+       punpckldq  mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+       pfadd mm0, mm3
+       pfadd mm1, mm4
+       pfadd mm2, mm5
+       movq [esp + ixH], mm0
+       movq [esp + iyH], mm1
+       movq [esp + izH], mm2
+
+       /* clear vctot and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       movq  [esp + vnbtot], mm7
+       movq  [esp + fixO],   mm7
+       movd  [esp + fizO],   mm7
+       movq  [esp + fixH],   mm7
+       movq  [esp + fiyH],   mm7
+       movq  [esp + fizH],   mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]           /* esi = pos[] base for the whole inner loop */
+       mov   edi, [ebp + faction]       /* edi = faction[] base until the end of this outer iteration */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+.i1120_inner_loop:
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   ecx, [esp + innerjjnr]     /* ecx = pointer to jjnr[k]; kept so the prefetch below has a valid base */
+       mov   eax, [ecx]                 /* eax=jnr offset */
+       add   dword ptr [esp + innerjjnr],  4 /* advance pointer */
+       prefetch [ecx + 16]        /* prefetch data - trial and error says 16 is best */
+
+       mov ecx, [ebp + charge]
+       movd mm7, [ecx + eax*4]
+       punpckldq mm7,mm7
+       movq mm6,mm7
+       pfmul mm6, [esp + iqO]
+       pfmul mm7, [esp + iqH]  /* mm6=qqO, mm7=qqH */
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr] */
+       mov ecx, [ebp + nbfp]
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [ecx + edx*4]         /* mm5 = 1st c6 */
+       movq [esp + c6], mm5
+       movd mm5, [ecx + edx*4 + 4]     /* mm5 = 1st c12 */
+       movq [esp + c12], mm5
+
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr = j3 */
+
+       movq  mm0, [esi + eax*4]
+       movd  mm1, [esi + eax*4 + 8]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4       /* mm2-mm4 is jx-jz, each value in both halves */
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1          /* mm0=rsqO (low half only) */
+
+       /* mm2-mm4 already hold jx-jz in both halves, no second spread needed */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+
+       pfrsqrt mm1,mm0
+
+       movq mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       movq  mm4, mm1
+       pfmul mm4, mm4          /* mm4=invsq */
+
+       movq  mm0, mm4
+       pfmul mm0, mm4
+       pfmul mm0, mm4          /* mm0=rinvsix */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm2=rintwelve */
+
+       /* calculate potential and scalar force */
+       pfmul mm6, mm1          /* mm6=vcoul */
+       movq  mm1, mm6          /* use mm1 for fscal sum */
+
+       /* LJ for the oxygen */
+       pfmul mm0, [esp + c6]
+       pfmul mm2, [esp + c12]
+
+       /* calc nb potential */
+       movq mm5, mm2
+       pfsub mm5, mm0
+
+       /* calc nb force */
+       pfmul mm0, [esp + six]
+       pfmul mm2, [esp + twelve]
+
+       /* increment scalar force */
+       pfsub mm1, mm0
+       pfadd mm1, mm2
+       pfmul mm4, mm1          /* total scalar force on oxygen. */
+
+       /* update nb potential */
+       pfadd mm5, [esp + vnbtot]
+       movq [esp + vnbtot], mm5
+
+       /* seed lookup works on the low half only, so swap to reach the second H */
+       pfrsqrt mm5, mm3
+       pswapd mm3,mm3
+       pfrsqrt mm2, mm3
+       pswapd mm3,mm3
+       punpckldq mm5,mm2       /* seeds are in mm5 now, and rsq in mm3. */
+
+       movq mm2, mm5
+       pfmul mm5,mm5
+       pfrsqit1 mm5,mm3
+       pfrcpit2 mm5,mm2       /* mm5=invsqrt */
+       movq mm3,mm5
+       pfmul mm3,mm3           /* mm3=invsq */
+       pfmul mm7, mm5          /* mm7=vcoul */
+       pfmul mm3, mm7          /* mm3=fscal for the two H's. */
+
+       /* update vctot */
+       pfadd mm7, mm6
+       pfadd mm7, [esp + vctot]
+       movq [esp + vctot], mm7
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm4,mm4
+       /* calc vectorial force for O */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm4
+       pfmul mm1, mm4
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm3
+       pfmul mm6, mm3
+       pfmul mm7, mm3
+
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+
+       pfadd mm0, mm5          /* add up total force on j particle. */
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4]
+       movd mm3,  [edi + eax*4 + 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4], mm2
+       movd [edi + eax*4 +8], mm3
+
+       /*  done  - one more? */
+       dec dword ptr [esp + innerk]    /* tested after the body: innerk must be >= 1 on entry */
+       jz  .i1120_updateouterdata
+       jmp .i1120_inner_loop
+.i1120_updateouterdata:
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment iO force */
+       movd  mm7, [edi + ecx*4 + 8]
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       /* transpose the (H1,H2) pairs into per-atom x/y and z values */
+       movq  mm0, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm1, [esp + fizH]
+       movq  mm2, mm0
+       punpckldq mm0, mm3      /* mm0(l)=fxH1, mm0(h)=fyH1 */
+       punpckhdq mm2, mm3      /* mm2(l)=fxH2, mm2(h)=fyH2 */
+       movq mm3, mm1
+       pswapd mm3,mm3
+       /* mm1 is fzH1 */
+       /* mm3 is fzH2 */
+
+       movq  mm6, [edi + ecx*4 + 12]       /* increment iH1 force */
+       movd  mm7, [edi + ecx*4 + 20]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       movq  [edi + ecx*4 + 12],  mm6
+       movd  [edi + ecx*4 + 20],  mm7
+
+       movq  mm6, [edi + ecx*4 + 24]       /* increment iH2 force */
+       movd  mm7, [edi + ecx*4 + 32]
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [edi + ecx*4 + 24],  mm6
+       movd  [edi + ecx*4 + 32],  mm7
+
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]
+       movd  mm7, [ebx + edx*4 + 8]
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]         /* 8-byte read; only the low half is stored below */
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       movq  mm7, [esp + vnbtot]
+       pfacc mm7,mm7                 /* same for Vnb */
+
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnb[gid] */
+       /* finish if last */
+       dec dword ptr [ebp + nri]
+       jz  .i1120_end
+       /* not last, iterate once more! */
+       jmp .i1120_outer
+.i1120_end:
+       femms
+       add esp, 196            /* release the local stack space reserved in the prologue */
+       pop edi
+       pop esi
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+
+       
+
+.globl inl1130_3dnow
+       .type inl1130_3dnow,@function
+/*
+ * inl1130_3dnow
+ *
+ * Nonbonded inner loop for pairs of three-site molecules (the sites are
+ * labelled O, H1 and H2 throughout this file), written with 3DNow!
+ * instructions. For every i molecule and every j molecule in its
+ * neighbour list all nine site-site pairs get a plain Coulomb interaction
+ * (qq/r); the O-O pair additionally gets a c12/r^12 - c6/r^6 term.
+ *
+ * Arguments are taken from the stack at the ebp offsets defined below
+ * (first argument at ebp+8):
+ *   nri       number of outer iterations; decremented in place, the loop
+ *             stops when it reaches zero (so it must be >= 1 on entry)
+ *   iinr      int array, one entry per outer iteration: index ii of the
+ *             first (O) site of the i molecule; H1/H2 are ii+1 and ii+2
+ *   jindex    int array; jindex[n]..jindex[n+1] delimit the j entries
+ *             in jjnr[] for outer iteration n
+ *   jjnr      int array of j molecule indices (first site of each)
+ *   shift     int array, one shift index per outer iteration
+ *   shiftvec  float array, 3 floats per shift index (added to i coords)
+ *   fshift    float array, 3 floats per shift index (force accumulated)
+ *   gid       int array, one index into Vc[]/Vnb[] per outer iteration
+ *   pos       float array, 3 floats per site
+ *   faction   float array, 3 floats per site (forces are accumulated)
+ *   charge    float array, one per site
+ *   facel     float passed by value, multiplied into all charge products
+ *   Vc        float array, Coulomb energy accumulated at Vc[gid]
+ *   type      int array, one per site
+ *   ntype     int passed by value
+ *   nbfp      float array; c6 is read at index 2*type*(ntype+1) and
+ *             c12 at the following element
+ *   Vnb       float array, c12/c6 energy accumulated at Vnb[gid]
+ *
+ * The charge products and c6/c12 are computed once, from the first entry
+ * of iinr[], and reused for every i and j molecule.
+ *
+ * Every neighbour list must hold at least one j entry: the inner loop
+ * decrements its counter before testing it.
+ *
+ * All general purpose registers that are used are saved and restored.
+ * femms is issued on entry and on exit.
+ */
+inl1130_3dnow:
+                       /* ebp offsets of the stack arguments */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68
+.equ           Vnb,            72
+                       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ixO,             8
+.equ           iyO,            12
+.equ           izO,            16
+.equ           ixH,            20  /* repeated (64bit) to fill 3dnow reg */
+.equ           iyH,            28  /* repeated (64bit) to fill 3dnow reg */
+.equ           izH,            36  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqOO,           44  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqOH,           52  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqHH,           60  /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             68  /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            76  /* repeated (64bit) to fill 3dnow reg */
+.equ           six,            84  /* repeated (64bit) to fill 3dnow reg */
+.equ           twelve,         92  /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          100 /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,         108 /* repeated (64bit) to fill 3dnow reg */
+.equ           innerjjnr,      116
+.equ           innerk,         120
+.equ           fixO,           124
+.equ           fiyO,           128
+.equ           fizO,           132
+.equ           fixH,           136 /* repeated (64bit) to fill 3dnow reg */
+.equ           fiyH,           144 /* repeated (64bit) to fill 3dnow reg */
+.equ           fizH,           152 /* repeated (64bit) to fill 3dnow reg */
+.equ           dxO,            160
+.equ           dyO,            164
+.equ           dzO,            168
+.equ           dxH,            172 /* repeated (64bit) to fill 3dnow reg */
+.equ           dyH,            180 /* repeated (64bit) to fill 3dnow reg */
+.equ           dzH,            188 /* repeated (64bit) to fill 3dnow reg */
+       push ebp
+       mov ebp,esp
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub esp, 196            /* local stack space (dzH + 8 bytes) */
+       femms
+       /* assume we have at least one i particle - start directly */
+
+       /* one-time setup: charge products and c6/c12 from the first i molecule */
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm1, [ebp + facel]        /* mm1=facel */
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[i.i0] (O) */
+       movd  mm3, [edx + ebx*4 + 4]    /* mm3=charge[i.i0+1] (H) */
+       movq  mm4, mm2
+       pfmul mm4, mm1
+       movq  mm6, mm3
+       pfmul mm6, mm1
+       movq  mm5, mm4
+       pfmul mm4, mm2                  /* mm4=qqOO*facel */
+       pfmul mm5, mm3                  /* mm5=qqOH*facel */
+       pfmul mm6, mm3                  /* mm6=qqHH*facel */
+       punpckldq mm5,mm5               /* spread to both halves */
+       punpckldq mm6,mm6               /* spread to both halves */
+       movq  [esp + qqOO], mm4         /* only the low half is read back (movd) */
+       movq  [esp + qqOH], mm5
+       movq  [esp + qqHH], mm6
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]        /* ecx = type[ii] */
+       shl   ecx, 1
+       mov   edx, ecx
+       imul  ecx, [ebp + ntype]
+       add   edx, ecx                  /* edx = 2*type*(ntype+1) */
+       mov   eax, [ebp + nbfp]
+       movd  mm0, [eax + edx*4]        /* c6 */
+       movd  mm1, [eax + edx*4 + 4]    /* c12 */
+       movq  [esp + c6], mm0
+       movq  [esp + c12], mm1
+       movq  mm2, [mm_six]
+       movq  mm3, [mm_twelve]
+       movq  [esp + six], mm2
+       movq  [esp + twelve], mm3
+.i1130_outer:
+       mov   eax, [ebp + shift]        /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift], 4    /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]     /* eax = base of shiftvec[] */
+
+       movq  mm5, [eax + ebx*4]        /* move shX/shY to mm5 and shZ to mm6. */
+       movd  mm6, [eax + ebx*4 + 8]
+       movq  mm0, mm5
+       movq  mm1, mm5
+       movq  mm2, mm6
+       punpckldq mm0,mm0               /* also expand shX,Y,Z in mm0--mm2. */
+       punpckhdq mm1,mm1
+       punpckldq mm2,mm2
+
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4     /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]          /* eax = base of pos[] */
+
+       pfadd mm5, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm7, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx          /* (use mm7 as temp. storage for iz.) */
+       pfadd mm6, mm7
+       movq  [esp + ixO], mm5
+       movq  [esp + izO], mm6          /* 8-byte store; the ixH slot above it is rewritten below */
+
+       movd  mm3, [eax + ebx*4 + 12]
+       movd  mm4, [eax + ebx*4 + 16]
+       movd  mm5, [eax + ebx*4 + 20]
+       punpckldq  mm3, [eax + ebx*4 + 24]
+       punpckldq  mm4, [eax + ebx*4 + 28]
+       punpckldq  mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+       pfadd mm0, mm3
+       pfadd mm1, mm4
+       pfadd mm2, mm5
+       movq [esp + ixH], mm0
+       movq [esp + iyH], mm1
+       movq [esp + izH], mm2
+
+       /* clear vctot, vnbtot and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       movq  [esp + vnbtot], mm7
+       movq  [esp + fixO],  mm7
+       movq  [esp + fizO],  mm7
+       movq  [esp + fixH],  mm7
+       movq  [esp + fiyH],  mm7
+       movq  [esp + fizH],  mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+.i1130_inner_loop:
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]                 /* eax=jnr offset */
+       add   dword ptr [esp + innerjjnr], 4 /* advance pointer */
+
+       /* interactions with j O: i-O in the low half only, i-H1/H2 packed */
+       movd  mm6, [esp + qqOO]
+       movq  mm7, [esp + qqOH]
+
+       lea   eax, [eax + eax*2]         /* eax = 3*jnr, kept for the whole iteration */
+       movq  mm0, [esi + eax*4]
+       movd  mm1, [esi + eax*4 + 8]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm0
+       pfadd mm0, mm1          /* mm0=rsqO */
+
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4      /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+
+       pfrsqrt mm1,mm0
+
+       movq mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2        /* mm1=invsqrt (OO) */
+       movq  mm4, mm1
+       pfmul mm4, mm4          /* mm4=invsq (OO) */
+
+       movq mm2, mm4
+       pfmul mm2, mm4
+       pfmul mm2, mm4          /* mm2=invsq^3 */
+       movq mm0, mm2
+       pfmul mm0,mm0           /* mm0=invsq^6 */
+       pfmul mm2, [esp + c6]
+       pfmul mm0, [esp + c12]
+       movq mm5, mm0
+       pfsub mm5, mm2          /* vnb */
+
+       pfmul mm2, [esp + six]
+       pfmul mm0, [esp + twelve]
+
+       pfsub mm0, mm2
+
+       /* calculate potential and scalar force */
+       pfmul mm6, mm1          /* mm6=vcoul */
+       pfadd mm0, mm6
+       pfmul mm4, mm0          /* mm4=fscalar */
+
+       /* update nb potential */
+       pfadd mm5, [esp + vnbtot]
+       movq [esp + vnbtot], mm5
+
+       pfrsqrt mm5, mm3
+       pswapd mm3,mm3
+       pfrsqrt mm2, mm3
+       pswapd mm3,mm3
+       punpckldq mm5,mm2       /* seeds are in mm5 now, and rsq in mm3 */
+
+       movq mm2, mm5
+       pfmul mm5,mm5
+       pfrsqit1 mm5,mm3
+       pfrcpit2 mm5,mm2        /* mm5=invsqrt */
+       movq mm3,mm5
+       pfmul mm3,mm3           /* mm3=invsq */
+       pfmul mm7, mm5          /* mm7=vcoul */
+       pfmul mm3, mm7          /* mm3=fscal for the two H's. */
+
+       /* update vctot */
+       pfadd mm7, mm6
+       pfadd mm7, [esp + vctot]
+       movq [esp + vctot], mm7
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm4,mm4
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm4
+       pfmul mm1, mm4
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm3
+       pfmul mm6, mm3
+       pfmul mm7, mm3
+
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+
+       pfadd mm0, mm5          /* add up total force on j particle. */
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4]
+       movd mm3,  [edi + eax*4 + 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4], mm2
+       movd [edi + eax*4 +8], mm3
+
+       /* interactions with j H1 */
+       movq  mm0, [esi + eax*4 + 12]
+       movd  mm1, [esi + eax*4 + 20]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       movd mm6, [esp + qqOH]
+       movq mm7, [esp + qqHH]
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1          /* mm0=rsqO (low half) */
+
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4      /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+
+       pfrsqrt mm1,mm0
+
+       movq mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2        /* mm1=invsqrt */
+       movq  mm4, mm1
+       pfmul mm4, mm4          /* mm4=invsq */
+       /* calculate potential and scalar force */
+       pfmul mm6, mm1          /* mm6=vcoul */
+       pfmul mm4, mm6          /* mm4=fscalar */
+
+       pfrsqrt mm5, mm3
+       pswapd mm3,mm3
+       pfrsqrt mm2, mm3
+       pswapd mm3,mm3
+       punpckldq mm5,mm2       /* seeds are in mm5 now, and rsq in mm3 */
+
+       movq mm2, mm5
+       pfmul mm5,mm5
+       pfrsqit1 mm5,mm3
+       pfrcpit2 mm5,mm2        /* mm5=invsqrt */
+       movq mm3,mm5
+       pfmul mm3,mm3           /* mm3=invsq */
+       pfmul mm7, mm5          /* mm7=vcoul */
+       pfmul mm3, mm7          /* mm3=fscal for the two H's. */
+
+       /* update vctot */
+       pfadd mm7, mm6
+       pfadd mm7, [esp + vctot]
+       movq [esp + vctot], mm7
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm4,mm4
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm4
+       pfmul mm1, mm4
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm3
+       pfmul mm6, mm3
+       pfmul mm7, mm3
+
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+
+       pfadd mm0, mm5          /* add up total force on j particle. */
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4 + 12]
+       movd mm3,  [edi + eax*4 + 20]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4 + 12], mm2
+       movd [edi + eax*4 + 20], mm3
+
+       /* interactions with j H2 */
+       movq  mm0, [esi + eax*4 + 24]
+       movd  mm1, [esi + eax*4 + 32]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       movd mm6, [esp + qqOH]
+       movq mm7, [esp + qqHH]
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1          /* mm0=rsqO (low half) */
+
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4      /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+
+       pfrsqrt mm1,mm0
+
+       movq mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2        /* mm1=invsqrt */
+       movq  mm4, mm1
+       pfmul mm4, mm4          /* mm4=invsq */
+       /* calculate potential and scalar force */
+       pfmul mm6, mm1          /* mm6=vcoul */
+       pfmul mm4, mm6          /* mm4=fscalar */
+
+       pfrsqrt mm5, mm3
+       pswapd mm3,mm3
+       pfrsqrt mm2, mm3
+       pswapd mm3,mm3
+       punpckldq mm5,mm2       /* seeds are in mm5 now, and rsq in mm3. */
+
+       movq mm2, mm5
+       pfmul mm5,mm5
+       pfrsqit1 mm5,mm3
+       pfrcpit2 mm5,mm2        /* mm5=invsqrt */
+       movq mm3,mm5
+       pfmul mm3,mm3           /* mm3=invsq */
+       pfmul mm7, mm5          /* mm7=vcoul */
+       pfmul mm3, mm7          /* mm3=fscal for the two H's. */
+
+       /* update vctot */
+       pfadd mm7, mm6
+       pfadd mm7, [esp + vctot]
+       movq [esp + vctot], mm7
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm4,mm4
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm4
+       pfmul mm1, mm4
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm3
+       pfmul mm6, mm3
+       pfmul mm7, mm3
+
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+
+       pfadd mm0, mm5          /* add up total force on j particle. */
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4 + 24]
+       movd mm3,  [edi + eax*4 + 32]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4 + 24], mm2
+       movd [edi + eax*4 + 32], mm3
+
+       /*  done  - one more? */
+       dec dword ptr [esp + innerk]
+       jz  .i1130_updateouterdata
+       jmp .i1130_inner_loop
+.i1130_updateouterdata:
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment iO force */
+       movd  mm7, [edi + ecx*4 + 8]
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       /* unpack the packed (H1,H2) accumulators into per-atom x/y and z */
+       movq  mm0, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm1, [esp + fizH]
+       movq  mm2, mm0
+       punpckldq mm0, mm3      /* mm0(l)=fxH1, mm0(h)=fyH1 */
+       punpckhdq mm2, mm3      /* mm2(l)=fxH2, mm2(h)=fyH2 */
+       movq mm3, mm1
+       pswapd mm3,mm3
+       /* mm1 is fzH1 */
+       /* mm3 is fzH2 */
+
+       movq  mm6, [edi + ecx*4 + 12]       /* increment iH1 force */
+       movd  mm7, [edi + ecx*4 + 20]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       movq  [edi + ecx*4 + 12],  mm6
+       movd  [edi + ecx*4 + 20],  mm7
+
+       movq  mm6, [edi + ecx*4 + 24]       /* increment iH2 force */
+       movd  mm7, [edi + ecx*4 + 32]
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [edi + ecx*4 + 24],  mm6
+       movd  [edi + ecx*4 + 32],  mm7
+
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]
+       movd  mm7, [ebx + edx*4 + 8]
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid], 4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       movq  mm7, [esp + vnbtot]
+       pfacc mm7,mm7                 /* same for Vnb */
+
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnb[gid] */
+       /* finish if last */
+       dec dword ptr [ebp + nri]
+       jz  .i1130_end
+       /* not last, iterate once more! */
+       jmp .i1130_outer
+.i1130_end:
+       femms
+       add esp, 196
+       pop edi
+       pop esi
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+
+
+
+
+.globl inl3000_3dnow
+       .type inl3000_3dnow,@function
+inl3000_3dnow: 
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           tabscale,       60
+.equ           VFtab,          64
+       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ix,              8
+.equ           iy,             12
+.equ           iz,             16
+.equ           iq,             20 /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          28 /* repeated (64bit) to fill 3dnow reg */
+.equ           two,            36 /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,             44 /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,            52 /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,           60
+.equ           innerjjnr,      64
+.equ           innerk,         68              
+.equ           fix,            72
+.equ           fiy,            76
+.equ           fiz,            80
+.equ           dx1,            84
+.equ           dy1,            88
+.equ           dz1,            92
+.equ           dx2,            96
+.equ           dy2,            100
+.equ           dz2,            104                                             
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 108            /* local stack space */
+       femms
+       /* move data to local stack */ 
+       movq  mm0, [mm_two]
+       movd  mm3, [ebp + tabscale]
+       movq  [esp + two],    mm0
+       punpckldq mm3,mm3
+       movq  [esp + tsc], mm3  
+       /* assume we have at least one i particle - start directly */   
+.i3000_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       pfmul mm2, [ebp + facel]
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       
+       pfadd mm0, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm3, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx
+       pfadd mm1, mm3
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+                               
+       /* clear total potential and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot],  mm7
+       movq  [esp + fix],    mm7
+       movd  [esp + fiz],    mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  2
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3000_unroll_loop
+       jmp   .i3000_finish_inner
+.i3000_unroll_loop:
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+       
+       mov ecx, [ebp + charge]        /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+        punpckldq mm3, [ecx + ebx*4]     /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs. */
+        movq mm2,mm0                   /* amd 3dnow N-R iteration to get full precision. */
+       pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+       
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* coulomb table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       /* at this point mm5 contains vcoul and mm3 fijC. */
+       /* increment vcoul - then we can get rid of mm5. */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3
+       pfmul mm1, [esp + tsc]  
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0
+       punpckhdq mm1,mm1
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  2
+       jl    .i3000_finish_inner
+       jmp   .i3000_unroll_loop
+.i3000_finish_inner:   
+       and [esp + innerk],  1
+       jnz  .i3000_single_inner
+       jmp  .i3000_updateouterdata             
+.i3000_single_inner:
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]
+       pfmul mm3, mm5          /* mm3=qq */
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm0=rsq */
+       
+        pfrsqrt mm0,mm4
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm1=invsqrt */
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+       
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm0, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i3000_updateouterdata:        
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3000_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i3000_outer
+.i3000_end:
+       femms
+       add esp, 108
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+       
+/*
+ * inl3010_3dnow
+ *
+ * 3DNow! inner loop that evaluates a tabulated Coulomb interaction.
+ * Only a Coulomb table term is computed in this routine.
+ *
+ * Arguments are read from the caller's stack through ebp (offsets in the
+ * first .equ group).  For each of the nri outer entries the routine:
+ *   - reads shift[n] and loads the matching shiftvec[] triple,
+ *   - reads the first i index from iinr[n] and the i-atom count (nscoul)
+ *     from the nsatoms array,
+ *   - for each of those nscoul consecutive i atoms walks the neighbours
+ *     jjnr[jindex[n]] .. jjnr[jindex[n+1]-1], two j atoms per pass plus an
+ *     optional single leftover,
+ *   - subtracts each pair force from faction[j] and adds the accumulated
+ *     i force to faction[i] and fshift[shift[n]],
+ *   - adds the summed potential to Vc[gid[n]].
+ *
+ * Table lookup: n1 = int(r*tabscale), eps = r*tabscale - n1.  Four
+ * consecutive floats per table point (called Y, F, G, H here) give
+ *   VV = Y + eps*(F + G*eps + H*eps*eps)
+ *   FF = F + 2*G*eps + 3*H*eps*eps
+ *
+ * The stack copies of nri, iinr, jindex, shift, gid and nsatoms are
+ * modified in place while iterating.  eax, ebx, ecx, edx, esi and edi are
+ * saved on entry and restored on exit; femms is issued on entry and exit.
+ * The loop is entered without checking nri, so at least one outer entry
+ * is assumed.
+ */
+.globl inl3010_3dnow
+       .type inl3010_3dnow,@function
+inl3010_3dnow:
+       /* offsets of the arguments relative to ebp */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+.equ           tabscale,       60
+.equ           VFtab,          64
+.equ           nsatoms,        68
+       /* stack offsets for local variables (relative to esp) */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           shX,             8
+.equ           shY,            12
+.equ           shZ,            16
+.equ           ix,             20
+.equ           iy,             24
+.equ           iz,             28
+.equ           iq,             32 /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          40 /* repeated (64bit) to fill 3dnow reg */
+.equ           two,            48 /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,             56 /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,            64 /* repeated (64bit) to fill 3dnow reg */
+.equ           innerjjnr0,     72
+.equ           innerk0,        76
+.equ           innerjjnr,      80
+.equ           innerk,         84
+.equ           fix,            88
+.equ           fiy,            92
+.equ           fiz,            96
+.equ           dx1,            100
+.equ           dy1,            104
+.equ           dz1,            108
+.equ           dx2,            112
+.equ           dy2,            116
+.equ           dz2,            120
+.equ           nscoul,         124
+.equ           solnr,          128
+       push ebp
+       mov ebp,esp
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub esp, 132            /* local stack space */
+       femms
+
+       /* step the nsatoms pointer 8 bytes forward once; the outer loop */
+       /* reads one int there and then advances by 12 bytes per entry.  */
+       add   dword ptr [ebp + nsatoms],  8
+       movq  mm2, [mm_two]
+       movq  [esp + two], mm2
+       movd  mm3, [ebp + tabscale]
+       punpckldq mm3,mm3               /* tabscale in both halves */
+       movq  [esp + tsc], mm3
+
+       /* assume we have at least one i particle - start directly */
+.i3010_outer:
+       mov   eax, [ebp + shift]        /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]     /* eax = base of shiftvec[] */
+
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+       movq  [esp + shX], mm0
+       movd  [esp + shZ], mm1
+
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   eax, [ebp + nsatoms]
+       mov   ecx, [eax]                /* ecx = number of i atoms to loop over */
+       add   dword ptr [ebp + nsatoms],  12
+       mov   [esp + nscoul], ecx
+
+       /* clear potential */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       mov   [esp + solnr], ebx        /* index of the first i atom */
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax    /* pointer to jjnr[nj0] */
+
+       mov   [esp + innerk0], edx       /* number of innerloop atoms */
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   ecx, [esp + nscoul]
+       test  ecx, ecx                   /* no i atoms for this entry? */
+       jnz  .i3010_mno_coul
+       jmp  .i3010_last_mno
+.i3010_mno_coul:
+       /* one pass per i atom; solnr is the current atom and is bumped here */
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       pfmul mm2, [ebp + facel]
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]          /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0           /* shifted i coordinates */
+       movd  [esp + iz], mm1
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       /* restart the neighbour walk for this i atom */
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+       sub   edx,  2
+       mov   [esp + innerk], edx       /* number of innerloop atoms (mov keeps the flags of sub) */
+       jge   .i3010_unroll_coul_loop
+       jmp   .i3010_finish_coul_inner
+.i3010_unroll_coul_loop:
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   dword ptr [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+
+       mov ecx, [ebp + charge]          /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+       punpckldq mm3, [ecx + ebx*4]     /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4,mm0                   /* dr = ir - jr */
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */
+       pfmul mm5,mm5
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */
+       movd  mm7, [esi + ebx*4 + 8]
+
+       pfsubr mm6,mm0                   /* dr = ir - jr */
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+       pfrsqrt mm0, mm4                 /* lookup inverse square root seed */
+       pfrsqrt mm1, mm6
+
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6                /* now 4 has rsq and 0 the seed for both pairs. */
+       movq mm2,mm0                     /* amd 3dnow N-R iteration to get full precision. */
+       pfmul mm0,mm0
+       pfrsqit1 mm0,mm4
+       pfrcpit2 mm0,mm2
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movq [esp + n1], mm4    /* both table indices: n1 and n1+4 */
+       pi2fd mm4,mm4
+       pfsub mm1, mm4          /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2           /* mm1 is eps, mm2 is eps2 */
+
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2              /* four floats per table point */
+       /* coulomb table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4] /* index for the second j atom */
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */
+
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */
+
+       /* change sign of mm3 */
+       pxor mm1,mm1
+       pfsub mm1, mm3
+       pfmul mm1, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0       /* mm0 = fscal for j atom 1 */
+       punpckhdq mm1,mm1       /* mm1 = fscal for j atom 2 */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 + 8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  2
+       jl    .i3010_finish_coul_inner
+       jmp   .i3010_unroll_coul_loop
+.i3010_finish_coul_inner:
+       /* innerk is negative here; its lowest bit tells whether one j atom is left */
+       and dword ptr [esp + innerk],  1
+       jnz  .i3010_single_coul_inner
+       jmp  .i3010_updateouterdata_coul
+.i3010_single_coul_inner:
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]
+       pfmul mm3, mm5          /* mm3=qq */
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* low mm4 = rsq */
+
+       pfrsqrt mm0,mm4
+       movq mm2,mm0
+       pfmul mm0,mm0
+       pfrsqit1 mm0,mm4
+       pfrcpit2 mm0,mm2        /* mm0 = invsqrt */
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4          /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2           /* mm1 is eps, mm2 is eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2              /* four floats per table point */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */
+
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */
+
+       /* change sign of mm3 */
+       pxor mm1,mm1
+       pfsub mm1, mm3
+       pfmul mm0, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 + 8], mm1
+       /* done! */
+.i3010_updateouterdata_coul:
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4],     mm6
+       movd  [edi + ecx*4 + 8], mm7
+
+       mov   ebx, [ebp + fshift]      /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]
+       movd  mm7, [ebx + edx*4 + 8]
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno */
+       dec dword ptr [esp + nscoul]
+       jz  .i3010_last_mno
+       jmp .i3010_mno_coul
+.i3010_last_mno:
+       mov   edx, [ebp + gid]         /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]
+       pfacc mm7,mm7                  /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6       /* increment vc[gid] */
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3010_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i3010_outer
+.i3010_end:
+       femms
+       add esp, 132            /* release the local stack space */
+       pop edi
+       pop esi
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+
+       
+
+
+.globl inl3020_3dnow
+       .type inl3020_3dnow,@function
+inl3020_3dnow: 
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           tabscale,       60
+.equ           VFtab,          64
+                       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ixO,             8
+.equ           iyO,            12
+.equ           izO,            16      
+.equ           ixH,            20  /* repeated (64bit) to fill 3dnow reg */
+.equ           iyH,            28  /* repeated (64bit) to fill 3dnow reg */
+.equ           izH,            36  /* repeated (64bit) to fill 3dnow reg */
+.equ           iqO,            44  /* repeated (64bit) to fill 3dnow reg */
+.equ           iqH,            52  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqO,            60  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqH,            68  /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          76  /* repeated (64bit) to fill 3dnow reg */
+.equ           two,            84  /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,             92  /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,            100 /* repeated (64bit) to fill 3dnow reg */
+.equ           innerjjnr,      108
+.equ           innerk,         112     
+.equ           fixO,           116
+.equ           fiyO,           120
+.equ           fizO,           124
+.equ           fixH,           128 /* repeated (64bit) to fill 3dnow reg */
+.equ           fiyH,           136 /* repeated (64bit) to fill 3dnow reg */
+.equ           fizH,           144 /* repeated (64bit) to fill 3dnow reg */
+.equ           dxO,            152
+.equ           dyO,            156
+.equ           dzO,            160
+.equ           dxH,            164 /* repeated (64bit) to fill 3dnow reg */
+.equ           dyH,            172 /* repeated (64bit) to fill 3dnow reg */
+.equ           dzH,            180 /* repeated (64bit) to fill 3dnow reg */
+.equ           tmprsqH,        188 /* repeated (64bit) to fill 3dnow reg */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 196            /* local stack space */
+       femms
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm1, [ebp + facel]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[i.i0] */
+       pfmul mm2, mm1          
+       movq  [esp + iqO], mm2          /* iqO = facel*charge[ii] */
+       
+       movd  mm2, [edx + ebx*4 + 4]    /* mm2=charge[i.i0+1] */
+       pfmul mm2, mm1
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iqH], mm2          /* iqH = facel*charge[i.i0+1] */
+
+       movq  mm3, [mm_two]
+       movd  mm4, [ebp + tabscale]
+       punpckldq mm4,mm4               /* spread to both halves */
+       movq  [esp + two],    mm3
+       movq  [esp + tsc], mm4        
+       /* assume we have at least one i particle - start directly */    
+.i3020_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm5, [eax + ebx*4]        /* move shX/shY to mm5 and shZ to mm6. */
+       movd  mm6, [eax + ebx*4 + 8]
+       movq  mm0, mm5
+       movq  mm1, mm5
+       movq  mm2, mm6
+       punpckldq mm0,mm0               /* also expand shX,Y,Z in mm0--mm2. */
+       punpckhdq mm1,mm1
+       punpckldq mm2,mm2               
+       
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       
+       pfadd mm5, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm7, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx          /* (use mm7 as temp. storage for iz.) */
+       pfadd mm6, mm7
+       movq  [esp + ixO], mm5  
+       movq  [esp + izO], mm6
+
+       movd  mm3, [eax + ebx*4 + 12]
+       movd  mm4, [eax + ebx*4 + 16]
+       movd  mm5, [eax + ebx*4 + 20]
+       punpckldq  mm3, [eax + ebx*4 + 24]
+       punpckldq  mm4, [eax + ebx*4 + 28]
+       punpckldq  mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+       
+       pfadd mm0, mm3
+       pfadd mm1, mm4
+       pfadd mm2, mm5          
+       movq [esp + ixH], mm0   
+       movq [esp + iyH], mm1   
+       movq [esp + izH], mm2   
+                                       
+       /* clear vctot and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       movq  [esp + fixO],   mm7
+       movd  [esp + fizO],   mm7
+       movq  [esp + fixH],   mm7
+       movq  [esp + fiyH],   mm7
+       movq  [esp + fizH],   mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   [esp + innerk], edx        
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+.i3020_inner_loop:     
+       /* a single j particle iteration */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]         /* eax=jnr offset */
+        add   [esp + innerjjnr],  4 /* advance pointer */
+       prefetch [ecx + 16]        /* prefetch data - trial and error says 16 is best */
+
+       mov ecx, [ebp + charge]
+       movd mm7, [ecx + eax*4]
+       punpckldq mm7,mm7
+       movq mm6,mm7
+       pfmul mm6, [esp + iqO]
+       pfmul mm7, [esp + iqH]   /* mm6=qqO, mm7=qqH */
+       movd [esp + qqO], mm6
+       movq [esp + qqH], mm7
+               
+       lea   eax, [eax + eax*2]
+       
+       movq  mm0, [esi + eax*4]
+       movd  mm1, [esi + eax*4 + 8]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+       
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+               
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1  
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1          /* mm0=rsqO */
+       
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+       
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+       
+        pfrsqrt mm1,mm0
+
+        movq mm2,mm1
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+
+       pfmul mm0, mm1          /* mm0=r */
+
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqO]  /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqO]  /* fijC=qq*FF */
+       /* update vctot directly, use mm3 for fscal sum. */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       movq mm3, mm7   
+
+       /* change sign of fscal and multiply with rinv */ 
+        pxor mm0,mm0
+       pfsubr mm3, mm0 
+       pfmul mm3, [esp + tsc]
+       pfmul mm3, mm1        /* mm3 is total fscal (for the oxygen) now */     
+       
+       /* Ready with the oxygen - potential is updated, fscal is in mm3. */
+       /* now do the two hydrogens. */
+        
+       movq mm0, [esp + tmprsqH] /* mm0=r */sqH
+
+       pfrsqrt mm1, mm0
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqH]  /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqH]  /* fijC=qq*FF */
+
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       
+       /* change sign of fijC and multiply by rinv */
+        pxor mm4,mm4
+       pfsub mm4, mm7  
+       pfmul mm4, [esp + tsc]
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */  
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+       
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+       
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+       
+       pfadd mm0, mm5          /* add up total force on j particle. */ 
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4]
+       movd mm3,  [edi + eax*4 + 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4], mm2
+       movd [edi + eax*4 + 8], mm3
+       
+       /*  done  - one more? */
+       dec dword ptr [esp + innerk]
+       jz  .i3020_updateouterdata
+       jmp .i3020_inner_loop
+.i3020_updateouterdata:        
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment iO force */ 
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       movq  mm0, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm1, [esp + fizH]
+       movq  mm2, mm0
+       punpckldq mm0, mm3      /* mm0(l)=fxH1, mm0(h)=fyH1 */
+       punpckhdq mm2, mm3      /* mm2(l)=fxH2, mm2(h)=fyH2 */
+       movq mm3, mm1
+       pswapd mm3, mm3 
+       /* mm1 is fzH1 */
+       /* mm3 is fzH2 */
+       
+       movq  mm6, [edi + ecx*4 + 12]       /* increment iH1 force */ 
+       movd  mm7, [edi + ecx*4 + 20]   
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       movq  [edi + ecx*4 + 12],  mm6
+       movd  [edi + ecx*4 + 20],  mm7
+       
+       movq  mm6, [edi + ecx*4 + 24]       /* increment iH2 force */
+       movd  mm7, [edi + ecx*4 + 32]   
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [edi + ecx*4 + 24],  mm6
+       movd  [edi + ecx*4 + 32],  mm7
+
+       
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+       
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       /* finish if last */
+       dec dword ptr [ebp + nri]
+       jz  .i3020_end
+       /* not last, iterate once more! */
+       jmp .i3020_outer
+.i3020_end:
+       femms
+       add esp, 196
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+       
+
+/*
+ * inl3030_3dnow - 3DNow! inner loop (GNU as, Intel syntax, 32-bit x86).
+ *
+ * All arguments are read from the caller's stack through ebp, at the
+ * offsets named by the first group of .equ symbols below ([ebp + 8] is
+ * the first argument).  Locals live in a 188-byte area addressed through
+ * esp, at the offsets named by the second group.
+ *
+ * For each of the nri entries the code loads a three-site i molecule
+ * (O, H1, H2 at consecutive 12-byte positions in pos[]), adds the shift
+ * vector, and loops over the j entries jjnr[jindex[n] .. jindex[n+1]-1],
+ * each of which is also read as three sites.  Every site-site interaction
+ * is evaluated from the table VFtab (scaled by tabscale); forces are
+ * accumulated into faction[] and fshift[], the potential into Vc[gid].
+ *
+ * The charge products are computed once, before the outer loop, from the
+ * first entry of iinr[].
+ *
+ * eax, ebx, ecx, edx, esi and edi are saved and restored.  mm0-mm7 are
+ * used freely; femms is issued on entry and on exit.
+ * NOTE(review): presumably the water-water tabulated-Coulomb kernel -
+ * confirm against the C reference loop.
+ */
+.globl inl3030_3dnow
+       .type inl3030_3dnow,@function
+inl3030_3dnow: 
+                       /* argument offsets relative to ebp */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           tabscale,       60
+.equ           VFtab,          64
+                       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ixO,             8
+.equ           iyO,             12
+.equ           izO,             16     
+.equ           ixH,             20  /* repeated (64bit) to fill 3dnow reg */
+.equ           iyH,             28  /* repeated (64bit) to fill 3dnow reg */
+.equ           izH,             36  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqOO,            44  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqOH,            52  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqHH,            60  /* repeated (64bit) to fill 3dnow reg */
+.equ           two,             68  /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,              76  /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,             84  /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,           92  /* repeated (64bit) to fill 3dnow reg */
+.equ           innerjjnr,       100
+.equ           innerk,          104    
+.equ           fixO,            108
+.equ           fiyO,            112
+.equ           fizO,            116
+.equ           fixH,            120 /* repeated (64bit) to fill 3dnow reg */
+.equ           fiyH,            128 /* repeated (64bit) to fill 3dnow reg */
+.equ           fizH,            136 /* repeated (64bit) to fill 3dnow reg */
+.equ           dxO,             144
+.equ           dyO,             148
+.equ           dzO,             152
+.equ           dxH,             156 /* repeated (64bit) to fill 3dnow reg */
+.equ           dyH,             164 /* repeated (64bit) to fill 3dnow reg */
+.equ           dzH,             172 /* repeated (64bit) to fill 3dnow reg */
+.equ           tmprsqH,         180 /* repeated (64bit) to fill 3dnow reg */
+       /* prologue: frame pointer for argument access, then save every */
+       /* general register this routine touches                        */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 188            /* local stack space */
+       femms
+       /* assume we have at least one i particle - start directly */   
+
+       /* charge products, taken from the first i entry only; iinr is */
+       /* not advanced here                                            */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm1, [ebp + facel]        /* mm1=facel */
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[i.i0] (O) */
+       movd  mm3, [edx + ebx*4 + 4]    /* mm3=charge[i.i0+1] (H) */ 
+       movq  mm4, mm2  
+       pfmul mm4, mm1                  /* mm4=qO*facel */
+       movq  mm6, mm3
+       pfmul mm6, mm1                  /* mm6=qH*facel */
+       movq  mm5, mm4
+       pfmul mm4, mm2                  /* mm4=qqOO*facel */
+       pfmul mm5, mm3                  /* mm5=qqOH*facel */
+       pfmul mm6, mm3                  /* mm6=qqHH*facel */
+       punpckldq mm5,mm5               /* spread to both halves */
+       punpckldq mm6,mm6               /* spread to both halves */
+       /* qqOO is stored without spreading: only its low half holds the product */
+       movq  [esp + qqOO], mm4
+       movq  [esp + qqOH], mm5
+       movq  [esp + qqHH], mm6
+       /* mm_two is defined elsewhere in the file - presumably packed 2.0 */
+       movq  mm2, [mm_two]
+       movq  [esp + two], mm2
+       movd  mm3, [ebp + tabscale]
+       punpckldq mm3,mm3               /* tabscale in both halves */
+       movq  [esp + tsc], mm3
+.i3030_outer:
+       /*
+        * Outer-loop head: set up one i molecule.
+        * Consumes one entry each of shift[], iinr[] and jindex[] (the
+        * argument slots on the stack are used as running pointers), stores
+        * the shifted i coordinates, clears the accumulators and leaves
+        *   esi = pos, edi = faction, [esp+innerjjnr] = &jjnr[jindex[n]],
+        *   [esp+innerk] = jindex[n+1] - jindex[n]
+        * for the inner loop.  The inner loop is entered unconditionally, so
+        * at least one j entry per i entry is assumed.
+        * Memory-immediate adds carry an explicit "dword ptr": without it the
+        * operand size is ambiguous in Intel syntax.
+        */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift], 4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movq  mm5, [eax + ebx*4]        /* move shX/shY to mm5 and shZ to mm6. */
+       movd  mm6, [eax + ebx*4 + 8]
+       movq  mm0, mm5
+       movq  mm1, mm5
+       movq  mm2, mm6
+       punpckldq mm0,mm0               /* also expand shX,Y,Z in mm0--mm2. */
+       punpckhdq mm1,mm1
+       punpckldq mm2,mm2
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+
+       pfadd mm5, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm7, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx          /* (use mm7 as temp. storage for iz.) */
+       pfadd mm6, mm7
+       movq  [esp + ixO], mm5
+       movq  [esp + izO], mm6          /* 8-byte store spills into ixH, which is rewritten below */
+
+       /* hydrogens: x, y, z of H1 in the low halves, of H2 in the high halves */
+       movd  mm3, [eax + ebx*4 + 12]
+       movd  mm4, [eax + ebx*4 + 16]
+       movd  mm5, [eax + ebx*4 + 20]
+       punpckldq  mm3, [eax + ebx*4 + 24]
+       punpckldq  mm4, [eax + ebx*4 + 28]
+       punpckldq  mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+
+       pfadd mm0, mm3                  /* add the shift to both hydrogens */
+       pfadd mm1, mm4
+       pfadd mm2, mm5
+       movq [esp + ixH], mm0
+       movq [esp + iyH], mm1
+       movq [esp + izH], mm2
+
+       /* clear vctot and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       movq  [esp + fixO],  mm7        /* clears fixO and fiyO */
+       movq  [esp + fizO],  mm7        /* clears fizO (and the low half of fixH) */
+       movq  [esp + fixH],  mm7
+       movq  [esp + fiyH],  mm7
+       movq  [esp + fizH],  mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4  /* advance pointer */
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2                     /* byte offset of jjnr[nj0] */
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+.i3030_inner_loop:
+       /*
+        * One j molecule per iteration, handled as three passes: j O, j H1 and
+        * j H2 (at byte offsets 0, 12 and 24 from pos[3*jnr]).  Each pass
+        * evaluates the j site against the i oxygen (scalar, low half only)
+        * and against both i hydrogens (packed: H1 low, H2 high).
+        *
+        * Per interaction:
+        *   rinv  = 1/sqrt(rsq)          (pfrsqrt seed + pfrsqit1/pfrcpit2 refinement)
+        *   rt    = r*tabscale, n0 = integer part, eps = rt - n0
+        *   table point n0 = four consecutive floats; the last three are F, G, H
+        *   Fp    = F + G*eps + H*eps2
+        *   VV    = first + eps*Fp,   FF = Fp + G*eps + 2*H*eps2
+        *   vctot += qq*VV,  fscal = -qq*FF*tabscale*rinv
+        *
+        * Live across the whole body: esi = pos, edi = faction, eax = 3*jnr.
+        */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+       add   dword ptr [esp + innerjjnr], 4 /* advance pointer (explicit 32-bit operand) */
+
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr */
+
+       /* ---- interactions with j O ---- */
+       movq  mm0, [esi + eax*4]
+       movd  mm1, [esi + eax*4 + 8]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       pfsubr mm0, [esp + ixO]         /* dx,dy = iO - j */
+       pfsubr mm1, [esp + izO]         /* dz (low half) */
+
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm0                  /* low = dx2 + dy2 */
+       pfadd mm0, mm1          /* mm0=rsqO */
+
+       punpckldq mm2, mm2              /* (already spread above; kept as in the original) */
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+
+       pfrsqrt mm1,mm0                 /* seed for 1/sqrt(rsqO) */
+
+       movq mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2        /* mm1=invsqrt (iO - jO) */
+       pfmul mm0, mm1          /* mm0=r = rsq*invsqrt (iO - jO) */
+
+       pfmul mm0, [esp + tsc]          /* r*tabscale */
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2                      /* four floats per table point */
+
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm0  /* mm6 = Geps */
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOO] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOO] /* fijC=qq*FF */
+
+       /* update vctot directly, use mm3 for fscal sum. */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       movq mm3, mm7
+
+       /* change sign of fscal and multiply with rinv */
+       pxor mm0,mm0
+       pfsubr mm3, mm0                 /* mm3 = 0 - mm3 */
+       pfmul mm3, [esp + tsc]
+       pfmul mm3, mm1        /* mm3 is total fscal (for the oxygen) now */
+
+       /* Ready with the oxygen - potential is updated, fscal is in mm3. */
+       /* time for hydrogens! */
+
+       movq mm0, [esp + tmprsqH]
+
+       pfrsqrt mm1, mm0                /* seed for H1 (low half of rsqH) */
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0                /* seed for H2 */
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2        /* mm1=invsqrt */
+
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4            /* both table indices */
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]         /* index for H2 goes into the high halves */
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm0  /* mm6 = Geps */
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+
+       /* change sign of fijC and multiply by rinv */
+       pxor mm4,mm4
+       pfsub mm4, mm7
+       pfmul mm4, [esp + tsc]
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+
+       pfadd mm0, mm5          /* add up total force on j particle. */
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4]
+       movd mm3,  [edi + eax*4 + 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4], mm2
+       movd [edi + eax*4 +8], mm3
+
+       /* ---- interactions with j H1 ---- */
+
+       movq  mm0, [esi + eax*4 + 12]
+       movd  mm1, [esi + eax*4 + 20]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm1                  /* low = dx2 + dy2; high half is not used */
+       pfadd mm0, mm1          /* mm0=rsqO */
+
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+
+       pfrsqrt mm1,mm0
+
+       movq mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2        /* mm1=invsqrt */
+       pfmul mm0, mm1          /* mm0=r */
+
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm0  /* mm6 = Geps */
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+       /* update vctot  directly, force is moved to mm3 */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       pxor mm3, mm3
+       pfsub mm3, mm7                  /* change sign */
+       pfmul mm3, [esp + tsc]
+       pfmul mm3, mm1        /* mm3 is total fscal (for the oxygen) now */
+
+       movq mm0, [esp + tmprsqH]
+
+       pfrsqrt mm1, mm0
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2        /* mm1=invsqrt */
+
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+
+       pfmul mm6, mm0  /* mm6 = Geps */
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+
+       /* change sign of fijC and multiply by rinv */
+       pxor mm4,mm4
+       pfsub mm4, mm7
+       pfmul mm4, [esp + tsc]
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+
+       pfadd mm0, mm5          /* add up total force on j particle. */
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4 + 12]
+       movd mm3,  [edi + eax*4 + 20]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4 + 12], mm2
+       movd [edi + eax*4 + 20], mm3
+
+       /* ---- interactions with j H2 ---- */
+       movq  mm0, [esi + eax*4 + 24]
+       movd  mm1, [esi + eax*4 + 32]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1
+       pfmul mm1,mm1
+       pfacc mm0, mm1                  /* low = dx2 + dy2; high half is not used */
+       pfadd mm0, mm1          /* mm0=rsqO */
+
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+
+       pfrsqrt mm1,mm0
+
+       movq mm2,mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2        /* mm1=invsqrt */
+       pfmul mm0, mm1          /* mm0=r */
+
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm0  /* mm6 = Geps */
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+       /* update vctot directly, use mm3 for fscal sum. */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       pxor mm3,mm3
+       pfsub mm3, mm7                  /* change sign */
+       pfmul mm3, [esp + tsc]
+       pfmul mm3, mm1        /* mm3 is total fscal (for the oxygen) now */
+
+       movq mm0, [esp + tmprsqH]
+
+       pfrsqrt mm1, mm0
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+       pfrsqit1 mm1,mm0
+       pfrcpit2 mm1,mm2        /* mm1=invsqrt */
+
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+
+       pfmul mm6, mm0  /* mm6 = Geps */
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+
+       /* change sign of fijC and multiply by rinv */
+       pxor mm4,mm4
+       pfsub mm4, mm7
+       pfmul mm4, [esp + tsc]
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+
+       pfadd mm0, mm5          /* add up total force on j particle. */
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4 + 24]
+       movd mm3,  [edi + eax*4 + 32]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4 + 24], mm2
+       movd [edi + eax*4 + 32], mm3
+
+       /*  done  - one more? */
+       dec dword ptr [esp + innerk]
+       jz  .i3030_updateouterdata
+       jmp .i3030_inner_loop
+.i3030_updateouterdata:
+       /*
+        * Inner loop finished for this i molecule: add the accumulated i
+        * forces to faction[] (O, H1, H2) and to fshift[is3], add the summed
+        * potential to Vc[gid], then advance gid and count down nri.
+        * edi still holds the faction base set up before the inner loop.
+        */
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment iO force */
+       movd  mm7, [edi + ecx*4 + 8]
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]         /* 8-byte read; only the low half is stored */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       /* the H accumulators hold H1 in the low and H2 in the high half; */
+       /* regroup them into per-atom (x,y) pairs and z scalars            */
+       movq  mm0, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm1, [esp + fizH]
+       movq  mm2, mm0
+       punpckldq mm0, mm3      /* mm0(l)=fxH1, mm0(h)=fyH1 */
+       punpckhdq mm2, mm3      /* mm2(l)=fxH2, mm2(h)=fyH2 */
+       movq mm3, mm1
+       pswapd mm3,mm3
+       /* mm1 is fzH1 */
+       /* mm3 is fzH2 */
+
+       movq  mm6, [edi + ecx*4 + 12]       /* increment iH1 force */
+       movd  mm7, [edi + ecx*4 + 20]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       movq  [edi + ecx*4 + 12],  mm6
+       movd  [edi + ecx*4 + 20],  mm7
+
+       movq  mm6, [edi + ecx*4 + 24]       /* increment iH2 force */
+       movd  mm7, [edi + ecx*4 + 32]
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [edi + ecx*4 + 24],  mm6
+       movd  [edi + ecx*4 + 32],  mm7
+
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       /* fshift[is3] += fO + fH1 + fH2 */
+       movq  mm6, [ebx + edx*4]
+       movd  mm7, [ebx + edx*4 + 8]
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid], 4  /* advance pointer (explicit 32-bit operand) */
+
+       movq  mm7, [esp + vctot]
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       /* finish if last */
+       dec dword ptr [ebp + nri]
+       jz  .i3030_end
+       /* not last, iterate once more! */
+       jmp .i3030_outer
+.i3030_end:
+       femms
+       add esp, 188
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+
+/*
+ * inl3100_3dnow: 3DNow! nonbonded inner loop combining a tabulated Coulomb
+ * interaction (VFtab, four table values per point) with an analytical
+ * Lennard-Jones interaction (c6/c12 pairs taken from nbfp).
+ *
+ * Arguments are read from the caller's stack at [ebp + 8] onwards, in the
+ * order given by the first .equ list below.  Several of them (nri, iinr,
+ * jindex, shift, gid) are updated in place on the stack while the outer
+ * loop runs.
+ *
+ * For every i particle the routine walks its j neighbours two at a time
+ * (one neighbour per half of an MMX register), with a single-particle tail
+ * when the neighbour count is odd.  Forces are accumulated into faction[]
+ * and fshift[], energies into Vc[gid] and Vnb[gid].
+ *
+ * Precondition: nri >= 1 (the outer loop body runs before nri is tested).
+ * eax, ebx, ecx, edx, esi and edi are all saved and restored, so nothing is
+ * returned in a register.  femms is issued on entry and on exit.
+ *
+ * Instructions that combine a memory operand with an immediate carry an
+ * explicit "dword ptr": no register operand is present to fix the size.
+ */
+.globl inl3100_3dnow
+       .type inl3100_3dnow,@function
+inl3100_3dnow: 
+       /* offsets of the arguments relative to ebp */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+       /* stack offsets for local variables (relative to esp, 148 bytes in total) */
+.equ           is3,             0 
+.equ           ii3,             4
+.equ           ix,              8
+.equ           iy,              12
+.equ           iz,              16
+.equ           iq,              20 /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,           28 /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,          36 /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,              44 /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,             52 /* repeated (64bit) to fill 3dnow reg */
+.equ           six,             60 /* repeated (64bit) to fill 3dnow reg */
+.equ           twelve,          68 /* repeated (64bit) to fill 3dnow reg */
+.equ           two,             76 /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,              84 /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,             92 /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,            100
+.equ           innerjjnr,       104
+.equ           innerk,          108    
+.equ           fix,             112
+.equ           fiy,             116
+.equ           fiz,             120
+.equ           dx1,             124
+.equ           dy1,             128
+.equ           dz1,             132
+.equ           dx2,             136
+.equ           dy2,             140
+.equ           dz2,             144                                            
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 148            /* local stack space */
+       femms
+       /* move data to local stack */ 
+       movq  mm0, [mm_two]
+       movq  mm1, [mm_six]
+       movq  mm2, [mm_twelve]
+       movd  mm3, [ebp + tabscale]
+       movq  [esp + two],    mm0
+       movq  [esp + six],    mm1
+       movq  [esp + twelve],    mm2
+       punpckldq mm3,mm3       /* tabscale in both halves */
+       movq  [esp + tsc], mm3  
+       /* assume we have at least one i particle - start directly */   
+.i3100_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       pfmul mm2, [ebp + facel]        /* only the low half matters, see next line */
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        /* edx = type[ii] */
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx         /* ntia = 2*ntype*type[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       
+       pfadd mm0, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm3, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx
+       pfadd mm1, mm3
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+                               
+       /* clear total potential and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot],  mm7
+       movq  [esp + vnbtot], mm7
+       movq  [esp + fix],    mm7
+       movd  [esp + fiz],    mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  2
+       mov   [esp + innerk], edx        /* innerk = remaining atoms - 2 (mov keeps the flags of sub) */
+       jge   .i3100_unroll_loop         /* at least two atoms left: take the paired loop */
+       jmp   .i3100_finish_inner
+.i3100_unroll_loop:
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   dword ptr [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+       
+       mov ecx, [ebp + charge]        /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+        punpckldq mm3, [ecx + ebx*4]     /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */ 
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */                
+       movq mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */        
+       movq mm6,mm5                    
+       punpckldq mm5,mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6,mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]           /* esi was reused for nbfp above - restore pos base */
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs. */
+        movq mm2,mm0                   /* amd 3dnow N-R iteration to get full precision. */
+       pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       
+       pfmul mm4, mm0                  /* r = rsq * invsqrt */
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1           /* integer table index n0 for both pairs */
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+       
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2              /* four table floats per point */
+       /* coulomb table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]         /* Y */
+       movd mm5, [edx + ecx*4 + 4]     /* F */
+       movd mm6, [edx + ecx*4 + 8]     /* G */
+       movd mm7, [edx + ecx*4 + 12]    /* H */
+       mov ecx, [esp + n1 + 4]         /* table index of the second pair */
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       movq mm1, mm0
+       pfmul mm1,mm1   /* mm1=invsq */
+       movq mm2, mm1
+       pfmul mm2,mm1
+       pfmul mm2,mm1   /* mm2=rinvsix */
+       movq  mm1,mm2
+       pfmul mm1,mm1   /* mm1=rinvtwelve */
+       
+       pfmul mm3, [esp + tsc]  /* mm3 = fijC*tabscale */
+       
+       pfmul mm1, [esp + c12]  /* mm1 = vnb12 */
+
+       pfmul mm2, [esp + c6]   /* mm2 = vnb6 */
+
+       movq mm4, mm1
+       pfsub mm4, mm2  /* mm4 = vnb12-vnb6 */
+
+       pfmul mm2, [esp + six]
+       pfmul mm1, [esp + twelve]
+
+       pfsub mm1, mm2
+       pfmul mm1, mm0  /* mm1= (12*vnb12-6*vnb6)*rinv */
+
+       pfsub mm1, mm3  /* subtract the tabulated coulomb part */
+
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch the stored dr vectors (stack locals) */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0       /* mm0 = fscal of pair 1 in both halves */
+       punpckhdq mm1,mm1       /* mm1 = fscal of pair 2 in both halves */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm4, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm4       /* store the sum */      
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  2
+       jl    .i3100_finish_inner
+       jmp   .i3100_unroll_loop
+.i3100_finish_inner:   
+       /* innerk is -1 here if one atom is left over, -2 if none: test the low bit */
+       and dword ptr [esp + innerk],  1
+       jnz  .i3100_single_inner
+       jmp  .i3100_updateouterdata             
+.i3100_single_inner:
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]
+       pfmul mm3, mm5          /* mm3=qq */
+
+       mov esi, [ebp + nbfp]
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5
+
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm4=rsq */
+       
+        pfrsqrt mm0,mm4
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm0=invsqrt */
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+       
+       /* at this point mm5 contains vcoul and mm3 fijC */
+
+       movq mm1, mm0
+       pfmul mm1,mm1   /* mm1=invsq */
+       movq mm2, mm1
+       pfmul mm2,mm1
+       pfmul mm2,mm1   /* mm2=rinvsix */
+       movq  mm1,mm2
+       pfmul mm1,mm1   /* mm1=rinvtwelve */
+       
+       pfmul mm3, [esp + tsc]
+       
+       pfmul mm1, [esp + c12]
+
+       pfmul mm2, [esp + c6]
+
+       movq mm4, mm1
+       pfsub mm4, mm2  /* mm4 = vnb12-vnb6 */
+
+       pfmul mm2, [esp + six]
+       pfmul mm1, [esp + twelve]
+
+       pfsub mm1, mm2
+       pfmul mm1, mm0  /* mm1= (12*vnb12-6*vnb6)*rinv */
+
+       pfsub mm1, mm3
+
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm4, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm4       /* store the sum */      
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i3100_updateouterdata:        
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]         /* only the low half is stored below */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix] 
+       pfadd mm7, [esp + fiz] 
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+ 
+       movq  mm7, [esp + vnbtot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vnb] 
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnb[gid] */
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3100_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx          /* write the decremented count back */
+       jmp .i3100_outer
+.i3100_end:
+       femms
+       add esp, 148                  /* release local stack space */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+
+
+
+
+.globl inl3110_3dnow
+       .type inl3110_3dnow,@function
+inl3110_3dnow: 
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+.equ           nsatoms,        84      
+       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           shX,             8
+.equ           shY,            12 
+.equ           shZ,            16      
+.equ           ix,             20
+.equ           iy,             24
+.equ           iz,             28      
+.equ           iq,             32  /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          40  /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,         48  /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             56  /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            64  /* repeated (64bit) to fill 3dnow reg */
+.equ           six,            72  /* repeated (64bit) to fill 3dnow reg */
+.equ           twelve,         80  /* repeated (64bit) to fill 3dnow reg */
+.equ           two,            88  /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,             96  /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,            104 /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,           112
+.equ           innerjjnr0,     116
+.equ           innerk0,        120     
+.equ           innerjjnr,      124
+.equ           innerk,         128     
+.equ           fix,            132
+.equ           fiy,            136
+.equ           fiz,            140
+.equ           dx1,            144
+.equ           dy1,            148
+.equ           dz1,            152
+.equ           dx2,            156
+.equ           dy2,            160
+.equ           dz2,            164                                                             
+.equ           nsvdwc,         168
+.equ           nscoul,         172
+.equ           nsvdw,          176
+.equ           solnr,          180             
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 184            /* local stack space */
+       femms
+       movq  mm0, [mm_six]
+       movq  mm1, [mm_twelve]
+       movq  [esp + six],    mm0
+       movq  [esp + twelve], mm1
+       movq  mm2, [mm_two]
+       movd  mm3, [ebp + tabscale]
+       movq  [esp + two],    mm2
+       punpckldq mm3,mm3
+       movq  [esp + tsc], mm3  
+       /* assume we have at least one i particle - start directly */           
+.i3110_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+       movq  [esp + shX], mm0
+       movd  [esp + shZ], mm1
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   eax, [ebp + nsatoms]
+       add   [ebp + nsatoms],  12
+       mov   ecx, [eax]        
+       mov   edx, [eax + 4]
+       mov   eax, [eax + 8]    
+       sub   ecx, eax
+       sub   eax, edx
+       
+       mov   [esp + nsvdwc], edx
+       mov   [esp + nscoul], eax
+       mov   [esp + nsvdw], ecx
+               
+       /* clear potential */
+       pxor  mm7,mm7
+       movq  [esp + vctot],  mm7
+       movq  [esp + vnbtot], mm7
+       mov   [esp + solnr],  ebx
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       
+       mov   ecx, [esp + nsvdwc]
+       cmp   ecx,  0
+       jnz   .i3110_mno_vdwc
+       jmp   .i3110_testcoul
+.i3110_mno_vdwc:
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       pfmul mm2, [ebp + facel]
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx 
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+       
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3110_unroll_vdwc_loop
+       jmp   .i3110_finish_vdwc_inner
+.i3110_unroll_vdwc_loop:       
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+       
+       mov ecx, [ebp + charge]        /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+        punpckldq mm3, [ecx + ebx*4]     /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */ 
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */                
+       movq mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */        
+       movq mm6,mm5                    
+       punpckldq mm5,mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6,mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs. */
+        movq mm2,mm0                   /* amd 3dnow N-R iteration to get full precision. */
+       pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+       
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* coulomb table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       movq mm1, mm0
+       pfmul mm1,mm1   /* mm1=invsq */
+       movq mm2, mm1
+       pfmul mm2,mm1
+       pfmul mm2,mm1   /* mm2=rinvsix */
+       movq  mm1,mm2
+       pfmul mm1,mm1   /* mm1=rinvtwelve */
+       
+       pfmul mm3, [esp + tsc]
+       
+       pfmul mm1, [esp + c12]
+
+       pfmul mm2, [esp + c6]
+
+       movq mm4, mm1
+       pfsub mm4, mm2  /* mm4 = vnb12-vnb6 */
+
+       pfmul mm2, [esp + six]
+       pfmul mm1, [esp + twelve]
+
+       pfsub mm1, mm2
+       pfmul mm1, mm0  /* mm1= (12*vnb12-6*vnb6)*rinv11 */
+
+       pfsub mm1, mm3
+
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0
+       punpckhdq mm1,mm1
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm4, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm4       /* store the sum */      
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7     
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  2
+       jl    .i3110_finish_vdwc_inner
+       jmp   .i3110_unroll_vdwc_loop
+.i3110_finish_vdwc_inner:
+       /* The unrolled loop exits when innerk has gone negative after
+        * subtracting 2; an odd value means one unpaired j particle is left.
+        * The operand size is written out (as for the dec/inc instructions in
+        * this file) so the assembler does not have to guess the width of a
+        * memory/immediate operation.
+        */
+       and dword ptr [esp + innerk],  1
+       jnz  .i3110_single_vdwc_inner
+       jmp  .i3110_updateouterdata_vdwc
+.i3110_single_vdwc_inner:      
+       /* A single j particle iteration (the odd one left over after the
+        * unrolled loop). Only the low 32 bits of the MMX registers carry
+        * meaningful data here. Computes tabulated coulomb plus
+        * c12/r^12 - c6/r^6 for one pair, then updates vctot, vnbtot,
+        * the i force accumulators and the j force in memory.
+        * NOTE(review): edi is used as the force array base; it is loaded
+        * outside this block - confirm it still holds faction here.
+        */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]
+       pfmul mm3, mm5          /* mm3=qq */
+
+       mov esi, [ebp + nbfp]
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5            /* upper half is zero after movd */
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5
+
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr, index of x coordinate */
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0                 /* dx,dy = i - j */
+       pfsubr mm5, mm1                 /* dz = i - j */
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm4=rsq */
+
+        pfrsqrt mm0,mm4                /* seed for 1/sqrt(rsq) */
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm0=invsqrt */
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2              /* four floats per table point */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       movq mm1, mm0
+       pfmul mm1,mm1   /* mm1=invsq */
+       movq mm2, mm1
+       pfmul mm2,mm1
+       pfmul mm2,mm1   /* mm2=rinvsix */
+       movq  mm1,mm2
+       pfmul mm1,mm1   /* mm1=rinvtwelve */
+
+       pfmul mm3, [esp + tsc]  /* mm3 = fijC*tabscale */
+
+       pfmul mm1, [esp + c12]  /* mm1 = vnb12 */
+
+       pfmul mm2, [esp + c6]   /* mm2 = vnb6 */
+
+       movq mm4, mm1
+       pfsub mm4, mm2  /* mm4 = vnb12-vnb6 */
+
+       pfmul mm2, [esp + six]
+       pfmul mm1, [esp + twelve]
+
+       pfsub mm1, mm2
+       pfmul mm1, mm0  /* mm1= (12*vnb12-6*vnb6)*rinv */
+
+       pfsub mm1, mm3  /* subtract the tabulated coulomb part */
+
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm4, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm4       /* store the sum */      
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i3110_updateouterdata_vdwc:   
+       /* Add the force accumulated in fix/fiz to the i atom entry
+        * (index ii3) of the array at edi and to the fshift entry (index is3),
+        * then count down nsvdwc.
+        * NOTE(review): edi is loaded outside this block - presumably the
+        * faction base; confirm.
+        */
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]         /* only the low half is stored below */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno while nsvdwc is nonzero after the decrement */
+       dec dword ptr [esp + nsvdwc]
+       jz  .i3110_testcoul
+       jmp .i3110_mno_vdwc
+.i3110_testcoul:
+       /* run the coulomb-only section when the nscoul counter is nonzero,
+        * otherwise go on to the vdw-only test */
+       mov  ecx, [esp + nscoul]
+       test ecx, ecx
+       jnz  .i3110_mno_coul
+       jmp  .i3110_testvdw
+.i3110_mno_coul:
+       /* Set up one i atom for the coulomb-only inner loop: take the atom
+        * index from solnr (and advance solnr), compute iq and the shifted
+        * i coordinates, clear the force accumulators and reset the
+        * neighbour list pointer and counter.
+        */
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       pfmul mm2, [ebp + facel]        /* 64-bit operand; only the low product is kept */
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* innerloop count minus 2; mov keeps the flags from sub */
+       jge   .i3110_unroll_coul_loop   /* at least two j particles left */
+       jmp   .i3110_finish_coul_inner
+.i3110_unroll_coul_loop:
+       /* Paired innerloop: two j particles per iteration, one in each half
+        * of the 64-bit MMX registers. Tabulated coulomb only.
+        * NOTE(review): edi is used as the force array base; it is loaded
+        * outside this block - confirm it holds faction here.
+        */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       /* explicit dword size: a memory/immediate add is otherwise ambiguous */
+       add   dword ptr [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+
+       mov ecx, [ebp + charge]        /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+       punpckldq mm3, [ecx + ebx*4]     /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4,mm0                   /* dr = ir - jr */
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */
+       pfmul mm5,mm5
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */
+       movd  mm7, [esi + ebx*4 + 8]
+
+       pfsubr mm6,mm0                   /* dr = ir - jr */
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+       pfrsqrt mm0, mm4                 /* lookup inverse square root seed */
+       pfrsqrt mm1, mm6
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs. */
+       movq mm2,mm0                    /* amd 3dnow N-R iteration to get full precision. */
+       pfmul mm0,mm0
+       pfrsqit1 mm0,mm4
+       pfrcpit2 mm0,mm2
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movq [esp + n1], mm4    /* both table indices, 4 bytes apart */
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2              /* four floats per table point */
+       /* coulomb table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4] /* index for the second particle */
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */
+
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */
+
+       /* change sign of mm3 */
+       pxor mm1,mm1
+       pfsub mm1, mm3
+       pfmul mm1, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0       /* fscal of 1st particle in both halves */
+       punpckhdq mm1,mm1       /* fscal of 2nd particle in both halves */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+
+       /* should we do one more iteration? */
+       /* explicit dword size: the signed jl below needs the 32-bit result */
+       sub   dword ptr [esp + innerk],  2
+       jl    .i3110_finish_coul_inner
+       jmp   .i3110_unroll_coul_loop
+.i3110_finish_coul_inner:
+       /* innerk is negative here (innerk0 minus a multiple of 2); an odd
+        * value means one unpaired j particle is left.
+        * The operand size is written out (as for the dec/inc instructions in
+        * this file) so the assembler does not have to guess the width of a
+        * memory/immediate operation.
+        */
+       and dword ptr [esp + innerk],  1
+       jnz  .i3110_single_coul_inner
+       jmp  .i3110_updateouterdata_coul
+.i3110_single_coul_inner:      
+       /* A single j particle iteration (the odd one left over after the
+        * unrolled loop); tabulated coulomb only. Only the low 32 bits of the
+        * MMX registers carry meaningful data here.
+        * NOTE(review): edi is used as the force array base; it is loaded
+        * outside this block - confirm it holds faction here.
+        */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]
+       pfmul mm3, mm5          /* mm3=qq */
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr, index of x coordinate */
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0                 /* dx,dy = i - j */
+       pfsubr mm5, mm1                 /* dz = i - j */
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm4=rsq */
+
+        pfrsqrt mm0,mm4                /* seed for 1/sqrt(rsq) */
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm0=invsqrt */
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2              /* four floats per table point */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm0, [esp + tsc]  /* here tabscale is folded into invsqrt */
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i3110_updateouterdata_coul:   
+       /* Add the force accumulated in fix/fiz to the i atom entry
+        * (index ii3) of the array at edi and to the fshift entry (index is3),
+        * then count down nscoul.
+        * NOTE(review): edi is loaded outside this block - presumably the
+        * faction base; confirm.
+        */
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]         /* only the low half is stored below */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno while nscoul is nonzero after the decrement */
+       dec dword ptr [esp + nscoul]
+       jz  .i3110_testvdw
+       jmp .i3110_mno_coul
+.i3110_testvdw:
+       /* run the vdw-only section when the nsvdw counter is nonzero,
+        * otherwise finish this i molecule */
+       mov  ecx, [esp + nsvdw]
+       test ecx, ecx
+       jnz  .i3110_mno_vdw
+       jmp  .i3110_last_mno
+.i3110_mno_vdw:
+       /* Set up one i atom for the vdw-only inner loop: take the atom index
+        * from solnr (and advance solnr), compute ntia and the shifted
+        * i coordinates, clear the force accumulators and reset the
+        * neighbour list pointer and counter.
+        */
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        /* edx = type[ii] */
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx         /* ntia = 2*ntype*type[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* innerloop count minus 2; mov keeps the flags from sub */
+       jge   .i3110_unroll_vdw_loop    /* at least two j particles left */
+       jmp   .i3110_finish_vdw_inner
+.i3110_unroll_vdw_loop:
+       /* Paired innerloop: two j particles per iteration, one in each half
+        * of the 64-bit MMX registers. c12/r^12 - c6/r^6 only, no coulomb.
+        * NOTE(review): edi is used as the force array base; it is loaded
+        * outside this block - confirm it holds faction here.
+        */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       /* explicit dword size: a memory/immediate add is otherwise ambiguous */
+       add   dword ptr [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */
+       movq mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */
+       movq mm6,mm5
+       punpckldq mm5,mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6,mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4,mm0                   /* dr = ir - jr */
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */
+       pfmul mm5,mm5
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */
+       movd  mm7, [esi + ebx*4 + 8]
+
+       pfsubr mm6,mm0                   /* dr = ir - jr */
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+       pfrcp mm0, mm4                   /* lookup reciprocal seed */
+       pfrcp mm1, mm6
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs. */
+                                       /* amd 3dnow N-R iteration to get full precision. */
+       pfrcpit1 mm4,mm0
+       pfrcpit2 mm4,mm0
+       /* mm4 now contains invsq,
+        * do potential and fscal
+        */
+       movq  mm0, mm4
+       pfmul mm4, mm0
+       pfmul mm4, mm0                  /* mm4=rinvsix */
+       movq  mm5, mm4
+       pfmul mm5, mm5                  /* mm5=rinvtwelve */
+
+       pfmul mm5, [esp + c12]          /* mm5 = vnb12 */
+       pfmul mm4, [esp + c6]           /* mm4 = vnb6 */
+       movq mm6, mm5   /* mm6 is vnb12-vnb6 */
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       pfsub mm5,mm4                   /* 12*vnb12 - 6*vnb6 */
+       pfmul mm0, mm5        /* mm0 is total fscal now */
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0       /* fscal of 1st particle in both halves */
+       punpckhdq mm1,mm1       /* fscal of 2nd particle in both halves */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm6       /* store the sum */
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+
+       /* should we do one more iteration? */
+       /* explicit dword size: the signed jl below needs the 32-bit result */
+       sub   dword ptr [esp + innerk],  2
+       jl    .i3110_finish_vdw_inner
+       jmp   .i3110_unroll_vdw_loop
+.i3110_finish_vdw_inner:
+       /* innerk is negative here (innerk0 minus a multiple of 2); an odd
+        * value means one unpaired j particle is left.
+        * The operand size is written out (as for the dec/inc instructions in
+        * this file) so the assembler does not have to guess the width of a
+        * memory/immediate operation.
+        */
+       and dword ptr [esp + innerk],  1
+       jnz  .i3110_single_vdw_inner
+       jmp  .i3110_updateouterdata_vdw
+.i3110_single_vdw_inner:       
+       /* A single j particle iteration (the odd one left over after the
+        * unrolled loop); c12/r^12 - c6/r^6 only. Only the low 32 bits of the
+        * MMX registers carry meaningful data here.
+        * NOTE(review): edi is used as the force array base; it is loaded
+        * outside this block - confirm it holds faction here.
+        */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov esi, [ebp + nbfp]
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5            /* upper half is zero after movd */
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr, index of x coordinate */
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0                 /* dx,dy = i - j */
+       pfsubr mm5, mm1                 /* dz = i - j */
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm4=rsq */
+
+        pfrcp mm0,mm4                  /* reciprocal seed */
+        pfrcpit1 mm4,mm0                               
+        pfrcpit2 mm4,mm0       /* mm4=invsq */ 
+       /* calculate potentials and scalar force */
+       movq  mm0, mm4
+
+       pfmul mm4, mm0
+       pfmul mm4, mm0                  /* mm4=rinvsix */
+       movq  mm5, mm4  
+       pfmul mm5, mm5                  /* mm5=rinvtwelve */
+
+       pfmul mm5, [esp + c12]          /* mm5 = vnb12 */
+       pfmul mm4, [esp + c6]           /* mm4 = vnb6 */
+       movq mm6, mm5   /* mm6 is vnb12-vnb6 */ 
+       pfsub mm6, mm4
+
+       pfmul mm4, [esp + six]
+
+       pfmul mm5, [esp + twelve]
+       pfsub mm5, mm4                  /* 12*vnb12 - 6*vnb6 */
+       pfmul mm0, mm5        /* mm0 is total fscal now */
+
+       /* update vnbtot */
+       pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm6       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i3110_updateouterdata_vdw:    
+       /* Add the force accumulated in fix/fiz to the i atom entry
+        * (index ii3) of the array at edi and to the fshift entry (index is3),
+        * then count down nsvdw.
+        * NOTE(review): edi is loaded outside this block - presumably the
+        * faction base; confirm.
+        */
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]         /* only the low half is stored below */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno while nsvdw is nonzero after the decrement */
+       dec dword ptr [esp + nsvdw]
+       jz  .i3110_last_mno
+       jmp .i3110_mno_vdw
+       
+.i3110_last_mno:
+       /* All atoms of this i entry are done: fold the two halves of the
+        * vctot and vnbtot accumulators into Vc[gid] and Vnb[gid], then
+        * either start the next outer iteration or leave.
+        */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       /* explicit dword size: a memory/immediate add is otherwise ambiguous */
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       movq  mm7, [esp + vnbtot]
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4]
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnb[gid] */
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3110_end              /* short jump: the target follows directly */
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i3110_outer
+.i3110_end:
+       /* Function epilogue: leave 3DNow!/MMX state, release the local stack
+        * area and restore the saved registers.
+        * NOTE(review): the 184 and the pop order must mirror the prologue,
+        * which is outside this block - confirm they match.
+        */
+       femms                   /* fast exit from MMX state before returning */
+       add esp, 184            /* drop local stack space */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave                   /* restores esp and ebp */
+       ret
+
+
+       
+
+.globl inl3120_3dnow
+       .type inl3120_3dnow,@function
+inl3120_3dnow: 
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+                       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ixO,             8
+.equ           iyO,            12
+.equ           izO,            16      
+.equ           ixH,            20  /* repeated (64bit) to fill 3dnow reg */
+.equ           iyH,            28  /* repeated (64bit) to fill 3dnow reg */
+.equ           izH,            36  /* repeated (64bit) to fill 3dnow reg */
+.equ           iqO,            44  /* repeated (64bit) to fill 3dnow reg */
+.equ           iqH,            52  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqO,            60  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqH,            68  /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          76  /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,         84  /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             92  /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            100 /* repeated (64bit) to fill 3dnow reg */
+.equ           six,            108 /* repeated (64bit) to fill 3dnow reg */
+.equ           twelve,         116 /* repeated (64bit) to fill 3dnow reg */
+.equ           two,            124 /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,             132 /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,            140 /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,           148 /* repeated (64bit) to fill 3dnow reg */
+.equ           innerjjnr,      156
+.equ           innerk,         160     
+.equ           fixO,           164
+.equ           fiyO,           168
+.equ           fizO,           172
+.equ           fixH,           176 /* repeated (64bit) to fill 3dnow reg */
+.equ           fiyH,           184 /* repeated (64bit) to fill 3dnow reg */
+.equ           fizH,           192 /* repeated (64bit) to fill 3dnow reg */
+.equ           dxO,            200
+.equ           dyO,            204
+.equ           dzO,            208
+.equ           dxH,            212 /* repeated (64bit) to fill 3dnow reg */
+.equ           dyH,            220 /* repeated (64bit) to fill 3dnow reg */
+.equ           dzH,            228 /* repeated (64bit) to fill 3dnow reg */
+.equ           tmprsqH,        236 /* repeated (64bit) to fill 3dnow reg */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 244            /* local stack space */
+       femms
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm1, [ebp + facel]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[i.i0] */
+       pfmul mm2, mm1
+       movq  [esp + iqO], mm2          /* iqO = facel*charge[ii] */
+       
+       movd  mm2, [edx + ebx*4 + 4]    /* mm2=charge[i.i0+1] */
+       pfmul mm2, mm1
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iqH], mm2          /* iqH = facel*charge[i.i0+1] */
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]
+       shl   edx, 1
+       mov   ecx, edx                  
+       imul  ecx, [ebp + ntype]      /* ecx = ntia = 2*ntype*type[i.i0] */ 
+       mov   [esp + ntia], ecx
+               
+       movq  mm3, [mm_two]
+       movq  mm4, [mm_six]
+       movq  mm5, [mm_twelve]
+       movq  mm6, [ebp + tabscale]
+       punpckldq mm6,mm6               /* spread to both halves */
+       movq  [esp + two], mm3
+       movq  [esp + six], mm4
+       movq  [esp + twelve], mm5
+       movq  [esp + tsc], mm6        
+       /* assume we have at least one i particle - start directly */   
+.i3120_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm5, [eax + ebx*4]        /* move shX/shY to mm5 and shZ to mm6. */
+       movd  mm6, [eax + ebx*4 + 8]
+       movq  mm0, mm5
+       movq  mm1, mm5
+       movq  mm2, mm6
+       punpckldq mm0,mm0               /* also expand shX,Y,Z in mm0--mm2. */
+       punpckhdq mm1,mm1
+       punpckldq mm2,mm2               
+       
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       
+       pfadd mm5, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm7, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx          /* (use mm7 as temp. storage for iz.) */
+       pfadd mm6, mm7
+       movq  [esp + ixO], mm5  
+       movq  [esp + izO], mm6
+
+       movd  mm3, [eax + ebx*4 + 12]
+       movd  mm4, [eax + ebx*4 + 16]
+       movd  mm5, [eax + ebx*4 + 20]
+       punpckldq  mm3, [eax + ebx*4 + 24]
+       punpckldq  mm4, [eax + ebx*4 + 28]
+       punpckldq  mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+       
+       pfadd mm0, mm3
+       pfadd mm1, mm4
+       pfadd mm2, mm5          
+       movq [esp + ixH], mm0   
+       movq [esp + iyH], mm1   
+       movq [esp + izH], mm2   
+                                       
+       /* clear vctot and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       movq  [esp + vnbtot], mm7
+       movq  [esp + fixO],   mm7
+       movd  [esp + fizO],   mm7
+       movq  [esp + fixH],   mm7
+       movq  [esp + fiyH],   mm7
+       movq  [esp + fizH],   mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   [esp + innerk], edx        
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+.i3120_inner_loop:     
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+        add   [esp + innerjjnr],  4 /* advance pointer */
+       prefetch [ecx + 16]        /* prefetch data - trial and error says 16 is best */
+
+       mov ecx, [ebp + charge]
+       movd mm7, [ecx + eax*4]
+       punpckldq mm7,mm7
+       movq mm6,mm7
+       pfmul mm6, [esp + iqO]
+       pfmul mm7, [esp + iqH]  /* mm6=qqO, mm7=qqH */
+       movd [esp + qqO], mm6
+       movq [esp + qqH], mm7
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr] */
+       mov ecx, [ebp + nbfp]
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [ecx + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5
+       movd mm5, [ecx + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5   
+                       
+       lea   eax, [eax + eax*2]
+       
+       movq  mm0, [esi + eax*4]
+       movd  mm1, [esi + eax*4 + 8]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+       
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+               
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1  
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1          /* mm0=rsqO */
+       
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+       
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+       
+        pfrsqrt mm1,mm0
+
+        movq mm2,mm1
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+
+       pfmul mm0, mm1          /* mm0=r */
+
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqO]  /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqO]  /* fijC=qq*FF */
+       /* update vctot directly, use mm3 for fscal sum. */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+
+       movq mm3, mm7
+       pfmul mm3, [esp + tsc]
+       
+       /* nontabulated LJ - mm1 is invsqrt. - keep mm1! */
+       movq mm0, mm1
+       pfmul mm0, mm0          /* mm0 is invsq */
+       movq mm2, mm0
+       pfmul mm2, mm0
+       pfmul mm2, mm0          /* mm2 = rinvsix */
+       movq mm4, mm2
+       pfmul mm4, mm4          /* mm4=rinvtwelve */
+
+       pfmul mm4, [esp + c12]
+       pfmul mm2, [esp + c6]
+       movq mm5, mm4
+       pfsub mm5, mm2          /* mm5=vnb12-vnb6 */
+
+       pfmul mm2, [esp + six]
+       pfmul mm4, [esp + twelve]
+       pfsub mm4, mm2
+       pfmul mm4, mm1        /* mm4=(12*vnb12-6*vnb6)*rinv11 */
+
+       pfsubr mm3, mm4 
+       pfmul mm3, mm1        /* mm3 is total fscal (for the oxygen) now */
+       
+       /* update vnbtot */ 
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+       
+       /* Ready with the oxygen - potential is updated, fscal is in mm3. */
+       /* now do the two hydrogens. */
+       movq mm0, [esp + tmprsqH] /* mm0=rsqH */
+
+       pfrsqrt mm1, mm0
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqH]  /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqH]  /* fijC=qq*FF */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       
+       /* change sign of fijC and multiply by rinv */
+        pxor mm4,mm4
+       pfsub mm4, mm7
+       pfmul mm4, [esp + tsc]  
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */  
+       
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+       
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+       
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+       
+       pfadd mm0, mm5          /* add up total force on j particle. */ 
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4]
+       movd mm3,  [edi + eax*4 + 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4], mm2
+       movd [edi + eax*4 +8], mm3
+       
+       /*  done  - one more? */
+       dec dword ptr [esp + innerk]
+       jz  .i3120_updateouterdata
+       jmp .i3120_inner_loop
+.i3120_updateouterdata:        
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment iO force */ 
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       movq  mm0, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm1, [esp + fizH]
+       movq  mm2, mm0
+       punpckldq mm0, mm3      /* mm0(l)=fxH1, mm0(h)=fyH1 */
+       punpckhdq mm2, mm3      /* mm2(l)=fxH2, mm2(h)=fyH2 */
+       movq mm3, mm1
+       pswapd mm3,mm3          
+       /* mm1 is fzH1 */
+       /* mm3 is fzH2 */
+       
+       movq  mm6, [edi + ecx*4 + 12]       /* increment iH1 force */ 
+       movd  mm7, [edi + ecx*4 + 20]   
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       movq  [edi + ecx*4 + 12],  mm6
+       movd  [edi + ecx*4 + 20],  mm7
+       
+       movq  mm6, [edi + ecx*4 + 24]       /* increment iH2 force */
+       movd  mm7, [edi + ecx*4 + 32]   
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [edi + ecx*4 + 24],  mm6
+       movd  [edi + ecx*4 + 32],  mm7
+
+       
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+       
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       movq  mm7, [esp + vnbtot]     
+       pfacc mm7,mm7                 /* same for Vnb */
+       
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnb[gid] */
+       /* finish if last */
+       dec dword ptr [ebp + nri]
+       jz  .i3120_end
+       /* not last, iterate once more! */
+       jmp .i3120_outer
+.i3120_end:
+       femms
+       add esp, 244
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+
+
+.globl inl3130_3dnow
+       .type inl3130_3dnow,@function
+inl3130_3dnow: 
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+                       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ixO,             8
+.equ           iyO,            12
+.equ           izO,            16      
+.equ           ixH,            20  /* repeated (64bit) to fill 3dnow reg */
+.equ           iyH,            28  /* repeated (64bit) to fill 3dnow reg */
+.equ           izH,            36  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqOO,           44  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqOH,           52  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqHH,           60  /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             68  /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            76  /* repeated (64bit) to fill 3dnow reg */
+.equ           six,            84  /* repeated (64bit) to fill 3dnow reg */
+.equ           twelve,         92  /* repeated (64bit) to fill 3dnow reg */
+.equ           two,            100 /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,             108 /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,            116 /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          124 /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,         132 /* repeated (64bit) to fill 3dnow reg */
+.equ           innerjjnr,      140
+.equ           innerk,         144     
+.equ           fixO,           148
+.equ           fiyO,           152
+.equ           fizO,           156
+.equ           fixH,           160 /* repeated (64bit) to fill 3dnow reg */
+.equ           fiyH,           168 /* repeated (64bit) to fill 3dnow reg */
+.equ           fizH,           176 /* repeated (64bit) to fill 3dnow reg */
+.equ           dxO,            184
+.equ           dyO,            188
+.equ           dzO,            192
+.equ           dxH,            200 /* repeated (64bit) to fill 3dnow reg */
+.equ           dyH,            208 /* repeated (64bit) to fill 3dnow reg */
+.equ           dzH,            216 /* repeated (64bit) to fill 3dnow reg */
+.equ           tmprsqH,        224 /* repeated (64bit) to fill 3dnow reg */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 232            /* local stack space */
+       femms
+       /* assume we have at least one i particle - start directly */   
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm1, [ebp + facel]        /* mm1=facel */
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[i.i0] (O) */
+       movd  mm3, [edx + ebx*4 + 4]    /* mm3=charge[i.i0+1] (H) */
+       movq  mm4, mm2  
+       pfmul mm4, mm1
+       movq  mm6, mm3
+       pfmul mm6, mm1
+       movq  mm5, mm4
+       pfmul mm4, mm2                  /* mm4=qqOO*facel */
+       pfmul mm5, mm3                  /* mm5=qqOH*facel */
+       pfmul mm6, mm3                  /* mm6=qqHH*facel */
+       punpckldq mm5,mm5               /* spread to both halves */
+       punpckldq mm6,mm6               /* spread to both halves */
+       movq  [esp + qqOO], mm4
+       movq  [esp + qqOH], mm5
+       movq  [esp + qqHH], mm6
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]
+       shl   ecx, 1
+       mov   edx, ecx
+       imul  ecx, [ebp + ntype]
+       add   edx, ecx
+       mov   eax, [ebp + nbfp]
+       movd  mm0, [eax + edx*4]
+       movd  mm1, [eax + edx*4 + 4]
+       movq  [esp + c6], mm0
+       movq  [esp + c12], mm1
+       movq  mm2, [mm_two]
+       movq  mm3, [mm_six]
+       movq  mm4, [mm_twelve]
+       movq  [esp + two], mm2
+       movq  [esp + six], mm3
+       movq  [esp + twelve], mm4
+       movd  mm5, [ebp + tabscale]
+       punpckldq mm5,mm5
+       movq  [esp + tsc], mm5
+.i3130_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm5, [eax + ebx*4]        /* move shX/shY to mm5 and shZ to mm6. */
+       movd  mm6, [eax + ebx*4 + 8]
+       movq  mm0, mm5
+       movq  mm1, mm5
+       movq  mm2, mm6
+       punpckldq mm0,mm0               /* also expand shX,Y,Z in mm0--mm2. */
+       punpckhdq mm1,mm1
+       punpckldq mm2,mm2               
+       
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+
+       pfadd mm5, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm7, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx          /* (use mm7 as temp. storage for iz.) */
+       pfadd mm6, mm7
+       movq  [esp + ixO], mm5  
+       movq  [esp + izO], mm6
+
+       movd  mm3, [eax + ebx*4 + 12]
+       movd  mm4, [eax + ebx*4 + 16]
+       movd  mm5, [eax + ebx*4 + 20]
+       punpckldq  mm3, [eax + ebx*4 + 24]
+       punpckldq  mm4, [eax + ebx*4 + 28]
+       punpckldq  mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+       
+       pfadd mm0, mm3
+       pfadd mm1, mm4
+       pfadd mm2, mm5          
+       movq [esp + ixH], mm0   
+       movq [esp + iyH], mm1   
+       movq [esp + izH], mm2   
+
+       /* clear vctot and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       movq  [esp + vnbtot], mm7
+       movq  [esp + fixO],  mm7
+       movq  [esp + fizO],  mm7
+       movq  [esp + fixH],  mm7
+       movq  [esp + fiyH],  mm7
+       movq  [esp + fizH],  mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+.i3130_inner_loop:
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+        add   [esp + innerjjnr],  4 /* advance pointer */
+
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esi + eax*4]
+       movd  mm1, [esi + eax*4 + 8]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+       
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+               
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1  
+       pfmul mm1,mm1
+       pfacc mm0, mm0
+       pfadd mm0, mm1          /* mm0=rsqO */
+       
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+       
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+
+        pfrsqrt mm1,mm0
+
+        movq mm2,mm1
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt (OO) */
+       pfmul mm0, mm1          /* mm0=r (OO): rsq*invsqrt */
+
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOO] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOO] /* fijC=qq*FF */
+
+       /* update vctot directly, use mm3 for fscal sum. */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       movq mm3, mm7
+       pfmul mm3, [esp + tsc]
+       
+       movq mm5, mm1
+       pfmul mm5,mm5
+       movq mm4, mm5
+       pfmul mm4,mm5
+       pfmul mm4,mm5
+       movq mm5, mm4
+       pfmul mm5,mm5   /* mm4=rinvsix, mm5=rinvtwelve */
+
+       pfmul mm4, [esp + c6]
+       pfmul mm5, [esp + c12]
+       movq mm6,mm5
+       pfsub mm6,mm4
+
+       pfmul mm4, [esp + six]
+       pfmul mm5, [esp + twelve]
+       pfsub mm5,mm4
+       pfmul mm5, mm1
+       pfsubr mm3, mm5
+
+       pfmul mm3, mm1        /* mm3 is total fscal (for the oxygen) now */
+       
+       /* update vnbtot */ 
+       pfadd mm6, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm6       /* store the sum */      
+       
+       /* Ready with the oxygen - potential is updated, fscal is in mm3. */
+       /* time for hydrogens! */
+
+       movq mm0, [esp + tmprsqH]
+
+       pfrsqrt mm1, mm0
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       
+       /* change sign of fijC and multiply by rinv */
+        pxor mm4,mm4
+       pfsub mm4, mm7  
+       pfmul mm4, [esp + tsc]
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */  
+       
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+       
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+       
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+       
+       pfadd mm0, mm5          /* add up total force on j particle. */ 
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4]
+       movd mm3,  [edi + eax*4 + 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4], mm2
+       movd [edi + eax*4 +8], mm3
+
+       /* interactions with j H1 */
+
+       movq  mm0, [esi + eax*4 + 12]
+       movd  mm1, [esi + eax*4 + 20]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+       
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+               
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1  
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1          /* mm0=rsqO */
+       
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+       
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+
+        pfrsqrt mm1,mm0
+
+        movq mm2,mm1
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       pfmul mm0, mm1          /* mm0=r (rsq*invsqrt) */
+       
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+       /* update vctot  directly, force is moved to mm3 */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       pxor mm3, mm3
+       pfsub mm3, mm7
+       pfmul mm3, [esp + tsc]
+       pfmul mm3, mm1        /* mm3 is total fscal (for the oxygen) now */
+
+       movq mm0, [esp + tmprsqH]
+
+       pfrsqrt mm1, mm0
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       
+       /* change sign of fijC and multiply by rinv */
+        pxor mm4,mm4
+       pfsub mm4, mm7  
+       pfmul mm4, [esp + tsc]
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */          
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+       
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+       
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+       
+       pfadd mm0, mm5          /* add up total force on j particle. */ 
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4 + 12]
+       movd mm3,  [edi + eax*4 + 20]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4 + 12], mm2
+       movd [edi + eax*4 + 20], mm3
+
+       /* interactions with j H2 */
+       movq  mm0, [esi + eax*4 + 24]
+       movd  mm1, [esi + eax*4 + 32]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+               
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1  
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1          /* mm0=rsqO */
+       
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+       
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+
+        pfrsqrt mm1,mm0
+
+        movq mm2,mm1
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       pfmul mm0, mm1
+
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+       /* update vctot directly, use mm3 for fscal sum. */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       pxor mm3,mm3
+       pfsub mm3, mm7
+       pfmul mm3, [esp + tsc]
+       pfmul mm3, mm1        /* mm3 is total fscal (for the oxygen) now */
+
+       movq mm0, [esp + tmprsqH]
+
+       pfrsqrt mm1, mm0
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       
+       /* change sign of fijC and multiply by rinv */
+        pxor mm4,mm4
+       pfsub mm4, mm7  
+       pfmul mm4, [esp + tsc]
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */  
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+       
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4  
+
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+       
+       pfadd mm0, mm5          /* add up total force on j particle. */ 
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4 + 24]
+       movd mm3,  [edi + eax*4 + 32]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4 + 24], mm2
+       movd [edi + eax*4 + 32], mm3
+       
+       /*  done  - one more? */
+       dec dword ptr [esp + innerk]
+       jz  .i3130_updateouterdata
+       jmp .i3130_inner_loop   
+.i3130_updateouterdata:        
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment iO force */ 
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       movq  mm0, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm1, [esp + fizH]
+       movq  mm2, mm0
+       punpckldq mm0, mm3      /* mm0(l)=fxH1, mm0(h)=fyH1 */
+       punpckhdq mm2, mm3      /* mm2(l)=fxH2, mm2(h)=fyH2 */
+       movq mm3, mm1
+       pswapd mm3,mm3          
+       /* mm1 is fzH1 */
+       /* mm3 is fzH2 */
+
+       movq  mm6, [edi + ecx*4 + 12]       /* increment iH1 force */ 
+       movd  mm7, [edi + ecx*4 + 20]   
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       movq  [edi + ecx*4 + 12],  mm6
+       movd  [edi + ecx*4 + 20],  mm7
+       
+       movq  mm6, [edi + ecx*4 + 24]       /* increment iH2 force */
+       movd  mm7, [edi + ecx*4 + 32]   
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [edi + ecx*4 + 24],  mm6
+       movd  [edi + ecx*4 + 32],  mm7
+
+       
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+       
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       movq  mm7, [esp + vnbtot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnbtot[gid] */
+       /* finish if last */
+       dec dword ptr [ebp + nri]
+       jz  .i3130_end
+       /* not last, iterate once more! */
+       jmp .i3130_outer
+.i3130_end:
+       femms
+       add esp, 232
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+.globl inl3300_3dnow
+       .type inl3300_3dnow,@function
+inl3300_3dnow: 
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ix,              8
+.equ           iy,             12
+.equ           iz,             16
+.equ           iq,             20  /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          28  /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,         36  /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             44  /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            52  /* repeated (64bit) to fill 3dnow reg */
+.equ           two,            60  /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,             68  /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,            76  /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,           84
+.equ           innerjjnr,      88
+.equ           innerk,         92              
+.equ           fix,            96
+.equ           fiy,            100
+.equ           fiz,            104
+.equ           dx1,            108
+.equ           dy1,            112
+.equ           dz1,            116
+.equ           dx2,            120
+.equ           dy2,            124
+.equ           dz2,            128                                             
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 132            /* local stack space */
+       femms
+       /* move data to local stack */ 
+       movq  mm0, [mm_two]
+       movd  mm3, [ebp + tabscale]
+       movq  [esp + two],    mm0
+       punpckldq mm3,mm3
+       movq  [esp + tsc], mm3  
+       /* assume we have at least one i particle - start directly */   
+.i3300_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       pfmul mm2, [ebp + facel]
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       
+       pfadd mm0, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm3, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx
+       pfadd mm1, mm3
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+                               
+       /* clear total potential and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot],  mm7
+       movq  [esp + vnbtot], mm7
+       movq  [esp + fix],    mm7
+       movd  [esp + fiz],    mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  2
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3300_unroll_loop
+       jmp   .i3300_finish_inner
+.i3300_unroll_loop:
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+       
+       mov ecx, [ebp + charge]        /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+        punpckldq mm3, [ecx + ebx*4]     /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */ 
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */                
+       movq mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */        
+       movq mm6,mm5                    
+       punpckldq mm5,mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6,mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs. */
+        movq mm2,mm0                   /* amd 3dnow N-R iteration to get full precision. */
+       pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+       
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+
+       /* dispersion table */
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+       mov ecx, [esp + n1 + 4]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4 + 16]
+       punpckldq mm5, [edx + ecx*4 + 20]
+       punpckldq mm6, [edx + ecx*4 + 24]
+       punpckldq mm7, [edx + ecx*4 + 28]
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       pfadd mm3, mm7  /* add to fscal */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 32]
+       movd mm5, [edx + ecx*4 + 36]
+       movd mm6, [edx + ecx*4 + 40]
+       movd mm7, [edx + ecx*4 + 44]
+       mov ecx, [esp + n1 + 4]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4 + 32]
+       punpckldq mm5, [edx + ecx*4 + 36]
+       punpckldq mm6, [edx + ecx*4 + 40]
+       punpckldq mm7, [edx + ecx*4 + 44]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijC+fijD+fijR */
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm0, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0
+       punpckhdq mm1,mm1
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  2
+       jl    .i3300_finish_inner
+       jmp   .i3300_unroll_loop
+.i3300_finish_inner:   
+       and [esp + innerk],  1
+       jnz  .i3300_single_inner
+       jmp  .i3300_updateouterdata             
+.i3300_single_inner:
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]
+       pfmul mm3, mm5          /* mm3=qq */
+
+       mov esi, [ebp + nbfp]
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm0=rsq */
+       
+        pfrsqrt mm0,mm4
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm1=invsqrt */
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+       
+       /* dispersion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       pfadd mm3, mm7  /* add to fscal */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 32]
+       movd mm5, [edx + ecx*4 + 36]
+       movd mm6, [edx + ecx*4 + 40]
+       movd mm7, [edx + ecx*4 + 44]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijC+fijD+fijR */
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm0, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i3300_updateouterdata:        /* inner loop done: flush this i particle's accumulators, then loop or exit */
+       mov   ecx, [esp + ii3]         /* ecx = saved ii3 index of the i particle */
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]         /* first two components in one 64-bit op */
+       pfadd mm7, [esp + fiz]         /* third component; only the low dword is stored back */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]         /* edx = saved is3 index into fshift[] */
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4  /* advance pointer; size given explicitly (no register operand to infer it) */
+
+       movq  mm7, [esp + vctot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       movq  mm7, [esp + vnbtot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnb[gid] */
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3300_end        /* jecxz tests ecx itself, not the flags left by dec */
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx    /* write the decremented count back to the nri argument slot */
+       jmp .i3300_outer
+.i3300_end:   /* exit path: leave 3DNow! state, unwind the frame and return */
+       femms                   /* fast exit from MMX/3DNow! state */
+       add esp, 132            /* release the local area; NOTE(review): must equal the prologue's sub esp, which is outside this excerpt */
+       pop edi                 /* restore the registers saved on entry (pushes are outside this excerpt) */
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave                   /* esp = ebp, then pop ebp */
+       ret
+
+
+
+
+
+.globl inl3310_3dnow
+       .type inl3310_3dnow,@function
+inl3310_3dnow:         /* entry: define argument/local offsets, save registers, reserve locals, cache constants */
+.equ           nri,            8       /* nri..nsatoms: ebp-relative offsets of the stack arguments ([ebp+8] is the first) */
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+.equ           nsatoms,        84              
+       /* stack offsets for local variables */
+.equ           is3,            0
+.equ           ii3,            4
+.equ           shX,            8
+.equ           shY,            12 
+.equ           shZ,            16      
+.equ           ix,             20
+.equ           iy,             24
+.equ           iz,             28      
+.equ           iq,             32  /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          40  /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,         48  /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             56  /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            64  /* repeated (64bit) to fill 3dnow reg */
+.equ           two,            72  /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,             80  /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,            88  /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,           96      
+.equ           innerjjnr0,     100     /* innerjjnr0/innerk0 keep the start values; innerjjnr/innerk are the working copies */
+.equ           innerk0,        104     
+.equ           innerjjnr,      108
+.equ           innerk,         112     
+.equ           fix,            116
+.equ           fiy,            120
+.equ           fiz,            124
+.equ           dx1,            128
+.equ           dy1,            132
+.equ           dz1,            136
+.equ           dx2,            140
+.equ           dy2,            144
+.equ           dz2,            148                                                             
+.equ           nsvdwc,         152
+.equ           nscoul,         156
+.equ           nsvdw,          160
+.equ           solnr,          164     /* last local: 164 + 4 = 168 bytes, the amount reserved below */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 168            /* local stack space */
+       femms                   /* fast entry into MMX/3DNow! state */
+       movq  mm0, [mm_two]     /* 64-bit constant defined elsewhere in the file */
+       movd  mm3, [ebp + tabscale]
+       movq  [esp + two],    mm0
+       punpckldq mm3,mm3       /* tabscale copied into both halves */
+       movq  [esp + tsc], mm3  
+       /* assume we have at least one i particle - start directly */           
+.i3310_outer:  /* per-outer-iteration setup: shift vector, first atom index, pass counts, j-list bounds */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+       /* memory,imm operations carry an explicit dword ptr: there is no register to infer the size from */
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm0, [eax + ebx*4]        /* move shX/shY to mm0 and shZ to mm1 */
+       movd  mm1, [eax + ebx*4 + 8]
+       movq  [esp + shX], mm0
+       movd  [esp + shZ], mm1
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   eax, [ebp + nsatoms]      /* eax = pointer to three consecutive nsatoms entries */
+       add   dword ptr [ebp + nsatoms],  12    /* step the argument past those three dwords */
+       mov   ecx, [eax]                /* ecx = entry 0 */
+       mov   edx, [eax + 4]            /* edx = entry 1 */
+       mov   eax, [eax + 8]            /* eax = entry 2 */
+       sub   ecx, eax                  /* ecx = entry0 - entry2 */
+       sub   eax, edx                  /* eax = entry2 - entry1 */
+       /* iteration counts for the three passes */
+       mov   [esp + nsvdwc], edx       /* nsvdwc = entry1 */
+       mov   [esp + nscoul], eax       /* nscoul = entry2 - entry1 */
+       mov   [esp + nsvdw], ecx        /* nsvdw  = entry0 - entry2 */
+               
+       /* clear potential */
+       pxor  mm7,mm7
+       movq  [esp + vctot],  mm7
+       movq  [esp + vnbtot], mm7
+       mov   [esp + solnr],  ebx       /* solnr starts at ii; each mno pass reads it and increments it */
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2                     /* byte offset of jjnr[nj0] */
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]        /* edi stays the faction base for the force updates below */
+       
+       mov   ecx, [esp + nsvdwc]
+       cmp   ecx,  0                     /* no vdwc iterations: go straight to the coulomb test */
+       jnz   .i3310_mno_vdwc
+       jmp   .i3310_testcoul
+.i3310_mno_vdwc:       /* set up one i atom for the vdw+coulomb pass: iq, ntia, shifted position, zeroed force */
+       mov   ebx,  [esp + solnr]       /* ebx = index of this i atom */
+       inc   dword ptr [esp + solnr]   /* next pass takes the following atom */
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       pfmul mm2, [ebp + facel]        /* 64-bit operand; only the low product is kept (punpckldq replaces the high half) */
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx         /* ntia = 2*ntype*type[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+       
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]          /* add the shift vector to the i position */
+       pfadd mm1, [esp + shZ]          /* only the low dword is stored below */
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx    /* rewind the j-list pointer to its start */
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3310_unroll_vdwc_loop   /* flags still come from the sub (mov leaves them alone): at least 2 j atoms */
+       jmp   .i3310_finish_vdwc_inner
+.i3310_unroll_vdwc_loop:       /* two j particles per pass: tabulated coulomb, dispersion and repulsion */
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   dword ptr [esp + innerjjnr],  8 /* advance pointer (unrolled 2); explicit size, no register operand */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+       
+       mov ecx, [ebp + charge]        /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+        punpckldq mm3, [ecx + ebx*4]     /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */ 
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */                
+       movq mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */        
+       movq mm6,mm5                    
+       punpckldq mm5,mm7               /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6,mm7               /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs. */
+        movq mm2,mm0                   /* amd 3dnow N-R iteration to get full precision. */
+       pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       
+       pfmul mm4, mm0                  /* rsq * invsqrt = r */
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movq [esp + n1], mm4            /* both integer table indices, read back as dwords below */
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+       
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       /* coulomb table: ecx = 12*n, so each table point spans 12 floats (4 coulomb, 4 dispersion, 4 repulsion) */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]         /* same four values for the second pair go into the high halves */
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+
+       /* dispersion table */
+       mov ecx, [esp + n1]             /* ecx was reused for the second index, so rebuild the first */
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+       mov ecx, [esp + n1 + 4]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4 + 16]
+       punpckldq mm5, [edx + ecx*4 + 20]
+       punpckldq mm6, [edx + ecx*4 + 24]
+       punpckldq mm7, [edx + ecx*4 + 28]
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       pfadd mm3, mm7  /* add to fscal */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 32]
+       movd mm5, [edx + ecx*4 + 36]
+       movd mm6, [edx + ecx*4 + 40]
+       movd mm7, [edx + ecx*4 + 44]
+       mov ecx, [esp + n1 + 4]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4 + 32]
+       punpckldq mm5, [edx + ecx*4 + 36]
+       punpckldq mm6, [edx + ecx*4 + 40]
+       punpckldq mm7, [edx + ecx*4 + 44]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijC+fijD+fijR */
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm0, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache; NOTE(review): operand is the dx1 slot, not fix */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0       /* mm0 = fscal of the first pair in both halves */
+       punpckhdq mm1,mm1       /* mm1 = fscal of the second pair in both halves */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  2      /* explicit size, as on the pointer advance above */
+       jl    .i3310_finish_vdwc_inner
+       jmp   .i3310_unroll_vdwc_loop
+.i3310_finish_vdwc_inner:      /* innerk is negative here; its low bit tells whether one j atom is left over */
+       and dword ptr [esp + innerk],  1        /* explicit size: memory,imm has no register to infer it from */
+       jnz  .i3310_single_vdwc_inner
+       jmp  .i3310_updateouterdata_vdwc                
+.i3310_single_vdwc_inner:      /* leftover j atom of the vdw+coulomb pass; only the low half of each mm register matters */
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]
+       pfmul mm3, mm5          /* mm3=qq */
+
+       mov esi, [ebp + nbfp]
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr, index into pos[] and faction[] */
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* low half of mm4 = rsq */
+       
+        pfrsqrt mm0,mm4
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm0=invsqrt */
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2              /* ecx = 12*n; reused unchanged for the dispersion and repulsion loads below */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+       
+       /* dispersion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       pfadd mm3, mm7  /* add to fscal */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 32]
+       movd mm5, [edx + ecx*4 + 36]
+       movd mm6, [edx + ecx*4 + 40]
+       movd mm7, [edx + ecx*4 + 44]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijC+fijD+fijR */
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm0, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i3310_updateouterdata_vdwc:   /* add this i atom's accumulated force to faction[] and fshift[], then next atom or next pass */
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]         /* only the low dword is stored back */
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno */
+       dec dword ptr [esp + nsvdwc]    /* one vdw+coulomb atom done */
+       jz  .i3310_testcoul             /* none left: go on to the coulomb-only pass */
+       jmp .i3310_mno_vdwc
+.i3310_testcoul:       /* run the coulomb-only pass only when nscoul is non-zero */
+       mov  ecx, [esp + nscoul]
+       cmp  ecx,  0
+       jnz  .i3310_mno_coul
+       jmp  .i3310_testvdw     /* target label lies beyond this excerpt */
+.i3310_mno_coul:       /* set up one i atom for the coulomb-only pass: iq, shifted position, zeroed force (no ntia needed) */
+       mov   ebx,  [esp + solnr]       /* ebx = index of this i atom */
+       inc   dword ptr [esp + solnr]   /* next pass takes the following atom */
+       mov   edx, [ebp + charge]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[ii] */
+       pfmul mm2, [ebp + facel]        /* 64-bit operand; only the low product is kept (punpckldq replaces the high half) */
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iq], mm2           /* iq =facel*charge[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+       
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]          /* add the shift vector to the i position */
+       pfadd mm1, [esp + shZ]          /* only the low dword is stored below */
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx    /* rewind the j-list pointer to its start */
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3310_unroll_coul_loop   /* flags still come from the sub (mov leaves them alone): at least 2 j atoms */
+       jmp   .i3310_finish_coul_inner
+.i3310_unroll_coul_loop:       /* two j particles per pass: tabulated coulomb only */
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   dword ptr [esp + innerjjnr],  8 /* advance pointer (unrolled 2); explicit size, no register operand */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+       
+       mov ecx, [ebp + charge]        /* base of charge[] */
+       movq mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]          /* charge[jnr1] */
+        punpckldq mm3, [ecx + ebx*4]     /* move charge 2 to high part of mm3 */
+       pfmul mm3,mm5                    /* mm3 now has qq for both particles */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6,mm0                   /* dr = ir - jr */ 
+       pfsubr mm7,mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6,mm6                    /* square dx,dy,dz */
+       pfmul mm7,mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+
+       punpckldq mm0,mm1
+       punpckldq mm4,mm6               /* now 4 has rsq and 0 the seed for both pairs. */
+        movq mm2,mm0                   /* amd 3dnow N-R iteration to get full precision. */
+       pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       
+       pfmul mm4, mm0                  /* rsq * invsqrt = r */
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movq [esp + n1], mm4            /* both integer table indices, read back as dwords below */
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+       
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2              /* ecx = 12*n: the table stride is 12 floats, of which this pass reads the first 4 */
+       /* coulomb table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]         /* same four values for the second pair go into the high halves */
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3
+       pfmul mm1, [esp + tsc]          /* here tsc scales the negated fijC, not invsqrt as in the other loops */
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache; NOTE(review): operand is the dx1 slot, not fix */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0       /* mm0 = fscal of the first pair in both halves */
+       punpckhdq mm1,mm1       /* mm1 = fscal of the second pair in both halves */
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  2      /* explicit size, as on the pointer advance above */
+       jl    .i3310_finish_coul_inner
+       jmp   .i3310_unroll_coul_loop
+.i3310_finish_coul_inner:      /* innerk is negative here; its low bit tells whether one j atom is left over */
+       and dword ptr [esp + innerk],  1        /* explicit size: memory,imm has no register to infer it from */
+       jnz  .i3310_single_coul_inner
+       jmp  .i3310_updateouterdata_coul                
+.i3310_single_coul_inner:      /* leftover j atom of the coulomb-only pass; only the low half of each mm register matters */
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov ecx, [ebp + charge]
+       movd mm5, [esp + iq]
+       movd mm3, [ecx + eax*4]
+       pfmul mm3, mm5          /* mm3=qq */
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr, index into pos[] and faction[] */
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* low half of mm4 = rsq */
+       
+        pfrsqrt mm0,mm4
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm0=invsqrt */
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2              /* ecx = 12*n, the index of this table point's first float */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, mm3  /* vcoul=qq*VV */
+       pfmul mm3, mm7  /* fijC=FF*qq */ 
+
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]      /* add the earlier value */
+       movq [esp + vctot], mm5       /* store the sum */      
+       
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm0, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i3310_updateouterdata_coul:   
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno */
+       dec dword ptr [esp + nscoul]
+       jz  .i3310_testvdw
+       jmp .i3310_mno_coul
+.i3310_testvdw:        
+       mov  ecx, [esp + nsvdw]
+       cmp  ecx,  0
+       jnz  .i3310_mno_vdw
+       jmp  .i3310_last_mno
+.i3310_mno_vdw:
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+       mov   edx, [ebp + type]         
+       mov   edx, [edx + ebx*4]        
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx 
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+       
+       movq  mm0, [eax + ebx*4]
+       movd  mm1, [eax + ebx*4 + 8]
+       pfadd mm0, [esp + shX]
+       pfadd mm1, [esp + shZ]
+       movq  [esp + ix], mm0   
+       movd  [esp + iz], mm1   
+
+       /* clear forces */
+       pxor  mm7,mm7
+       movq  [esp + fix],   mm7
+       movd  [esp + fiz],   mm7
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  2
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3310_unroll_vdw_loop
+       jmp   .i3310_finish_vdw_inner
+.i3310_unroll_vdw_loop:        
+       /* paired innerloop starts here */
+       mov   ecx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]             /* eax/ebx=jnr */
+       add   [esp + innerjjnr],  8 /* advance pointer (unrolled 2) */
+       prefetch [ecx + 16]              /* prefetch data - trial and error says 16 is best */
+       
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       mov ecx, [ecx + ebx*4]           /* type [jnr2] */
+
+       mov esi, [ebp + nbfp]           /* base of nbfp */ 
+       shl edx, 1
+       shl ecx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       add ecx, [esp + ntia]
+
+       movq mm5, [esi + edx*4]         /* mm5 = 1st c6 / c12 */                
+       movq mm7, [esi + ecx*4]         /* mm7 = 2nd c6 / c12 */        
+       movq mm6, mm5                   
+       punpckldq mm5, mm7              /* mm5 = 1st c6 / 2nd c6 */
+       punpckhdq mm6, mm7              /* mm6 = 1st c12 / 2nd c12 */
+       movq [esp + c6], mm5
+       movq [esp + c12], mm6
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]                
+
+       mov   esi, [ebp + pos]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]           
+       movq  mm4, [esi + eax*4]         /* fetch first j coordinates */
+       movd  mm5, [esi + eax*4 + 8]            
+       pfsubr mm4,mm0                   /* dr = ir - jr */ 
+       pfsubr mm5,mm1
+       movq  [esp + dx1], mm4           /* store dr */
+       movd  [esp + dz1], mm5
+       pfmul mm4,mm4                    /* square dx,dy,dz */                   
+       pfmul mm5,mm5           
+       pfacc mm4, mm5                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm4, mm5                   /* first rsq in lower mm4 */
+
+       movq  mm6, [esi + ebx*4]         /* fetch second j coordinates */ 
+       movd  mm7, [esi + ebx*4 + 8]
+       
+       pfsubr mm6, mm0                  /* dr = ir - jr */ 
+       pfsubr mm7, mm1
+       movq  [esp + dx2], mm6           /* store dr */
+       movd  [esp + dz2], mm7
+       pfmul mm6, mm6                   /* square dx,dy,dz */
+       pfmul mm7, mm7
+       pfacc mm6, mm7                   /* accumulate to get dx*dx+dy*dy+dz*dz */
+       pfacc mm6, mm7                   /* second rsq in lower mm6 */
+
+        pfrsqrt mm0, mm4                /* lookup inverse square root seed */
+        pfrsqrt mm1, mm6
+ 
+
+       punpckldq mm0, mm1
+       punpckldq mm4, mm6              /* now 4 has rsq and 0 the seed for both pairs. */
+        movq mm2, mm0                  /* amd 3dnow N-R iteration to get full precision. */
+       pfmul mm0, mm0
+        pfrsqit1 mm0, mm4                              
+        pfrcpit2 mm0, mm2      
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+       /* do potential and fscal */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4, mm1
+       movq [esp + n1], mm4
+       pi2fd mm4, mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 is n0 */
+       
+       movq mm2, mm1
+       pfmul mm2, mm2  /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       /* dispersion table */
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]  
+       shl ecx, 2
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       movq mm3, mm7   /* add to fscal */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+       mov ecx, [esp + n1 + 4]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4 + 16]
+       punpckldq mm5, [edx + ecx*4 + 20]
+       punpckldq mm6, [edx + ecx*4 + 24]
+       punpckldq mm7, [edx + ecx*4 + 28]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijD+fijR */
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm1, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       prefetchw [esp + dx1]   /* prefetch i forces to cache */
+
+       /* spread fscalar to both positions */
+       movq mm1,mm0
+       punpckldq mm0,mm0
+       punpckhdq mm1,mm1
+
+       /* calc vector force */
+       prefetchw [edi + eax*4] /* prefetch the 1st faction to cache */
+       movq mm2,  [esp + dx1]  /* fetch dr */
+       movd mm3,  [esp + dz1]
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       prefetchw [edi + ebx*4] /* prefetch the 2nd faction to cache */
+       pfmul mm2, mm0          /* mult by fs */ 
+       pfmul mm3, mm0
+
+       movq mm4,  [esp + dx2]  /* fetch dr */
+       movd mm5,  [esp + dz2]
+       pfmul mm4, mm1          /* mult by fs */ 
+       pfmul mm5, mm1
+       /* update i forces */
+
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+
+       pfadd mm0, mm4
+       pfadd mm1, mm5
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j forces */
+
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax*4 + 8]
+       movq mm6,  [edi + ebx*4]
+       movd mm7,  [edi + ebx*4 + 8]
+       
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       pfsub mm6, mm4
+       pfsub mm7, mm5
+       
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       movq [edi + ebx*4], mm6
+       movd [edi + ebx*4 + 8], mm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  2
+       jl    .i3310_finish_vdw_inner
+       jmp   .i3310_unroll_vdw_loop
+.i3310_finish_vdw_inner:       
+       and [esp + innerk],  1
+       jnz  .i3310_single_vdw_inner
+       jmp  .i3310_updateouterdata_vdw         
+.i3310_single_vdw_inner:       
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+
+       mov esi, [ebp + nbfp]
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr1] */
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [esi + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5
+       movd mm5, [esi + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5
+
+       mov   esi, [ebp + pos]
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esp + ix]
+       movd  mm1, [esp + iz]
+       movq  mm4, [esi + eax*4]
+       movd  mm5, [esi + eax*4 + 8]
+       pfsubr mm4, mm0
+       pfsubr mm5, mm1
+       movq  [esp + dx1], mm4
+       pfmul mm4,mm4
+       movd  [esp + dz1], mm5  
+       pfmul mm5,mm5
+       pfacc mm4, mm5
+       pfacc mm4, mm5          /* mm4=rsq */
+       
+        pfrsqrt mm0,mm4
+        movq mm2,mm0
+        pfmul mm0,mm0
+        pfrsqit1 mm0,mm4                               
+        pfrcpit2 mm0,mm2       /* mm0=invsqrt */
+       pfmul mm4, mm0
+       movq mm1, mm4
+       /* mm0 is invsqrt, and mm1 r. */
+
+       /* calculate potentials and scalar force */
+       pfmul mm1, [esp + tsc]  /* mm1=rt */
+       pf2iw mm4,mm1
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm1, mm4                   /* now mm1 is eps and mm4 n0. */
+
+       movq mm2,mm1
+       pfmul mm2,mm2   /* mm1 is eps, mm2 is eps2 */
+       
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]  
+       shl ecx, 2
+       /* dispersion table
+        * load all the table values we need
+        */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       movq mm3, mm7   /* add to fscal */
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table
+        * load all the table values we need
+        */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+
+       pfmul mm6, mm1  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm1  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijD+fijR */
+
+       /* change sign of mm3 */
+        pxor mm1,mm1
+       pfsub mm1, mm3  
+       pfmul mm0, [esp + tsc]
+       pfmul mm0, mm1        /* mm0 is total fscal now */      
+
+       /* update vnbtot */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* spread fscalar to both positions */
+       punpckldq mm0,mm0
+       /* calc vectorial force */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm2,  [esp + dx1]
+       movd mm3,  [esp + dz1]
+
+       pfmul mm2, mm0
+       pfmul mm3, mm0
+
+       /* update i particle force */
+       movq mm0,  [esp + fix]
+       movd mm1,  [esp + fiz]
+       pfadd mm0, mm2
+       pfadd mm1, mm3
+       movq [esp + fix], mm0
+       movd [esp + fiz], mm1
+       /* update j particle force */
+       movq mm0,  [edi + eax*4]
+       movd mm1,  [edi + eax *4+ 8]
+       pfsub mm0, mm2
+       pfsub mm1, mm3
+       movq [edi + eax*4], mm0
+       movd [edi + eax*4 +8], mm1
+       /* done! */
+.i3310_updateouterdata_vdw:    
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment i force */
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fix]
+       pfadd mm7, [esp + fiz]
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+
+       /* loop back to mno */
+       dec dword ptr [esp + nsvdw]
+       jz  .i3310_last_mno
+       jmp .i3310_mno_vdw
+       
+.i3310_last_mno:       
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       movq  mm7, [esp + vnbtot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnb[gid] */
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3310_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i3310_outer
+.i3310_end:
+       femms
+       add esp, 168
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+.globl inl3320_3dnow
+       .type inl3320_3dnow,@function
+inl3320_3dnow: 
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+                       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ixO,             8
+.equ           iyO,            12
+.equ           izO,            16      
+.equ           ixH,            20  /* repeated (64bit) to fill 3dnow reg */
+.equ           iyH,            28  /* repeated (64bit) to fill 3dnow reg */
+.equ           izH,            36  /* repeated (64bit) to fill 3dnow reg */
+.equ           iqO,            44  /* repeated (64bit) to fill 3dnow reg */
+.equ           iqH,            52  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqO,            60  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqH,            68  /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          76  /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,         84  /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             92  /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            100 /* repeated (64bit) to fill 3dnow reg */
+.equ           two,            108 /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,             116 /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,            124 /* repeated (64bit) to fill 3dnow reg */
+.equ           ntia,           132 /* repeated (64bit) to fill 3dnow reg */
+.equ           innerjjnr,      140
+.equ           innerk,         144     
+.equ           fixO,           148
+.equ           fiyO,           152
+.equ           fizO,           156
+.equ           fixH,           160 /* repeated (64bit) to fill 3dnow reg */
+.equ           fiyH,           168 /* repeated (64bit) to fill 3dnow reg */
+.equ           fizH,           176 /* repeated (64bit) to fill 3dnow reg */
+.equ           dxO,            184
+.equ           dyO,            188
+.equ           dzO,            192
+.equ           dxH,            196 /* repeated (64bit) to fill 3dnow reg */
+.equ           dyH,            204 /* repeated (64bit) to fill 3dnow reg */
+.equ           dzH,            212 /* repeated (64bit) to fill 3dnow reg */
+.equ           tmprsqH,        220 /* repeated (64bit) to fill 3dnow reg */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 228            /* local stack space */
+       femms
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm1, [ebp + facel]
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[i.i0] */
+       pfmul mm2, mm1          
+       movq  [esp + iqO], mm2          /* iqO = facel*charge[ii] */
+       
+       movd  mm2, [edx + ebx*4 + 4]    /* mm2=charge[i.i0+1] */
+       pfmul mm2, mm1
+       punpckldq mm2,mm2               /* spread to both halves */
+       movq  [esp + iqH], mm2          /* iqH = facel*charge[i.i0+1] */
+
+       mov   edx, [ebp + type]         
+       mov   ecx, [edx + ebx*4]
+       shl   ecx, 1                    
+       imul  ecx, [ebp + ntype]      /* ecx = ntia = 2*ntype*type[i.i0] */ 
+       mov   [esp + ntia], ecx
+               
+       movq  mm3, [mm_two]
+       movq  mm4, [ebp + tabscale]
+       punpckldq mm4,mm4               /* spread to both halves */
+       movq  [esp + two],    mm3
+       movq  [esp + tsc], mm4        
+       /* assume we have at least one i particle - start directly */    
+.i3320_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm5, [eax + ebx*4]        /* move shX/shY to mm5 and shZ to mm6. */
+       movd  mm6, [eax + ebx*4 + 8]
+       movq  mm0, mm5
+       movq  mm1, mm5
+       movq  mm2, mm6
+       punpckldq mm0,mm0               /* also expand shX,Y,Z in mm0--mm2. */
+       punpckhdq mm1,mm1
+       punpckldq mm2,mm2               
+       
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       
+       pfadd mm5, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm7, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx          /* (use mm7 as temp. storage for iz.) */
+       pfadd mm6, mm7
+       movq  [esp + ixO], mm5  
+       movq  [esp + izO], mm6
+
+       movd  mm3, [eax + ebx*4 + 12]
+       movd  mm4, [eax + ebx*4 + 16]
+       movd  mm5, [eax + ebx*4 + 20]
+       punpckldq  mm3, [eax + ebx*4 + 24]
+       punpckldq  mm4, [eax + ebx*4 + 28]
+       punpckldq  mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+       
+       pfadd mm0, mm3
+       pfadd mm1, mm4
+       pfadd mm2, mm5          
+       movq [esp + ixH], mm0   
+       movq [esp + iyH], mm1   
+       movq [esp + izH], mm2   
+                                       
+       /* clear vctot and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       movq  [esp + vnbtot], mm7
+       movq  [esp + fixO],   mm7
+       movd  [esp + fizO],   mm7
+       movq  [esp + fixH],   mm7
+       movq  [esp + fiyH],   mm7
+       movq  [esp + fizH],   mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   [esp + innerk], edx        
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+.i3320_inner_loop:     
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+        add   [esp + innerjjnr],  4 /* advance pointer */
+       prefetch [ecx + 16]        /* prefetch data - trial and error says 16 is best */
+
+       mov ecx, [ebp + charge]
+       movd mm7, [ecx + eax*4]
+       punpckldq mm7,mm7
+       movq mm6,mm7
+       pfmul mm6, [esp + iqO]
+       pfmul mm7, [esp + iqH]  /* mm6=qqO, mm7=qqH */
+       movd [esp + qqO], mm6
+       movq [esp + qqH], mm7
+
+       mov ecx, [ebp + type]
+       mov edx, [ecx + eax*4]           /* type [jnr] */
+       mov ecx, [ebp + nbfp]
+       shl edx, 1
+       add edx, [esp + ntia]            /* tja = ntia + 2*type */
+       movd mm5, [ecx + edx*4]         /* mm5 = 1st c6 */              
+       movq [esp + c6], mm5
+       movd mm5, [ecx + edx*4 + 4]     /* mm5 = 1st c12 */             
+       movq [esp + c12], mm5   
+                       
+       lea   eax, [eax + eax*2]
+       
+       movq  mm0, [esi + eax*4]
+       movd  mm1, [esi + eax*4 + 8]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+       
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+               
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1  
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1          /* mm0=rsqO */
+       
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+       
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+       
+        pfrsqrt mm1,mm0
+
+        movq mm2,mm1
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+
+       pfmul mm0, mm1          /* mm0=r */
+
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqO]  /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqO]  /* fijC=qq*FF */
+
+       /* update vctot directly, use mm3 for fscal sum. */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       movq mm3, mm7
+       
+       /* dispersion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       pfadd mm3, mm7  /* add to fscal */ 
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 32]
+       movd mm5, [edx + ecx*4 + 36]
+       movd mm6, [edx + ecx*4 + 40]
+       movd mm7, [edx + ecx*4 + 44]
+
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijC+fijD+fijR */
+
+       /* change sign of fscal and multiply with rinv */ 
+        pxor mm0,mm0
+       pfsubr mm3, mm0 
+       pfmul mm3, [esp + tsc]
+       pfmul mm3, mm1        /* mm3 is total fscal (for the oxygen) now */
+       
+       /* update vnbtot */ 
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+       
+       /* Ready with the oxygen - potential is updated, fscal is in mm3. */
+       /* now do the two hydrogens. */
+       movq mm0, [esp + tmprsqH] /* mm0=rsqH */
+
+       pfrsqrt mm1, mm0
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqH]  /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqH]  /* fijC=qq*FF */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       
+       /* change sign of fijC and multiply by rinv */
+        pxor mm4,mm4
+       pfsub mm4, mm7  
+       pfmul mm4, [esp + tsc]
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */  
+       
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       prefetchw [edi + eax*4] /* prefetch faction to cache */ 
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+       
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+       
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+       
+       pfadd mm0, mm5          /* add up total force on j particle. */ 
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4]
+       movd mm3,  [edi + eax*4 + 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4], mm2
+       movd [edi + eax*4 +8], mm3
+       
+       /*  done  - one more? */
+       dec dword ptr [esp + innerk]
+       jz  .i3320_updateouterdata
+       jmp .i3320_inner_loop
+.i3320_updateouterdata:        
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment iO force */ 
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       movq  mm0, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm1, [esp + fizH]
+       movq  mm2, mm0
+       punpckldq mm0, mm3      /* mm0(l)=fxH1, mm0(h)=fyH1 */
+       punpckhdq mm2, mm3      /* mm2(l)=fxH2, mm2(h)=fyH2 */
+       movq mm3, mm1
+       pswapd mm3,mm3          
+       /* mm1 is fzH1 */
+       /* mm3 is fzH2 */
+       
+       movq  mm6, [edi + ecx*4 + 12]       /* increment iH1 force */ 
+       movd  mm7, [edi + ecx*4 + 20]   
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       movq  [edi + ecx*4 + 12],  mm6
+       movd  [edi + ecx*4 + 20],  mm7
+       
+       movq  mm6, [edi + ecx*4 + 24]       /* increment iH2 force */
+       movd  mm7, [edi + ecx*4 + 32]   
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [edi + ecx*4 + 24],  mm6
+       movd  [edi + ecx*4 + 32],  mm7
+
+       
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+       
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+       
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       movq  mm7, [esp + vnbtot]     
+       pfacc mm7,mm7                 /* same for Vnb */
+       
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnb[gid] */
+       /* finish if last */
+       dec dword ptr [ebp + nri]
+       jz  .i3320_end
+       /* not last, iterate once more! */
+       jmp .i3320_outer
+.i3320_end:
+       femms
+       add esp, 228
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+       
+
+.globl inl3330_3dnow
+       .type inl3330_3dnow,@function
+inl3330_3dnow: 
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+                       /* stack offsets for local variables */
+.equ           is3,             0
+.equ           ii3,             4
+.equ           ixO,             8
+.equ           iyO,            12
+.equ           izO,            16      
+.equ           ixH,            20  /* repeated (64bit) to fill 3dnow reg */
+.equ           iyH,            28  /* repeated (64bit) to fill 3dnow reg */
+.equ           izH,            36  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqOO,           44  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqOH,           52  /* repeated (64bit) to fill 3dnow reg */
+.equ           qqHH,           60  /* repeated (64bit) to fill 3dnow reg */
+.equ           c6,             68  /* repeated (64bit) to fill 3dnow reg */
+.equ           c12,            76  /* repeated (64bit) to fill 3dnow reg */
+.equ           two,            84  /* repeated (64bit) to fill 3dnow reg */
+.equ           n1,             92  /* repeated (64bit) to fill 3dnow reg */
+.equ           tsc,            100 /* repeated (64bit) to fill 3dnow reg */
+.equ           vctot,          108 /* repeated (64bit) to fill 3dnow reg */
+.equ           vnbtot,         116 /* repeated (64bit) to fill 3dnow reg */
+.equ           innerjjnr,      124
+.equ           innerk,         128     
+.equ           fixO,           132
+.equ           fiyO,           136
+.equ           fizO,           140
+.equ           fixH,           144 /* repeated (64bit) to fill 3dnow reg */
+.equ           fiyH,           152 /* repeated (64bit) to fill 3dnow reg */
+.equ           fizH,           160 /* repeated (64bit) to fill 3dnow reg */
+.equ           dxO,            168
+.equ           dyO,            172
+.equ           dzO,            176
+.equ           dxH,            180  /* repeated (64bit) to fill 3dnow reg */
+.equ           dyH,            188  /* repeated (64bit) to fill 3dnow reg */
+.equ           dzH,            196  /* repeated (64bit) to fill 3dnow reg */
+.equ           tmprsqH,        204  /* repeated (64bit) to fill 3dnow reg */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 212            /* local stack space */
+       femms
+       /* assume we have at least one i particle - start directly */   
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       mov   edx, [ebp + charge]
+       movd  mm1, [ebp + facel]        /* mm1=facel */
+       movd  mm2, [edx + ebx*4]        /* mm2=charge[i.i0] (O) */
+       movd  mm3, [edx + ebx*4 + 4]    /* mm3=charge[i.i0+1] (H) */
+       movq  mm4, mm2  
+       pfmul mm4, mm1
+       movq  mm6, mm3
+       pfmul mm6, mm1
+       movq  mm5, mm4
+       pfmul mm4, mm2                  /* mm4=qqOO*facel */
+       pfmul mm5, mm3                  /* mm5=qqOH*facel */
+       pfmul mm6, mm3                  /* mm6=qqHH*facel */
+       punpckldq mm5,mm5               /* spread to both halves */
+       punpckldq mm6,mm6               /* spread to both halves */
+       movq  [esp + qqOO], mm4
+       movq  [esp + qqOH], mm5
+       movq  [esp + qqHH], mm6
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]
+       shl   ecx, 1
+       mov   edx, ecx
+       imul  ecx, [ebp + ntype]
+       add   edx, ecx
+       mov   eax, [ebp + nbfp]
+       movd  mm0, [eax + edx*4]
+       movd  mm1, [eax + edx*4 + 4]
+       movq  [esp + c6], mm0
+       movq  [esp + c12], mm1
+       movq  mm2, [mm_two]
+       movq  [esp + two], mm2
+       movd  mm3, [ebp + tabscale]
+       punpckldq mm3,mm3
+       movq  [esp + tsc], mm3
+.i3330_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+       
+       movq  mm5, [eax + ebx*4]        /* move shX/shY to mm5 and shZ to mm6. */
+       movd  mm6, [eax + ebx*4 + 8]
+       movq  mm0, mm5
+       movq  mm1, mm5
+       movq  mm2, mm6
+       punpckldq mm0,mm0               /* also expand shX,Y,Z in mm0--mm2. */
+       punpckhdq mm1,mm1
+       punpckldq mm2,mm2               
+       
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx=ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+
+       pfadd mm5, [eax + ebx*4]        /* ix = shX + posX (and iy too) */
+       movd  mm7, [eax + ebx*4 + 8]    /* cant use direct memory add for 4 bytes (iz) */
+       mov   [esp + ii3], ebx          /* (use mm7 as temp. storage for iz.) */
+       pfadd mm6, mm7
+       movq  [esp + ixO], mm5  
+       movq  [esp + izO], mm6
+
+       movd  mm3, [eax + ebx*4 + 12]
+       movd  mm4, [eax + ebx*4 + 16]
+       movd  mm5, [eax + ebx*4 + 20]
+       punpckldq  mm3, [eax + ebx*4 + 24]
+       punpckldq  mm4, [eax + ebx*4 + 28]
+       punpckldq  mm5, [eax + ebx*4 + 32] /* coords of H1 in low mm3-mm5, H2 in high */
+       
+       pfadd mm0, mm3
+       pfadd mm1, mm4
+       pfadd mm2, mm5          
+       movq [esp + ixH], mm0   
+       movq [esp + iyH], mm1   
+       movq [esp + izH], mm2   
+
+       /* clear vctot, vnbtot and i forces */
+       pxor  mm7,mm7
+       movq  [esp + vctot], mm7
+       movq  [esp + vnbtot], mm7
+       movq  [esp + fixO],  mm7
+       movq  [esp + fizO],  mm7
+       movq  [esp + fixH],  mm7
+       movq  [esp + fiyH],  mm7
+       movq  [esp + fizH],  mm7
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   [esp + innerk], edx        
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+.i3330_inner_loop:
+       /* a single j particle iteration here - compare with the unrolled code for comments. */
+       mov   eax, [esp + innerjjnr]
+       mov   eax, [eax]        /* eax=jnr offset */
+        add   [esp + innerjjnr],  4 /* advance pointer */
+
+       lea   eax, [eax + eax*2]
+
+       movq  mm0, [esi + eax*4]
+       movd  mm1, [esi + eax*4 + 8]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+       
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+               
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1  
+       pfmul mm1,mm1
+       pfacc mm0, mm0
+       pfadd mm0, mm1          /* mm0=rsqO */
+       
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+       
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+
+        pfrsqrt mm1,mm0
+
+        movq mm2,mm1
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt (O-O) */
+       pfmul mm0, mm1          /* mm0=r (O-O) */
+
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOO] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOO] /* fijC=qq*FF */
+
+       /* update vctot directly, use mm3 for fscal sum. */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       movq mm3, mm7
+
+       /* dispersion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 16]
+       movd mm5, [edx + ecx*4 + 20]
+       movd mm6, [edx + ecx*4 + 24]
+       movd mm7, [edx + ecx*4 + 28]
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */  
+
+       movq mm4, [esp + c6]
+       pfmul mm7, mm4  /* fijD */
+       pfmul mm5, mm4  /* vnb6 */           
+       pfadd mm3, mm7  /* add to fscal */ 
+
+       /* update vnbtot to release mm5! */
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+
+       /* repulsion table */
+       /* load all the table values we need */
+       movd mm4, [edx + ecx*4 + 32]
+       movd mm5, [edx + ecx*4 + 36]
+       movd mm6, [edx + ecx*4 + 40]
+       movd mm7, [edx + ecx*4 + 44]
+
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       movq mm6, [esp + c12]
+       pfmul mm7, mm6  /* fijR */
+       pfmul mm5, mm6  /* vnb12 */
+       pfadd mm3, mm7  /* total fscal fijC+fijD+fijR */
+
+       /* change sign of fscal and multiply with rinv */ 
+        pxor mm0,mm0
+       pfsubr mm3, mm0 
+       pfmul mm3, [esp + tsc]
+       pfmul mm3, mm1        /* mm3 is total fscal (for the oxygen) now */
+       
+       /* update vnbtot */ 
+       pfadd mm5, [esp + vnbtot]      /* add the earlier value */
+       movq [esp + vnbtot], mm5       /* store the sum */      
+       
+       /* Ready with the oxygen - potential is updated, fscal is in mm3.
+        * time for hydrogens!
+         */
+       
+       movq mm0, [esp + tmprsqH]
+
+       pfrsqrt mm1, mm0
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       
+       /* change sign of fijC and multiply by rinv */
+        pxor mm4,mm4
+       pfsub mm4, mm7  
+       pfmul mm4, [esp + tsc]
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */  
+       
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+       
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+       
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+       
+       pfadd mm0, mm5          /* add up total force on j particle. */ 
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4]
+       movd mm3,  [edi + eax*4 + 8]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4], mm2
+       movd [edi + eax*4 +8], mm3
+
+       /* interactions with j H1 */
+
+       movq  mm0, [esi + eax*4 + 12]
+       movd  mm1, [esi + eax*4 + 20]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+       
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+               
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1  
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1          /* mm0=rsqO */
+       
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+       
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+
+        pfrsqrt mm1,mm0
+
+        movq mm2,mm1
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       pfmul mm0, mm1          /* mm0=r */
+       
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+       /* update vctot directly, force is moved to mm3. */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       pxor mm3, mm3
+       pfsub mm3, mm7
+       pfmul mm3, [esp + tsc]
+       pfmul mm3, mm1        /* mm3 is total fscal (for the oxygen) now */
+
+       movq mm0, [esp + tmprsqH]
+
+       pfrsqrt mm1, mm0
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       
+       /* change sign of fijC and multiply by rinv */
+        pxor mm4,mm4
+       pfsub mm4, mm7  
+       pfmul mm4, [esp + tsc]
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */          
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+       
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4
+       
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+       
+       pfadd mm0, mm5          /* add up total force on j particle. */ 
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4 + 12]
+       movd mm3,  [edi + eax*4 + 20]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4 + 12], mm2
+       movd [edi + eax*4 + 20], mm3
+
+       /* interactions with j H2 */
+       movq  mm0, [esi + eax*4 + 24]
+       movd  mm1, [esi + eax*4 + 32]
+       /* copy & expand to mm2-mm4 for the H interactions */
+       movq  mm2, mm0
+       movq  mm3, mm0
+       movq  mm4, mm1
+       punpckldq mm2,mm2
+       punpckhdq mm3,mm3
+       punpckldq mm4,mm4
+
+       pfsubr mm0, [esp + ixO]
+       pfsubr mm1, [esp + izO]
+               
+       movq  [esp + dxO], mm0
+       pfmul mm0,mm0
+       movd  [esp + dzO], mm1  
+       pfmul mm1,mm1
+       pfacc mm0, mm1
+       pfadd mm0, mm1          /* mm0=rsqO */
+       
+       punpckldq mm2, mm2
+       punpckldq mm3, mm3
+       punpckldq mm4, mm4  /* mm2-mm4 is jx-jz */
+       pfsubr mm2, [esp + ixH]
+       pfsubr mm3, [esp + iyH]
+       pfsubr mm4, [esp + izH] /* mm2-mm4 is dxH-dzH */
+       
+       movq [esp + dxH], mm2
+       movq [esp + dyH], mm3
+       movq [esp + dzH], mm4
+       pfmul mm2,mm2
+       pfmul mm3,mm3
+       pfmul mm4,mm4
+
+       pfadd mm3,mm2
+       pfadd mm3,mm4           /* mm3=rsqH */
+       movq [esp + tmprsqH], mm3
+
+        pfrsqrt mm1,mm0
+
+        movq mm2,mm1
+        pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       pfmul mm0, mm1
+
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movd [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqOH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqOH] /* fijC=qq*FF */
+
+       /* update vctot directly, use mm3 for fscal sum */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       pxor mm3,mm3
+       pfsub mm3, mm7
+       pfmul mm3, [esp + tsc]
+       pfmul mm3, mm1         /* mm3 is total fscal (for the oxygen) now */    
+
+       movq mm0, [esp + tmprsqH]
+
+       pfrsqrt mm1, mm0
+       pswapd mm0,mm0
+       pfrsqrt mm2, mm0
+       pswapd mm0,mm0
+       punpckldq mm1,mm2       /* seeds are in mm1 now, and rsq in mm0. */
+
+       movq mm2, mm1
+       pfmul mm1,mm1
+        pfrsqit1 mm1,mm0                               
+        pfrcpit2 mm1,mm2       /* mm1=invsqrt */
+       
+       pfmul mm0,mm1           /* mm0=r */
+       pfmul mm0, [esp + tsc]
+       pf2iw mm4, mm0
+       movq [esp + n1], mm4
+       pi2fd mm4,mm4
+       pfsub mm0, mm4                   /* now mm0 is eps and mm4 n0 */
+       movq  mm2, mm0
+       pfmul mm2, mm2          /* mm0 is eps, mm2 eps2 */
+       
+       /* coulomb table */
+       mov edx, [ebp + VFtab]
+       mov ecx, [esp + n1]
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       /* load all values we need */
+       movd mm4, [edx + ecx*4]
+       movd mm5, [edx + ecx*4 + 4]
+       movd mm6, [edx + ecx*4 + 8]
+       movd mm7, [edx + ecx*4 + 12]
+       mov ecx, [esp + n1 + 4] /* ecx = table index from the high half of n1 */
+       lea ecx, [ecx + ecx*2]
+       shl ecx, 2
+       punpckldq mm4, [edx + ecx*4]
+       punpckldq mm5, [edx + ecx*4 + 4]
+       punpckldq mm6, [edx + ecx*4 + 8]
+       punpckldq mm7, [edx + ecx*4 + 12]
+
+       
+       pfmul mm6, mm0  /* mm6 = Geps */                
+       pfmul mm7, mm2  /* mm7 = Heps2 */
+       
+       pfadd mm5, mm6
+       pfadd mm5, mm7  /* mm5 = Fp */
+
+       pfmul mm7, [esp + two]  /* two*Heps2 */
+       pfadd mm7, mm6
+       pfadd mm7, mm5  /* mm7=FF */
+
+       pfmul mm5, mm0  /* mm5=eps*Fp */
+       pfadd mm5, mm4  /*  mm5= VV */
+
+       pfmul mm5, [esp + qqHH] /* vcoul=qq*VV */
+       pfmul mm7, [esp + qqHH] /* fijC=qq*FF */
+       /* update vctot */
+       pfadd mm5, [esp + vctot]
+       movq [esp + vctot], mm5
+       
+       /* change sign of fijC and multiply by rinv */
+        pxor mm4,mm4
+       pfsub mm4, mm7  
+       pfmul mm4, [esp + tsc]
+       pfmul mm4, mm1        /* mm4 is total fscal (for the hydrogens) now */  
+
+       /* spread oxygen fscalar to both positions */
+       punpckldq mm3,mm3
+       /* calc vectorial force for O */
+       movq mm0,  [esp + dxO]
+       movd mm1,  [esp + dzO]
+       pfmul mm0, mm3
+       pfmul mm1, mm3
+
+       /* calc vectorial force for H's */
+       movq mm5, [esp + dxH]
+       movq mm6, [esp + dyH]
+       movq mm7, [esp + dzH]
+       pfmul mm5, mm4
+       pfmul mm6, mm4
+       pfmul mm7, mm4
+       
+       /* update iO particle force */
+       movq mm2,  [esp + fixO]
+       movd mm3,  [esp + fizO]
+       pfadd mm2, mm0
+       pfadd mm3, mm1
+       movq [esp + fixO], mm2
+       movd [esp + fizO], mm3
+
+       /* update iH forces */
+       movq mm2, [esp + fixH]
+       movq mm3, [esp + fiyH]
+       movq mm4, [esp + fizH]
+       pfadd mm2, mm5
+       pfadd mm3, mm6
+       pfadd mm4, mm7
+       movq [esp + fixH], mm2
+       movq [esp + fiyH], mm3
+       movq [esp + fizH], mm4  
+
+       /* pack j forces from H in the same form as the oxygen force. */
+       pfacc mm5, mm6          /* mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2) */
+       pfacc mm7, mm7          /* mm7(l)=fjz(H1+H2) */
+       
+       pfadd mm0, mm5          /* add up total force on j particle. */ 
+       pfadd mm1, mm7
+
+       /* update j particle force */
+       movq mm2,  [edi + eax*4 + 24]
+       movd mm3,  [edi + eax*4 + 32]
+       pfsub mm2, mm0
+       pfsub mm3, mm1
+       movq [edi + eax*4 + 24], mm2
+       movd [edi + eax*4 + 32], mm3
+       
+       /*  done  - one more? */
+       dec dword ptr [esp + innerk]
+       jz  .i3330_updateouterdata
+       jmp .i3330_inner_loop   
+.i3330_updateouterdata:        
+       mov   ecx, [esp + ii3]
+
+       movq  mm6, [edi + ecx*4]       /* increment iO force */ 
+       movd  mm7, [edi + ecx*4 + 8]    
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       movq  [edi + ecx*4],    mm6
+       movd  [edi + ecx*4 +8], mm7
+
+       movq  mm0, [esp + fixH]
+       movq  mm3, [esp + fiyH]
+       movq  mm1, [esp + fizH]
+       movq  mm2, mm0
+       punpckldq mm0, mm3      /* mm0(l)=fxH1, mm0(h)=fyH1 */
+       punpckhdq mm2, mm3      /* mm2(l)=fxH2, mm2(h)=fyH2 */
+       movq mm3, mm1
+       pswapd mm3,mm3          
+       /* mm1 is fzH1 */
+       /* mm3 is fzH2 */
+
+       movq  mm6, [edi + ecx*4 + 12]       /* increment iH1 force */ 
+       movd  mm7, [edi + ecx*4 + 20]   
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       movq  [edi + ecx*4 + 12],  mm6
+       movd  [edi + ecx*4 + 20],  mm7
+       
+       movq  mm6, [edi + ecx*4 + 24]       /* increment iH2 force */
+       movd  mm7, [edi + ecx*4 + 32]   
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [edi + ecx*4 + 24],  mm6
+       movd  [edi + ecx*4 + 32],  mm7
+
+       
+       mov   ebx, [ebp + fshift]    /* increment fshift force */
+       mov   edx, [esp + is3]
+
+       movq  mm6, [ebx + edx*4]        
+       movd  mm7, [ebx + edx*4 + 8]    
+       pfadd mm6, [esp + fixO]
+       pfadd mm7, [esp + fizO]
+       pfadd mm6, mm0
+       pfadd mm7, mm1
+       pfadd mm6, mm2
+       pfadd mm7, mm3
+       movq  [ebx + edx*4],     mm6
+       movd  [ebx + edx*4 + 8], mm7
+       
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       movq  mm7, [esp + vctot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vc]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vc[gid] */
+
+       movq  mm7, [esp + vnbtot]     
+       pfacc mm7,mm7                 /* get and sum the two parts of total potential */
+
+       mov   eax, [ebp + Vnb]
+       movd  mm6, [eax + edx*4] 
+       pfadd mm6, mm7
+       movd  [eax + edx*4], mm6              /* increment vnbtot[gid] */
+       /* finish if last */
+       dec dword ptr [ebp + nri]
+       jz  .i3330_end
+       /* not last, iterate once more! */
+       jmp .i3330_outer
+.i3330_end:
+       femms
+       add esp, 212
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+ 
diff --git a/src/gmxlib/x86_3dnow.asm b/src/gmxlib/x86_3dnow.asm

deleted file mode 100644 (file)

index 0800d3b..0000000
--- a/src/gmxlib/x86_3dnow.asm
+++ /dev/null
@@ -1,15902 +0,0 @@
-;;
-;;                This source code is part of
-;; 
-;;                 G   R   O   M   A   C   S
-;; 
-;;          GROningen MAchine for Chemical Simulations
-;; 
-;;                        VERSION 3.0
-;; 
-;; Copyright (c) 1991-2001
-;; BIOSON Research Institute, Dept. of Biophysical Chemistry
-;; University of Groningen, The Netherlands
-;; 
-;; This program is free software; you can redistribute it and/or
-;; modify it under the terms of the GNU General Public License
-;; as published by the Free Software Foundation; either version 2
-;; of the License, or (at your option) any later version.
-;; 
-;; If you want to redistribute modifications, please consider that
-;; scientific software is very special. Version control is crucial -
-;; bugs must be traceable. We will be happy to consider code for
-;; inclusion in the official distribution, but derived work must not
-;; be called official GROMACS. Details are found in the README & COPYING
-;; files - if they are missing, get the official version at www.gromacs.org.
-;; 
-;; To help us fund GROMACS development, we humbly ask that you cite
-;; the papers on the package - you can find them in the top README file.
-;; 
-;; Do check out http://www.gromacs.org , or mail us at gromacs@gromacs.org .
-;; 
-;; And Hey:
-;; GROup of MAchos and Cynical Suckers
-       
-
-; NASM macro set to make interfacing to 32-bit programs easier -*- nasm -*-
-%imacro proc 1                  ; "proc name": begin a procedure definition called name
-%push proc
-          global %1             ; make the procedure name visible to the linker
-%1:       push ebp              ; entry label, then save the caller's frame pointer
-          mov ebp,esp           ; [ebp] = saved ebp; the argument offset counter below starts at 8
-%assign %$arg 8
-%define %$procname %1
-%endmacro
-
-
-
-%imacro arg 0-1 4               ; "label arg [size]": used with the argument name as a label; size defaults to 4
-%00       equ %$arg             ; the label becomes the current argument offset (used as [ebp + label])
-%assign %$arg %1+%$arg
-%endmacro
-
-
-
-%imacro endproc 0               ; close the procedure opened by the matching "proc"
-%ifnctx proc
-%error Mismatched `endproc'/`proc'
-
-%else
-          leave                 ; undo the push ebp / mov ebp,esp emitted by proc
-          ret
-__end_%$procname:               ; useful for calculating function size
-          ; drop the preprocessor context that proc pushed
-%pop
-%endif
-%endmacro
-
-       ;;      This file contains a subset of the gromacs innerloops
-       ;;      manually written in assembly to optimize performance
-       ;;      on AMD extended 3DNow-enabled processors like Athlon 
-       ;;      and later generations.
-       ;;      Erik Lindahl, 2000, erik@theophys.kth.se
-       
-segment .data                   ; float constants, each stored twice (8 bytes) so one movq fills both halves of an mm register
-mm_two                          ; 2.0, 2.0
-       dd 2.0
-       dd 2.0
-mm_six                          ; 6.0, 6.0 - copied to the kernels' local .six slot
-       dd 6.0
-       dd 6.0
-mm_twelve                       ; 12.0, 12.0 - copied to the kernels' local .twelve slot
-       dd 12.0
-       dd 12.0
-
-
-segment .text
-
-
-       global check3dnow       ;  tries to issue a simple 3DNOW instruction    
-check3dnow:                     ; takes no arguments and sets no return value
-       femms                    ; clear MMX/x87 state before the probe
-       pfmul mm0,mm0            ; the probe itself; presumably faults on CPUs without 3DNow and the caller traps that - TODO confirm against caller
-       femms                    ; leave the state clean again for x87 code
-       ret
-       
-       
-       global vecrecip_3dnow    ; element-wise reciprocal of an array of 32-bit floats
-vecrecip_3dnow                  ; stack args: [ebp+8] = input array, [ebp+12] = output array, [ebp+16] = element count
-       push ebp
-       mov ebp,esp     
-       push eax                 ; all four scratch registers are saved and restored
-       push ebx
-       push ecx
-       push edx
-       ;; eax = read pointer, ebx = write pointer, ecx = loop counter, edx = saved element count
-       mov eax, [ebp + 8]
-       mov ebx, [ebp + 12]     
-       mov ecx, [ebp + 16]
-        mov edx, ecx            ; keep the full count for the tail
-        shr ecx, 2              ; ecx = number of 4-element groups
-        jecxz .tail             ; fewer than four elements: go straight to the tail
-        emms   
-.mainloop:                      ; one pass handles four floats (two quadwords), interleaved for scheduling
-        movq mm0,[eax]          ; mm0 = x0,x1
-       add eax, byte 8
-        pfrcp mm1,mm0           ; reciprocal seed for x0 (low element)
-       movq mm4,[eax]           ; mm4 = x2,x3
-       pswapd mm0,mm0           ; bring x1 into the low half
-       add eax, byte 8
-        pfrcp mm2,mm0           ; reciprocal seed for x1
-       pswapd mm0,mm0           ; restore x0,x1 order
-        pfrcp mm5,mm4           ; reciprocal seed for x2
-       pswapd mm4,mm4  
-       punpckldq mm1,mm2        ; mm1 = seed(x0),seed(x1)
-       pfrcp mm6,mm4            ; reciprocal seed for x3
-       pswapd mm4,mm4           ; restore x2,x3 order
-       pfrcpit1 mm0,mm1         ; refinement step 1 for x0,x1
-       punpckldq mm5,mm6        ; mm5 = seed(x2),seed(x3)
-       pfrcpit2 mm0,mm1         ; refinement step 2: mm0 = 1/x0,1/x1
-        movq [ebx],mm0
-       pfrcpit1 mm4,mm5         ; same two refinement steps for x2,x3
-       add ebx, byte 8
-       pfrcpit2 mm4,mm5        
-        movq [ebx],mm4
-       add ebx, byte 8 
-        dec ecx
-        jecxz .tail
-        jmp short .mainloop
-.tail:
-        mov ecx,edx
-        and ecx,3               ; ecx = elements left over (count mod 4)
-        jecxz .end
-.tailloop:                      ; one float per pass
-        movd mm0,[eax]
-       add eax, byte 4
-        pfrcp mm1,mm0           ; seed, then the same two refinement steps as above
-        pfrcpit1 mm0,mm1
-        pfrcpit2 mm0,mm1
-        movd [ebx],mm0 
-       add ebx, byte 4
-       dec ecx
-       jecxz .end
-       jmp short .tailloop
-.end:  
-       emms                     ; clear MMX state before returning
-       pop edx
-       pop ecx
-       pop ebx
-       pop eax
-       leave
-       ret
-
-               
-segment .text
-
-       global vecinvsqrt_3dnow  ; element-wise reciprocal square root of an array of 32-bit floats
-vecinvsqrt_3dnow                ; stack args: [ebp+8] = input array, [ebp+12] = output array, [ebp+16] = element count
-       push ebp
-       mov ebp,esp     
-       push eax                 ; all four scratch registers are saved and restored
-       push ebx
-       push ecx
-       push edx
-       ;; eax = read pointer, ebx = write pointer, ecx = loop counter, edx = saved element count
-       mov eax, [ebp + 8]
-       mov ebx, [ebp + 12]     
-       mov ecx, [ebp + 16]
-        mov edx, ecx            ; keep the full count for the tail
-        shr ecx, 2              ; ecx = number of 4-element groups
-        jecxz .tail             ; fewer than four elements: go straight to the tail
-        emms   
-.mainloop:                      ; one pass handles four floats (two quadwords), interleaved for scheduling
-        movq mm0,[eax]          ; mm0 = x0,x1
-       add eax, byte 8
-        pfrsqrt mm1,mm0         ; reciprocal-sqrt seed for x0 (low element)
-       movq mm4,[eax]           ; mm4 = x2,x3
-       pswapd mm0,mm0           ; bring x1 into the low half
-       add eax, byte 8
-        pfrsqrt mm2,mm0         ; seed for x1
-       pswapd mm0,mm0           ; restore x0,x1 order
-        pfrsqrt mm5,mm4         ; seed for x2
-       pswapd mm4,mm4  
-       punpckldq mm1,mm2        ; mm1 = seed(x0),seed(x1)
-       pfrsqrt mm6,mm4          ; seed for x3
-       movq mm3,mm1             ; keep the unsquared seeds for the final step
-       pswapd mm4,mm4           ; restore x2,x3 order
-       pfmul mm1,mm1            ; seed squared, as input to pfrsqit1
-       punpckldq mm5,mm6        ; mm5 = seed(x2),seed(x3)
-       pfrsqit1 mm1,mm0         ; refinement step 1 for x0,x1
-       movq mm7,mm5    
-       pfrcpit2 mm1,mm3         ; refinement step 2: mm1 = result for x0,x1
-       pfmul mm5,mm5
-        movq [ebx],mm1
-       pfrsqit1 mm5,mm4         ; same two refinement steps for x2,x3
-       add ebx, byte 8
-       pfrcpit2 mm5,mm7        
-        movq [ebx],mm5
-       add ebx, byte 8 
-        dec ecx
-        jecxz .tail
-        jmp short .mainloop
-.tail:
-        mov ecx,edx
-        and ecx,3               ; ecx = elements left over (count mod 4)
-        jecxz .end
-.tailloop:                      ; one float per pass
-        movd mm0,[eax]
-       add eax, byte 4
-        pfrsqrt mm1,mm0         ; seed
-        movq mm2,mm1            ; keep the unsquared seed
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0
-        pfrcpit2 mm1,mm2
-        movd [ebx],mm1         
-       add ebx, byte 4
-       dec ecx
-       jecxz .end
-       jmp short .tailloop
-.end:  
-       emms                     ; clear MMX state before returning
-       pop edx
-       pop ecx
-       pop ebx
-       pop eax
-       leave
-       ret
-       
-
-
-proc inl0100_3dnow              ; c6/c12 nonbonded kernel: updates faction[], fshift[] and Vnb[gid] for every i particle
-%$nri          arg              ; number of outer-loop (i particle) iterations; assumed >= 1
-%$iinr         arg              ; pointer into iinr[] (i particle indices), advanced every outer iteration
-%$jindex       arg              ; pointer into jindex[]; jindex[n] and jindex[n+1] bound the j list of this i particle
-%$jjnr         arg              ; base of jjnr[] (j particle indices)
-%$shift                arg      ; pointer into shift[] (shift index per i particle), advanced every outer iteration
-%$shiftvec     arg              ; base of shiftvec[] (three floats per shift index)
-%$fshift       arg              ; base of fshift[] (three floats per shift index), incremented with the i force
-%$gid          arg              ; pointer into gid[] (group index per i particle), advanced every outer iteration
-%$pos          arg              ; base of pos[] (three floats per particle)
-%$faction      arg              ; base of faction[] (three floats per particle)
-%$type         arg              ; base of type[] (one dword per particle)
-%$ntype        arg              ; multiplied with type[ii] to form ntia
-%$nbfp         arg              ; base of nbfp[] (c6,c12 float pairs)
-%$Vnb          arg              ; base of Vnb[] (one float per group index)
-       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.ix          equ     8
-.iy          equ    12
-.iz          equ    16
-.vnbtot      equ    20           ; repeated (64bit) to fill 3dnow reg
-.c6          equ    28           ; repeated (64bit) to fill 3dnow reg
-.c12         equ    36           ; repeated (64bit) to fill 3dnow reg
-.six         equ    44           ; repeated (64bit) to fill 3dnow reg
-.twelve      equ    52           ; repeated (64bit) to fill 3dnow reg
-.ntia       equ    60
-.innerjjnr   equ    64
-.innerk      equ    68         
-.fix         equ    72           ; fix/fiy/fiz: force accumulated on the current i particle
-.fiy         equ    76
-.fiz        equ    80
-.dx1        equ    84            ; dx1/dy1/dz1: i-j distance vector, first j particle of the pair
-.dy1        equ    88
-.dz1        equ    92
-.dx2        equ    96            ; dx2/dy2/dz2: i-j distance vector, second j particle of the pair
-.dy2        equ   100
-.dz2        equ   104                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 108            ;   local stack space
-       femms
-       ; move data to local stack 
-       movq  mm0, [mm_six]
-       movq  mm1, [mm_twelve]
-       movq  [esp + .six], mm0
-       movq  [esp + .twelve], mm1
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$type]       ;  edx = base of type[]
-       mov   edx, [edx + ebx*4]        ;  edx = type[ii]
-       imul  edx, [ebp + %$ntype]      ;  edx = ntype*type[ii]
-       shl   edx, 1                    ;  two floats (c6,c12) per nbfp entry
-       mov   [esp + .ntia], edx        ;  ntia = 2*ntype*type[ii]
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm0, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm3, [eax + ebx*4 + 8]    ; can't use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx
-       pfadd mm1, mm3
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-                               
-       ;; clear total potential and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vnbtot], mm7
-       movq  [esp + .fix],    mm7
-       movd  [esp + .fiz],    mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]     ;  edi = base of faction[] for the rest of this outer iteration
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 2
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop                ;  at least two j particles: run the paired loop
-       jmp   .finish_inner
-.unroll_loop:
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best      
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrcp mm0, mm4                  ; lookup reciprocal seed 
-        pfrcp mm1, mm6
- 
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-                                       ; amd 3dnow N-R iteration to get full precision.
-        pfrcpit1 mm4,mm0                               
-        pfrcpit2 mm4,mm0       
-       ;; mm4 now contains invsq,
-       ;; do potential and fscal
-       movq  mm0, mm4
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm5, [esp + .c12]         ; mm5 = vnb12 = c12*rinvtwelve
-       pfmul mm4, [esp + .c6]          ; mm4 = vnb6 = c6*rinvsix
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]         ; mm4 = 6*vnb6
-
-       pfmul mm5, [esp + .twelve]      ; mm5 = 12*vnb12
-       pfsub mm5,mm4
-       pfmul mm0, mm5        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2           ; j forces get the opposite sign of the i force
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner: 
-       and [esp + .innerk], dword 1     ; low bit set = one unpaired j particle is left
-       jnz  .single_inner
-       jmp  .updateouterdata           
-.single_inner:
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5           ; movd cleared the upper half, so the second lane is zero
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm4=rsq
-       
-        pfrcp mm0,mm4
-        pfrcpit1 mm4,mm0                               
-        pfrcpit2 mm4,mm0       ;  mm4=invsq 
-       ;;  calculate potentials and scalar force
-       movq  mm0, mm4
-
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       pfsub mm5, mm4
-       pfmul mm0, mm5        ; mm0 is total fscal now
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnb[gid]
-
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx                 ; NOTE(review): with nri = 0 on entry this wraps to -1 and the loop does not stop
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms                   ; clear MMX state before returning
-       add esp, 108            ; release the local stack space
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc                 ; the endproc macro supplies leave/ret
-
-
-
-               
-               
-proc inl0110_3dnow
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg             
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg                             
-%$nsatoms       arg                    
-       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.shX        equ     8
-.shY         equ    12 
-.shZ        equ    16  
-.ix          equ    20
-.iy          equ    24
-.iz          equ    28 
-.vnbtot      equ    32           ; repeated (64bit) to fill 3dnow reg
-.c6          equ    40           ; repeated (64bit) to fill 3dnow reg
-.c12         equ    48           ; repeated (64bit) to fill 3dnow reg
-.six         equ    56           ; repeated (64bit) to fill 3dnow reg
-.twelve      equ    64           ; repeated (64bit) to fill 3dnow reg
-.ntia       equ    72  
-.innerjjnr0  equ    76
-.innerk0     equ    80         
-.innerjjnr   equ    84
-.innerk      equ    88 
-.fix         equ    92
-.fiy         equ    96
-.fiz        equ    100
-.dx1        equ    104
-.dy1        equ    108
-.dz1        equ    112
-.dx2        equ    116
-.dy2        equ    120
-.dz2        equ    124                                                         
-.nsvdwc      equ    128
-.nscoul      equ    132
-.nsvdw       equ    136
-.solnr      equ    140         
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 144            ;  local stack space
-       femms
-       movq  mm0, [mm_six]
-       movq  mm1, [mm_twelve]
-       movq  [esp + .six],    mm0
-       movq  [esp + .twelve], mm1
-       ;; assume we have at least one i particle - start directly              
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-       movq  [esp + .shX], mm0
-       movd  [esp + .shZ], mm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       add   [ebp + %$nsatoms], dword 12
-       mov   ecx, [eax]        
-       mov   edx, [eax + 4]
-       mov   eax, [eax + 8]    
-       sub   ecx, eax
-       sub   eax, edx
-       
-       mov   [esp + .nsvdwc], edx
-       mov   [esp + .nscoul], eax
-       mov   [esp + .nsvdw], ecx
-               
-       ;; clear potential
-       pxor  mm7,mm7
-       movq  [esp + .vnbtot], mm7
-       mov   [esp + .solnr],  ebx
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]
-       
-       mov   ecx, [esp + .nsvdwc]
-       cmp   ecx, dword 0
-       jnz   .mno_vdwc
-       jmp   .testvdw
-.mno_vdwc:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx        
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdwc_loop
-       jmp   .finish_vdwc_inner
-.unroll_vdwc_loop:     
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best      
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrcp mm0, mm4                  ; lookup reciprocal seed 
-        pfrcp mm1, mm6
- 
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-                                       ; amd 3dnow N-R iteration to get full precision.
-        pfrcpit1 mm4,mm0                               
-        pfrcpit2 mm4,mm0       
-       ;; mm4 now contains invsq,
-       ;; do potential and fscal
-       movq  mm0, mm4
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       pfsub mm5,mm4
-       pfmul mm0, mm5        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_vdwc_inner
-       jmp   .unroll_vdwc_loop
-.finish_vdwc_inner:    
-       and [esp + .innerk], dword 1
-       jnz  .single_vdwc_inner
-       jmp  .updateouterdata_vdwc              
-.single_vdwc_inner:    
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm4=rsq
-       
-        pfrcp mm0,mm4
-        pfrcpit1 mm4,mm0                               
-        pfrcpit2 mm4,mm0       ;  mm4=invsq 
-       ;;  calculate potentials and scalar force
-       movq  mm0, mm4
-
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       pfsub mm5, mm4
-       pfmul mm0, mm5        ; mm0 is total fscal now
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_vdwc: 
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdwc]
-       jz  .testvdw
-       jmp .mno_vdwc
-.testvdw
-       mov  ebx,  [esp + .nscoul]
-       add  [esp + .solnr], dword ebx
-
-       mov  ecx, [esp + .nsvdw]
-       cmp  ecx, byte 0
-       jnz  .mno_vdw
-       jmp  .last_mno
-.mno_vdw:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx        
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdw_loop
-       jmp   .finish_vdw_inner
-.unroll_vdw_loop:      
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best      
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrcp mm0, mm4                  ; lookup reciprocal seed 
-        pfrcp mm1, mm6
- 
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-                                       ; amd 3dnow N-R iteration to get full precision.
-        pfrcpit1 mm4,mm0                               
-        pfrcpit2 mm4,mm0       
-       ;; mm4 now contains invsq,
-       ;; do potential and fscal
-       movq  mm0, mm4
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       pfsub mm5,mm4
-       pfmul mm0, mm5        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_vdw_inner
-       jmp   .unroll_vdw_loop
-.finish_vdw_inner:     
-       and [esp + .innerk], dword 1
-       jnz  .single_vdw_inner
-       jmp  .updateouterdata_vdw               
-.single_vdw_inner:     
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm4=rsq
-       
-        pfrcp mm0,mm4
-        pfrcpit1 mm4,mm0                               
-        pfrcpit2 mm4,mm0       ;  mm4=invsq 
-       ;;  calculate potentials and scalar force
-       movq  mm0, mm4
-
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       pfsub mm5, mm4
-       pfmul mm0, mm5        ; mm0 is total fscal now
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_vdw:  
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdw]
-       jz  .last_mno
-       jmp .mno_vdw
-       
-.last_mno:     
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms
-       add esp, 144
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-;; inl0300_3dnow: 3DNow! nonbonded inner loop with tabulated dispersion + repulsion (VFtab scaled by c6/c12); no Coulomb term.
-;; For each of the nri i particles it walks that particle's j list, adds forces to faction[] and fshift[], and adds the potential to Vnb[gid].
-proc inl0300_3dnow
-%$nri          arg             ; outer-loop count; decremented per i particle until it reaches zero
-%$iinr         arg             ; pointer into the i-particle index list (advanced 4 bytes per outer iteration)
-%$jindex       arg             ; pointer into jindex[]; jindex[n]..jindex[n+1] delimit this i particle's range in jjnr[]
-%$jjnr         arg             ; base of the j-particle index list
-%$shift                arg     ; pointer into the shift index list (advanced 4 bytes per outer iteration)
-%$shiftvec     arg             ; base of the shift vectors (3 floats per entry)
-%$fshift       arg             ; base of the shift-force array (3 floats per entry)
-%$gid          arg             ; pointer into the group index list (advanced 4 bytes per outer iteration)
-%$pos          arg             ; base of the particle positions (3 floats per particle)
-%$faction      arg             ; base of the particle force array (3 floats per particle)
-%$type         arg             ; base of the per-particle type index array
-%$ntype        arg             ; multiplied with type[ii] to form the row offset ntia
-%$nbfp         arg             ; base of the c6/c12 parameter table (two floats per type pair)
-%$Vnb          arg             ; base of the potential accumulators, indexed by gid
-%$tabscale      arg            ; float; r*tabscale is the position in VFtab
-%$VFtab         arg            ; base of the potential/force table (8 floats per table point)
-       ;; stack offsets for local variables (the .iy/.fiy/.dy slots are only touched as the upper half of a movq)
-.is3         equ     0         ; 3*shift index
-.ii3         equ     4         ; 3*i particle index
-.ix          equ     8         ; shifted i position x (y follows in .iy)
-.iy          equ    12
-.iz          equ    16
-.vnbtot      equ    20           ; 64-bit slot: two partial potential sums, added together with pfacc at the end
-.c6          equ    28           ; 64-bit slot: c6 for 1st / 2nd j particle
-.c12         equ    36           ; 64-bit slot: c12 for 1st / 2nd j particle
-.two         equ    44           ; 64-bit slot: copy of [mm_two]
-.n1          equ    52           ; 64-bit slot: integer table index for 1st / 2nd j particle
-.tabscale    equ    60           ; tabscale repeated (64bit) to fill 3dnow reg
-.ntia       equ    68           ; 2*ntype*type[ii]
-.innerjjnr   equ    72          ; running pointer into jjnr[]
-.innerk      equ    76          ; remaining j particles minus 2 (negative when fewer than 2 are left)
-.fix         equ    80          ; i force accumulator x (y follows in .fiy)
-.fiy         equ    84
-.fiz        equ    88
-.dx1        equ    92           ; distance vector to 1st j particle
-.dy1        equ    96
-.dz1        equ    100
-.dx2        equ    104          ; distance vector to 2nd j particle
-.dy2        equ    108
-.dz2        equ    112                                         
-        push eax                ; save every general register used below
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 116            ;   local stack space (matches the offsets above: .dz2 + 4)
-       femms                    ; fast clear of the MMX/x87 state before using mm registers
-       ; move data to local stack 
-       movq  mm0, [mm_two]      ; mm_two is defined outside this procedure (presumably 2.0 in both halves - confirm)
-       movd  mm3, [ebp + %$tabscale]
-       movq  [esp + .two],    mm0
-       punpckldq mm3,mm3        ; duplicate tabscale into both halves
-       movq  [esp + .tabscale], mm3    
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        ;  edx = type[ii]
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx        ;  ntia = 2*ntype*type[ii]
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm0, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm3, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx
-       pfadd mm1, mm3
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-                               
-       ;; clear total potential and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vnbtot], mm7
-       movq  [esp + .fix],    mm7
-       movd  [esp + .fiz],    mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]         ;  (esi is reloaded inside both inner-loop paths before it is used)
-       mov   edi, [ebp + %$faction]     ;  edi = base of faction[] for the rest of this outer iteration
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 2
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms minus 2 (mov leaves the flags from sub intact)
-       jge   .unroll_loop                ;  at least two j particles: take the paired loop
-       jmp   .finish_inner
-.unroll_loop:
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0                  ; r = rsq * invsqrt
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1                   ; integer part of rt for both pairs
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       ; dispersion table
-       mov ecx, [esp + .n1]
-       shl ecx, 3      ; 8 floats per table point: +0..+12 dispersion, +16..+28 repulsion
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]        ; table index of the second pair
-       shl ecx, 3
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       movq mm3, mm7   ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table
-       mov ecx, [esp + .n1]
-       shl ecx, 3
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 3
-       punpckldq mm4, [edx + ecx*4 + 16]
-       punpckldq mm5, [edx + ecx*4 + 20]
-       punpckldq mm6, [edx + ecx*4 + 24]
-       punpckldq mm7, [edx + ecx*4 + 28]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12 (kept in mm5 until the second vnbtot update below)
-       pfadd mm3, mm7  ; total fscal fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm1, [esp + .tabscale]
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0       ; mm0 = fscal of 1st pair in both halves
-       punpckhdq mm1,mm1       ; mm1 = fscal of 2nd pair in both halves
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_inner      ; negative: fewer than two j particles remain
-       jmp   .unroll_loop
-.finish_inner: 
-       and [esp + .innerk], dword 1     ; bit 0 set: exactly one j particle is left over
-       jnz  .single_inner
-       jmp  .updateouterdata           
-.single_inner:
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5           ; movd cleared the upper half, so the unused slot is zero
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm4=rsq (lower half)
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm0=invsqrt
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 3      ; 8 floats per table point; ecx is reused for the repulsion loads below
-       ; dispersion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       movq mm3, mm7   ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm0, [esp + .tabscale]    ; here tabscale is applied to mm0 (the unrolled loop scales mm1); same product
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;; update vnbtot
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7     ;  movd stores only the low dword (the z component)
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnb[gid]
-
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms                    ; leave the MMX state clean again before returning
-       add esp, 116             ; release the local stack space reserved at entry
-       pop edi                  ; restore registers in reverse push order
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-                       
-       
-proc inl0310_3dnow             ; 3DNow! nonbonded inner loop: table-interpolated dispersion + repulsion only (no Coulomb term), per-entry atom counts taken from nsatoms
-%$nri          arg             ; number of outer (i entry) iterations; decremented in place, must be >= 1
-%$iinr         arg             ; pointer into the i-index list, advanced by 4 bytes per outer iteration
-%$jindex       arg             ; pointer into jindex[]; jindex[n] and jindex[n+1] bound the j list
-%$jjnr         arg             ; base of the j-particle index list
-%$shift                arg             ; pointer into the shift-index list, advanced by 4 bytes per outer iteration
-%$shiftvec     arg             ; base of the shift vectors (3 floats per entry)
-%$fshift       arg             ; base of the shift-force array that is incremented here
-%$gid          arg             ; pointer into the group-index list, advanced by 4 bytes per outer iteration
-%$pos          arg             ; base of the coordinate array (3 floats per atom)
-%$faction      arg             ; base of the force array (3 floats per atom), updated for i and j atoms
-%$type         arg             ; base of the per-atom type array (32-bit entries)
-%$ntype        arg             ; value multiplied with type[i] to form the nbfp row offset
-%$nbfp         arg             ; base of the c6/c12 table (two floats per type pair)
-%$Vnb          arg             ; base of the potential accumulators, indexed by gid
-%$tabscale      arg            ; float: scale from r to table coordinate
-%$VFtab         arg            ; base of the table: 8 floats per point (4 dispersion, then 4 repulsion)
-%$nsatoms       arg            ; pointer to 3 ints per i entry, advanced by 12 bytes per outer iteration
-       ;; stack offsets (in bytes from esp) for local variables; 152 bytes are reserved below
-.is3         equ     0         ; 3 * shift index
-.ii3         equ     4         ; 3 * index of the current i atom
-.shX        equ     8          ; shift vector x/y/z
-.shY         equ    12 
-.shZ        equ    16  
-.ix          equ    20         ; shifted position of the current i atom
-.iy          equ    24
-.iz          equ    28 
-.vnbtot      equ    32           ; repeated (64bit) to fill 3dnow reg
-.c6          equ    40           ; repeated (64bit) to fill 3dnow reg
-.c12         equ    48           ; repeated (64bit) to fill 3dnow reg
-.two         equ    56           ; repeated (64bit) to fill 3dnow reg
-.n1          equ    64           ; repeated (64bit) to fill 3dnow reg
-.tabscale    equ    72           ; repeated (64bit) to fill 3dnow reg
-.ntia       equ    80          ; 2 * ntype * type[i]: offset of the i type row in nbfp
-.innerjjnr0  equ    84         ; saved start of the j list, reused for every atom of this entry
-.innerk0     equ    88         ; saved length of the j list
-.innerjjnr   equ    92         ; working pointer into the j list
-.innerk      equ    96         ; working j counter
-.fix         equ    100        ; force accumulated on the current i atom
-.fiy         equ    104
-.fiz        equ    108
-.dx1        equ    112         ; distance vector of the first pair in the unrolled loop
-.dy1        equ    116
-.dz1        equ    120
-.dx2        equ    124         ; distance vector of the second pair in the unrolled loop
-.dy2        equ    128
-.dz2        equ    132                                                         
-.nsvdwc      equ    136        ; atoms left for the first (.mno_vdwc) loop
-.nscoul      equ    140        ; atoms that are skipped at .testvdw
-.nsvdw       equ    144        ; atoms left for the second (.mno_vdw) loop
-.solnr      equ    148         ; index of the next atom of this entry to process
-        push eax               ; save the registers used below; restored in reverse order at .end
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 152            ;  local stack space
-       femms
-       movq  mm0, [mm_two]     ; mm_two is defined outside this block - presumably packed 2.0, TODO confirm
-       movd  mm3, [ebp + %$tabscale]
-       movq  [esp + .two],    mm0
-       punpckldq mm3,mm3       ; duplicate tabscale into both halves
-       movq  [esp + .tabscale], mm3    
-       
-       ;; assume we have at least one i particle - start directly              
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-       movq  [esp + .shX], mm0
-       movd  [esp + .shZ], mm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]    ;  eax = pointer to this entry's three counts
-       add   [ebp + %$nsatoms], dword 12
-       mov   ecx, [eax]                ;  ecx = nsatoms[0]
-       mov   edx, [eax + 4]            ;  edx = nsatoms[1]
-       mov   eax, [eax + 8]            ;  eax = nsatoms[2]
-       sub   ecx, eax                  ;  ecx = nsatoms[0] - nsatoms[2]
-       sub   eax, edx                  ;  eax = nsatoms[2] - nsatoms[1]
-       
-       mov   [esp + .nsvdwc], edx
-       mov   [esp + .nscoul], eax
-       mov   [esp + .nsvdw], ecx
-               
-       ;; clear potential
-       pxor  mm7,mm7
-       movq  [esp + .vnbtot], mm7
-       mov   [esp + .solnr],  ebx       ;  first atom of this entry is ii
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]
-       
-       mov   ecx, [esp + .nsvdwc]
-       cmp   ecx, dword 0               ;  skip the first atom loop when its count is zero
-       jnz   .mno_vdwc
-       jmp   .testvdw
-.mno_vdwc:
-       mov   ebx,  [esp + .solnr]       ;  ebx = index of the current i atom
-       inc   dword [esp + .solnr]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx         ;  ntia = 2*ntype*type[i]
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]          ;  64-bit operand: only the low half (z) is stored below
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]   ;  restart the j list for this i atom
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdwc_loop           ;  flags are from the sub: at least two j atoms remain
-       jmp   .finish_vdwc_inner
-.unroll_vdwc_loop:     
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0                  ;  r = rsq * invsqrt
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movq [esp + .n1], mm4           ;  integer table indices of both pairs
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       ; dispersion table
-       mov ecx, [esp + .n1]
-       shl ecx, 3      ; 8 floats per table point
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]        ; table index of the second pair
-       shl ecx, 3
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       movq mm3, mm7   ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table
-       mov ecx, [esp + .n1]
-       shl ecx, 3
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 3
-       punpckldq mm4, [edx + ecx*4 + 16]
-       punpckldq mm5, [edx + ecx*4 + 20]
-       punpckldq mm6, [edx + ecx*4 + 24]
-       punpckldq mm7, [edx + ecx*4 + 28]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm1, [esp + .tabscale]
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0       ; mm0 = fscal of the first pair in both halves
-       punpckhdq mm1,mm1       ; mm1 = fscal of the second pair in both halves
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-               
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_vdwc_inner
-       jmp   .unroll_vdwc_loop
-.finish_vdwc_inner:    
-       and [esp + .innerk], dword 1    ; low bit set means one unpaired j atom is left
-       jnz  .single_vdwc_inner
-       jmp  .updateouterdata_vdwc              
-.single_vdwc_inner:    
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm4=rsq
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm0=invsqrt
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 3
-       ; dispersion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       movq mm3, mm7   ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm0, [esp + .tabscale]
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;; update vnbtot
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_vdwc: 
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdwc]
-       jz  .testvdw
-       jmp .mno_vdwc
-.testvdw                       ; NOTE(review): unlike the other labels this one has no trailing ':'
-       mov  ebx,  [esp + .nscoul]
-       add  [esp + .solnr], dword ebx  ; step solnr past the nscoul atoms, which get no interaction here; NOTE(review): size specifier on a register operand
-
-       mov  ecx, [esp + .nsvdw]
-       cmp  ecx, byte 0                ; skip the second atom loop when its count is zero
-       jnz  .mno_vdw
-       jmp  .last_mno
-.mno_vdw:
-       mov   ebx,  [esp + .solnr]      ; same per-atom setup as at .mno_vdwc
-       inc   dword [esp + .solnr]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx        
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdw_loop
-       jmp   .finish_vdw_inner
-.unroll_vdw_loop:      
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       ; dispersion table
-       mov ecx, [esp + .n1]
-       shl ecx, 3
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 3
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       movq mm3, mm7   ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table
-       mov ecx, [esp + .n1]
-       shl ecx, 3
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 3
-       punpckldq mm4, [edx + ecx*4 + 16]
-       punpckldq mm5, [edx + ecx*4 + 20]
-       punpckldq mm6, [edx + ecx*4 + 24]
-       punpckldq mm7, [edx + ecx*4 + 28]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm1, [esp + .tabscale]
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_vdw_inner
-       jmp   .unroll_vdw_loop
-.finish_vdw_inner:     
-       and [esp + .innerk], dword 1
-       jnz  .single_vdw_inner
-       jmp  .updateouterdata_vdw               
-.single_vdw_inner:     
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm4=rsq
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm0=invsqrt
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 3
-       ; dispersion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       movq mm3, mm7   ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm0, [esp + .tabscale]
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;; update vnbtot
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_vdw:  
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdw]
-       jz  .last_mno
-       jmp .mno_vdw
-       
-.last_mno:     
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnb[gid]
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms
-       add esp, 152            ; release the local stack space reserved in the prologue
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-proc inl1000_3dnow             ; Coulomb-only inner loop (3DNow!): adds vcoul to Vc[gid] and forces to faction[]/fshift[]
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg             
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-       ;; stack offsets (bytes from esp) for local variables; 80 bytes are reserved by the sub esp below
-.is3         equ     0
-.ii3         equ     4
-.ix          equ     8
-.iy          equ    12
-.iz          equ    16
-.iq          equ    20         ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    28           ; 64bit: two partial sums, added together with pfacc in .updateouterdata
-.innerjjnr   equ    36
-.innerk      equ    40         
-.fix         equ    44
-.fiy         equ    48
-.fiz        equ    52
-.dx1        equ    56
-.dy1        equ    60
-.dz1        equ    64
-.dx2        equ    68
-.dy2        equ    72
-.dz2        equ    76                                                                  
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 80             ;  80 bytes local stack space
-       femms
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]      ;  64-bit memory operand; only the low dword (facel) matters after the punpckldq
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm0, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm3, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx
-       pfadd mm1, mm3
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-                               
-       ;; clear vctot and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 2                ;  sets the flags tested by jge below (mov leaves them alone)
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms, minus 2
-       jge   .unroll_loop                ;  at least two j particles: run the paired loop
-       jmp   .finish_inner
-.unroll_loop:  
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-       movd mm7, [ecx + ebx*4]          ; charge[jnr2] 
-       punpckldq mm3,mm7                ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       movq mm1,mm0
-       pfmul mm0,mm0
-       ;; mm0 now contains invsq, and mm1 invsqrt
-       ;; do potential and fscal
-       prefetchw [esp + .dx1]  ;  prefetch (for write) the cache line at the dx1 stack slot
-       
-       pfmul mm3,mm1           ;  mm3 has both vcoul (qq*invsqrt)
-       pfmul mm0,mm3           ;  mm0 has both fscal (vcoul*invsq)
-
-       ;; update vctot
-
-       pfadd mm3, [esp + .vctot]      ; add the earlier value 
-       movq [esp + .vctot], mm3       ;  store the sum
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0       ;  mm0 = fscal of pair 1 in both halves
-       punpckhdq mm1,mm1       ;  mm1 = fscal of pair 2 in both halves
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner: 
-       and [esp + .innerk], dword 1     ;  innerk is -2 or -1 here; bit 0 set means one j particle is left
-       jnz  .single_inner
-       jmp  .updateouterdata           
-.single_inner: 
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]
-       movd mm6, [esp + .iq]
-       movd mm7, [ecx + eax*4]
-       pfmul mm6, mm7          ;  mm6=qq
-       
-       lea   eax, [eax + eax*2]
-       
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm2, [esi + eax*4]
-       movd  mm3, [esi + eax*4 + 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq  [esp + .dx1], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dz1], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfacc mm0, mm1          ;  mm0=rsq
-       
-        pfrsqrt mm1,mm0
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       movq  mm4, mm1
-       pfmul mm4, mm4          ;  mm4=invsq
-       ;;  calculate potential and scalar force
-       pfmul mm6, mm1          ; mm6=vcoul
-       pfmul mm4, mm6          ;  mm4=fscalar
-       ;;  update vctot
-       movq mm5, [esp + .vctot]
-       pfadd mm5, mm6
-       movq [esp + .vctot], mm5
-       ;;  spread fscalar to both positions
-       punpckldq mm4,mm4
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm0,  [esp + .dx1]
-       movd mm1,  [esp + .dz1]
-       pfmul mm0, mm4
-       pfmul mm1, mm4
-       ;; update i particle force
-       movq mm2,  [esp + .fix]
-       movd mm3,  [esp + .fiz]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fix], mm2
-       movd [esp + .fiz], mm3
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax *4+ 8]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-       ;;  done!
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]        ;  64-bit read; only the low dword is stored by the movd below
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]        ;  64-bit read; only the low dword is stored by the movd below
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms
-       add esp, 80
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-proc inl1010_3dnow             ; Coulomb-only inner loop (3DNow!) that repeats the j loop for nscoul consecutive atoms per i entry
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg             
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg
-%$nsatoms       arg                    
-       ;; stack offsets (bytes from esp) for local variables; 108 bytes are reserved by the sub esp below
-.is3         equ     0
-.ii3         equ     4
-.shX        equ     8
-.shY         equ    12 
-.shZ        equ    16  
-.ix          equ    20
-.iy          equ    24
-.iz          equ    28 
-.iq          equ    32         ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    40           ; 64bit: two partial sums, added together with pfacc in .last_mno
-.innerjjnr0  equ    48
-.innerk0     equ    52         
-.innerjjnr   equ    56
-.innerk      equ    60         
-.fix         equ    64
-.fiy         equ    68
-.fiz        equ    72
-.dx1        equ    76
-.dy1        equ    80
-.dz1        equ    84
-.dx2        equ    88
-.dy2        equ    92
-.dz2        equ    96                                                                  
-.nscoul      equ    100
-.solnr      equ    104         
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 108            ;  local stack space
-       femms
-       ;; assume we have at least one i particle - start directly      
-       add   [ebp + %$nsatoms], dword 8        ;  point at the int at byte offset 8 of the first 12-byte nsatoms entry
-
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-       movq  [esp + .shX], mm0
-       movd  [esp + .shZ], mm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       mov   ecx, [eax]                ;  ecx = atom count for the coulomb loop of this entry
-       add   [ebp + %$nsatoms], dword 12       ;  step 12 bytes to the same field of the next entry
-       mov   [esp + .nscoul], ecx
-               
-       ;; clear potential
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       mov   [esp + .solnr], ebx       ;  solnr starts at ii and is incremented once per .mno_coul pass
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]
-
-       mov   ecx, [esp + .nscoul]
-       cmp   ecx, dword 0
-       jnz   .mno_coul
-       jmp   .last_mno                    ;  nscoul == 0: skip the force loop, only Vc[gid] is touched
-.mno_coul:                             
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]      ;  64-bit memory operand; only the low dword (facel) matters after the punpckldq
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]         ;  64-bit read; only the low dword is stored by the movd below
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]    ;  restart the j list from its saved beginning for this atom
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2                ;  sets the flags tested by jge below (mov leaves them alone)
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms, minus 2
-       jge   .unroll_coul_loop
-       jmp   .finish_coul_inner
-.unroll_coul_loop:     
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-       movd mm7, [ecx + ebx*4]          ; charge[jnr2] 
-       punpckldq mm3,mm7                ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       movq mm1,mm0
-       pfmul mm0,mm0
-       ;; mm0 now contains invsq, and mm1 invsqrt
-       ;; do potential and fscal
-       prefetchw [esp + .dx1]  ;  prefetch (for write) the cache line at the dx1 stack slot
-       
-       pfmul mm3,mm1           ;  mm3 has both vcoul (qq*invsqrt)
-       pfmul mm0,mm3           ;  mm0 has both fscal (vcoul*invsq)
-
-       ;; update vctot
-
-       pfadd mm3, [esp + .vctot]      ; add the earlier value 
-       movq [esp + .vctot], mm3       ;  store the sum
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0       ;  mm0 = fscal of pair 1 in both halves
-       punpckhdq mm1,mm1       ;  mm1 = fscal of pair 2 in both halves
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_coul_inner
-       jmp   .unroll_coul_loop
-.finish_coul_inner:    
-       and [esp + .innerk], dword 1     ;  innerk is -2 or -1 here; bit 0 set means one j particle is left
-       jnz  .single_coul_inner
-       jmp  .updateouterdata_coul              
-.single_coul_inner:    
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]
-       movd mm6, [esp + .iq]
-       movd mm7, [ecx + eax*4]
-       pfmul mm6, mm7          ;  mm6=qq
-       
-       lea   eax, [eax + eax*2]
-       
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm2, [esi + eax*4]
-       movd  mm3, [esi + eax*4 + 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq  [esp + .dx1], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dz1], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfacc mm0, mm1          ;  mm0=rsq
-       
-        pfrsqrt mm1,mm0
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       movq  mm4, mm1
-       pfmul mm4, mm4          ;  mm4=invsq
-       ;;  calculate potential and scalar force
-       pfmul mm6, mm1          ; mm6=vcoul
-       pfmul mm4, mm6          ;  mm4=fscalar
-       ;;  update vctot
-       movq mm5, [esp + .vctot]
-       pfadd mm5, mm6
-       movq [esp + .vctot], mm5
-       ;;  spread fscalar to both positions
-       punpckldq mm4,mm4
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm0,  [esp + .dx1]
-       movd mm1,  [esp + .dz1]
-       pfmul mm0, mm4
-       pfmul mm1, mm4
-       ;; update i particle force
-       movq mm2,  [esp + .fix]
-       movd mm3,  [esp + .fiz]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fix], mm2
-       movd [esp + .fiz], mm3
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax *4+ 8]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-       ;;  done!
-.updateouterdata_coul: 
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]        ;  64-bit read; only the low dword is stored by the movd below
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]        ;  64-bit read; only the low dword is stored by the movd below
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nscoul]      ;  one atom of this entry done
-       jz  .last_mno
-       jmp .mno_coul
-.last_mno:     
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms
-       add esp, 108
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-                       
-proc inl1020_3dnow             ; Coulomb-only inner loop (3DNow!) for i entries of three consecutive atoms (O, H1, H2) against single j atoms
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg             
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-                       ;; stack offsets (bytes from esp) for local variables; 148 bytes are reserved by the sub esp below
-.is3         equ     0
-.ii3         equ     4
-.ixO         equ     8
-.iyO         equ    12
-.izO         equ    16 
-.ixH         equ    20          ; 64bit: H1 in low half, H2 in high half
-.iyH         equ    28          ; 64bit: H1 in low half, H2 in high half
-.izH         equ    36          ; 64bit: H1 in low half, H2 in high half
-.iqO         equ    44         ; 64bit: facel*charge[ii] in low half, zero in high half
-.iqH         equ    52         ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    60          ; 64bit: two partial sums, added together with pfacc in .updateouterdata
-.innerjjnr   equ    68
-.innerk      equ    72         
-.fixO        equ    76 
-.fiyO        equ    80
-.fizO        equ    84
-.fixH        equ    88          ; 64bit: H1 in low half, H2 in high half
-.fiyH        equ    96          ; 64bit: H1 in low half, H2 in high half
-.fizH        equ    104         ; 64bit: H1 in low half, H2 in high half
-.dxO        equ    112
-.dyO        equ    116
-.dzO        equ    120
-.dxH        equ    124         ; 64bit: H1 in low half, H2 in high half
-.dyH        equ    132         ; 64bit: H1 in low half, H2 in high half
-.dzH        equ    140         ; 64bit: H1 in low half, H2 in high half
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 148            ;   local stack space
-       femms
-       ;; assume we have at least one i particle - start directly      
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm1, [ebp + %$facel]      ;  mm1 = facel (low half), zero in high half
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii0]
-       pfmul mm2, mm1          
-       movq  [esp + .iqO], mm2         ;  iqO = facel*charge[ii] (high half stays zero)
-       
-       movd  mm2, [edx + ebx*4 + 4]    ;  mm2=charge[ii0+1]
-       pfmul mm2, mm1
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iqH], mm2         ;  iqH = facel*charge[ii0+1]
-.outer:                                ;  iqO/iqH above are computed once, from the first iinr entry only, and reused for every i entry
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm5, [eax + ebx*4]        ;  move shX/shY to mm5 and shZ to mm6.
-       movd  mm6, [eax + ebx*4 + 8]
-       movq  mm0, mm5
-       movq  mm1, mm5
-       movq  mm2, mm6
-       punpckldq mm0,mm0               ; also expand shX,Y,Z in mm0--mm2.
-       punpckhdq mm1,mm1
-       punpckldq mm2,mm2               
-       
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm5, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm7, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx         ; (use mm7 as temp. storage for iz.)
-       pfadd mm6, mm7
-       movq  [esp + .ixO], mm5 
-       movq  [esp + .izO], mm6         ;  64-bit store: also zeroes the low dword of ixH, which is rewritten below
-
-       movd  mm3, [eax + ebx*4 + 12]
-       movd  mm4, [eax + ebx*4 + 16]
-       movd  mm5, [eax + ebx*4 + 20]
-       punpckldq  mm3, [eax + ebx*4 + 24]
-       punpckldq  mm4, [eax + ebx*4 + 28]
-       punpckldq  mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-       
-       pfadd mm0, mm3
-       pfadd mm1, mm4
-       pfadd mm2, mm5          
-       movq [esp + .ixH], mm0  
-       movq [esp + .iyH], mm1  
-       movq [esp + .izH], mm2  
-                                       
-       ;; clear vctot and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       movq  [esp + .fixO],   mm7
-       movd  [esp + .fizO],   mm7
-       movq  [esp + .fixH],   mm7
-       movq  [esp + .fiyH],   mm7
-       movq  [esp + .fizH],   mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-.inner_loop:   
-       ;; a single j particle iteration here - this routine has no unrolled (paired) loop.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-        add   [esp + .innerjjnr], dword 4 ; advance pointer 
-       ;prefetch [ecx + 16]       ; prefetch data - trial and error says 16 is best
-
-       mov ecx, [ebp + %$charge]
-       movd mm7, [ecx + eax*4]
-       punpckldq mm7,mm7
-       movq mm6,mm7
-       pfmul mm6, [esp + .iqO]
-       pfmul mm7, [esp + .iqH] ;  mm6=qqO, mm7=qqH
-       
-       lea   eax, [eax + eax*2]
-       
-       movq  mm0, [esi + eax*4]
-       movd  mm1, [esi + eax*4 + 8]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO] ;  64-bit read: high half picks up the low dword of ixH; only the low half is used
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  low half of mm0=rsqO
-       
-       punpckldq mm2, mm2      ;  mm2-mm4 were already expanded above; these repeat it
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       movq  mm4, mm1
-       pfmul mm4, mm4          ;  mm4=invsq
-       ;;  calculate potential and scalar force
-       pfmul mm6, mm1          ; mm6=vcoul
-       pfmul mm4, mm6          ;  mm4=fscalar
-
-       pfrsqrt mm5, mm3        ;  seed for the low half (H1)
-       pswapd mm3,mm3
-       pfrsqrt mm2, mm3        ;  seed for the former high half (H2)
-       pswapd mm3,mm3          ;  swap back
-       punpckldq mm5,mm2       ;  seeds are in mm5 now, and rsq in mm3.
-
-       movq mm2, mm5
-       pfmul mm5,mm5
-        pfrsqit1 mm5,mm3                               
-        pfrcpit2 mm5,mm2       ;  mm5=invsqrt
-       movq mm3,mm5
-       pfmul mm3,mm3           ; mm3=invsq
-       pfmul mm7, mm5          ;  mm7=vcoul
-       pfmul mm3, mm7          ;  mm3=fscal for the two H's.
-
-       ;;  update vctot
-       pfadd mm7, mm6
-       pfadd mm7, [esp + .vctot]
-       movq [esp + .vctot], mm7
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm4,mm4
-       ;;  calc vectorial force for O
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm4
-       pfmul mm1, mm4
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm3
-       pfmul mm6, mm3
-       pfmul mm7, mm3
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax*4 + 8]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-       
-       ;;  done  - one more?
-       dec dword [esp + .innerk]       ;  no zero-count check before the loop: innerk must be >= 1 on entry
-       jz  .updateouterdata
-       jmp .inner_loop
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment iO force 
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]       ;  64-bit read; only the low dword is stored by the movd below
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       movq  mm0, [esp + .fixH]
-       movq  mm3, [esp + .fiyH]
-       movq  mm1, [esp + .fizH]
-       movq  mm2, mm0
-       punpckldq mm0, mm3      ;  mm0(l)=fxH1, mm0(h)=fyH1
-       punpckhdq mm2, mm3      ;  mm2(l)=fxH2, mm2(h)=fyH2
-       movq mm3, mm1
-       pswapd mm3,mm3          
-       ;;  low half of mm1 is fzH1
-       ;;  low half of mm3 is fzH2
-       
-       movq  mm6, [edi + ecx*4 + 12]       ;  increment iH1 force
-       movd  mm7, [edi + ecx*4 + 20]   
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       movq  [edi + ecx*4 + 12],  mm6
-       movd  [edi + ecx*4 + 20],  mm7
-       
-       movq  mm6, [edi + ecx*4 + 24]       ;  increment iH2 force
-       movd  mm7, [edi + ecx*4 + 32]   
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [edi + ecx*4 + 24],  mm6
-       movd  [edi + ecx*4 + 32],  mm7
-
-       
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]       ;  64-bit read; only the low dword is stored by the movd below
-       pfadd mm6, mm0                 ;  add the H1 and H2 forces as well
-       pfadd mm7, mm1
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-       
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       ;; finish if last
-       dec dword [ebp + %$nri]
-       jz  .end
-       ;;  not last, iterate once more!
-       jmp .outer
-.end:
-       femms
-       add esp, 148
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-proc inl1030_3dnow                     ; Coulomb-only 3DNow! loop: one 3-site (O,H1,H2) i molecule against 3-site j molecules
-%$nri          arg                     ; outer-iteration count; decremented in place, loop ends at zero
-%$iinr         arg                     ; pointer into iinr[]; advanced 4 bytes per outer iteration
-%$jindex       arg                     ; pointer into jindex[]; [n] and [n+1] bound the j list
-%$jjnr         arg                     ; base of jjnr[] (j atom numbers)
-%$shift                arg             ; pointer into shift[]; advanced 4 bytes per outer iteration
-%$shiftvec     arg                     ; base of shiftvec[], indexed by 3*shift[n]
-%$fshift       arg                     ; base of fshift[], indexed by 3*shift[n]
-%$gid          arg                     ; pointer into gid[]; advanced 4 bytes per outer iteration
-%$pos          arg                     ; base of pos[] (3 floats per atom)
-%$faction      arg                     ; base of faction[] (3 floats per atom), updated in place
-%$charge       arg                     ; base of charge[]
-%$facel                arg             ; read directly from the argument slot with movd (32-bit value)
-%$Vc           arg                     ; base of Vc[], indexed by the value read from gid[]
-                       ;; stack offsets for local variables (the 156 bytes reserved by "sub esp" below)
-.is3         equ     0
-.ii3         equ     4
-.ixO         equ     8
-.iyO         equ    12
-.izO         equ    16 
-.ixH         equ    20          ; 64-bit slot: H1 in low half, H2 in high half
-.iyH         equ    28          ; 64-bit slot: H1 in low half, H2 in high half
-.izH         equ    36          ; 64-bit slot: H1 in low half, H2 in high half
-.qqOO        equ    44         ; 64-bit slot, but only the low half is ever loaded (movd)
-.qqOH        equ    52         ; repeated (64bit) to fill 3dnow reg
-.qqHH        equ    60         ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    68          ; 64-bit slot: two partial sums, added together at the end
-.innerjjnr   equ    76
-.innerk      equ    80         
-.fixO        equ    84 
-.fiyO        equ    88
-.fizO        equ    92
-.fixH        equ    96          ; 64-bit slot: H1 in low half, H2 in high half
-.fiyH        equ    104         ; 64-bit slot: H1 in low half, H2 in high half
-.fizH        equ    112         ; 64-bit slot: H1 in low half, H2 in high half
-.dxO        equ    120
-.dyO        equ    124
-.dzO        equ    128
-.dxH        equ    132         ; 64-bit slot: H1 in low half, H2 in high half
-.dyH        equ    140         ; 64-bit slot: H1 in low half, H2 in high half
-.dzH        equ    148         ; 64-bit slot: H1 in low half, H2 in high half
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 156            ;   local stack space (.dzH at 148 + 8 bytes)
-       femms                   ;   enter 3DNow!/MMX state
-       ;; assume we have at least one i particle - start directly      
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii (first i atom; pointer is not advanced here)
-
-       mov   edx, [ebp + %$charge]
-       movd  mm1, [ebp + %$facel]      ;  mm1=facel
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii] (O)
-       movd  mm3, [edx + ebx*4 + 4]    ;  mm3=charge[ii+1] (H)
-       movq  mm4, mm2  
-       pfmul mm4, mm1                  ; mm4=qO*facel
-       movq  mm6, mm3
-       pfmul mm6, mm1                  ; mm6=qH*facel
-       movq  mm5, mm4                  ; mm5=qO*facel
-       pfmul mm4, mm2                  ; mm4=qqOO*facel
-       pfmul mm5, mm3                  ; mm5=qqOH*facel
-       pfmul mm6, mm3                  ; mm6=qqHH*facel
-       punpckldq mm5,mm5               ;  spread to both halves
-       punpckldq mm6,mm6               ;  spread to both halves
-       movq  [esp + .qqOO], mm4        ;  charge products are computed once, before the outer loop
-       movq  [esp + .qqOH], mm5
-       movq  [esp + .qqHH], mm6
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm5, [eax + ebx*4]        ;  move shX/shY to mm5 and shZ to mm6.
-       movd  mm6, [eax + ebx*4 + 8]
-       movq  mm0, mm5
-       movq  mm1, mm5
-       movq  mm2, mm6
-       punpckldq mm0,mm0               ; also expand shX,Y,Z in mm0--mm2.
-       punpckhdq mm1,mm1
-       punpckldq mm2,mm2               
-       
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       pfadd mm5, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm7, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx         ; (use mm7 as temp. storage for iz.)
-       pfadd mm6, mm7
-       movq  [esp + .ixO], mm5 
-       movq  [esp + .izO], mm6         ; 8-byte store also touches the low half of .ixH, which is rewritten below
-
-       movd  mm3, [eax + ebx*4 + 12]
-       movd  mm4, [eax + ebx*4 + 16]
-       movd  mm5, [eax + ebx*4 + 20]
-       punpckldq  mm3, [eax + ebx*4 + 24]
-       punpckldq  mm4, [eax + ebx*4 + 28]
-       punpckldq  mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-       
-       pfadd mm0, mm3                  ; add the shift vector to both H positions
-       pfadd mm1, mm4
-       pfadd mm2, mm5          
-       movq [esp + .ixH], mm0  
-       movq [esp + .iyH], mm1  
-       movq [esp + .izH], mm2  
-
-       ;; clear vctot and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       movq  [esp + .fixO],  mm7       ; clears fixO and fiyO
-       movq  [esp + .fizO],  mm7       ; clears fizO (and the low half of fixH, cleared again below)
-       movq  [esp + .fixH],  mm7
-       movq  [esp + .fiyH],  mm7
-       movq  [esp + .fizH],  mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]         ;  esi = base of pos[] for the whole inner loop
-       mov   edi, [ebp + %$faction]     ;  edi = base of faction[] for the inner loop and the i update
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-.inner_loop:
-       ;; one jjnr entry per iteration (no unrolling here): all three i atoms against jO, then jH1, then jH2.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-        add   [esp + .innerjjnr], dword 4 ; advance pointer 
-
-       movd  mm6, [esp + .qqOO]        ; iO-jO charge product, low half only
-       movq  mm7, [esp + .qqOH]        ; iH1/iH2-jO charge product in both halves
-
-       lea   eax, [eax + eax*2]        ; eax = 3*jnr
-       movq  mm0, [esi + eax*4]        ; mm0 = jx,jy of j O
-       movd  mm1, [esi + eax*4 + 8]    ; mm1 = jz of j O
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2               ; mm2 = jx in both halves
-       punpckhdq mm3,mm3               ; mm3 = jy in both halves
-       punpckldq mm4,mm4               ; mm4 = jz in both halves
-       
-       pfsubr mm0, [esp + .ixO]        ; dx,dy = iO - j
-       pfsubr mm1, [esp + .izO]        ; dz in low half; the high half is never used
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm0          ;  low half = dx*dx+dy*dy
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2      ;  these three repeat the spreading already done above and change nothing
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-
-        pfrsqrt mm1,mm0        ;  inverse square root seed for rsqO
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0       ;  refinement step (same pfrsqit1/pfrcpit2 pair used throughout)
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       movq  mm4, mm1
-       pfmul mm4, mm4          ;  mm4=invsq
-       ;;  calculate potential and scalar force
-       pfmul mm6, mm1          ; mm6=vcoul
-       pfmul mm4, mm6          ;  mm4=fscalar
-
-       pfrsqrt mm5, mm3        ;  seed for the low half of rsqH
-       pswapd mm3,mm3
-       pfrsqrt mm2, mm3        ;  seed for the other half
-       pswapd mm3,mm3          ;  swap back
-       punpckldq mm5,mm2       ;  seeds are in mm5 now, and rsq in mm3.
-
-       movq mm2, mm5
-       pfmul mm5,mm5
-        pfrsqit1 mm5,mm3                               
-        pfrcpit2 mm5,mm2       ;  mm5=invsqrt
-       movq mm3,mm5
-       pfmul mm3,mm3           ; mm3=invsq
-       pfmul mm7, mm5          ;  mm7=vcoul
-       pfmul mm3, mm7          ;  mm3=fscal for the two H's.
-
-       ;;  update vctot
-       pfadd mm7, mm6          ;  add the O term to the H terms
-       pfadd mm7, [esp + .vctot]
-       movq [esp + .vctot], mm7
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm4,mm4
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm4
-       pfmul mm1, mm4
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm3
-       pfmul mm6, mm3
-       pfmul mm7, mm3
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax*4 + 8]
-       pfsub mm2, mm0          ;  the j atom gets minus the summed force
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-
-       ; interactions with j H1 (atom at j3+12 bytes); the .dxO/.dzO and .dxH-.dzH scratch slots are reused.
-       movq  mm0, [esi + eax*4 + 12]
-       movd  mm1, [esi + eax*4 + 20]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       movd mm6, [esp + .qqOH]         ; iO-jH charge product, low half only
-       movq mm7, [esp + .qqHH]         ; iH1/iH2-jH charge product in both halves
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1          ;  low half = dx*dx+dy*dy (high half is not used)
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2      ;  again a repeat of the spreading above; changes nothing
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       movq  mm4, mm1
-       pfmul mm4, mm4          ;  mm4=invsq
-       ;;  calculate potential and scalar force
-       pfmul mm6, mm1          ; mm6=vcoul
-       pfmul mm4, mm6          ;  mm4=fscalar
-
-       pfrsqrt mm5, mm3
-       pswapd mm3,mm3
-       pfrsqrt mm2, mm3
-       pswapd mm3,mm3
-       punpckldq mm5,mm2       ;  seeds are in mm5 now, and rsq in mm3.
-
-       movq mm2, mm5
-       pfmul mm5,mm5
-        pfrsqit1 mm5,mm3                               
-        pfrcpit2 mm5,mm2       ;  mm5=invsqrt
-       movq mm3,mm5
-       pfmul mm3,mm3           ; mm3=invsq
-       pfmul mm7, mm5          ;  mm7=vcoul
-       pfmul mm3, mm7          ;  mm3=fscal for the two H's.
-
-       ;;  update vctot
-       pfadd mm7, mm6
-       pfadd mm7, [esp + .vctot]
-       movq [esp + .vctot], mm7
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm4,mm4
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm4
-       pfmul mm1, mm4
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm3
-       pfmul mm6, mm3
-       pfmul mm7, mm3
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4 + 12]
-       movd mm3,  [edi + eax*4 + 20]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4 + 12], mm2
-       movd [edi + eax*4 + 20], mm3
-
-       ; interactions with j H2 (atom at j3+24 bytes); same sequence as for j H1.
-       movq  mm0, [esi + eax*4 + 24]
-       movd  mm1, [esi + eax*4 + 32]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-
-       movd mm6, [esp + .qqOH]         ; iO-jH charge product, low half only
-       movq mm7, [esp + .qqHH]         ; iH1/iH2-jH charge product in both halves
-
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1          ;  low half = dx*dx+dy*dy (high half is not used)
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2      ;  again a repeat of the spreading above; changes nothing
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       movq  mm4, mm1
-       pfmul mm4, mm4          ;  mm4=invsq
-       ;;  calculate potential and scalar force
-       pfmul mm6, mm1          ; mm6=vcoul
-       pfmul mm4, mm6          ;  mm4=fscalar
-
-       pfrsqrt mm5, mm3
-       pswapd mm3,mm3
-       pfrsqrt mm2, mm3
-       pswapd mm3,mm3
-       punpckldq mm5,mm2       ;  seeds are in mm5 now, and rsq in mm3.
-
-       movq mm2, mm5
-       pfmul mm5,mm5
-        pfrsqit1 mm5,mm3                               
-        pfrcpit2 mm5,mm2       ;  mm5=invsqrt
-       movq mm3,mm5
-       pfmul mm3,mm3           ; mm3=invsq
-       pfmul mm7, mm5          ;  mm7=vcoul
-       pfmul mm3, mm7          ;  mm3=fscal for the two H's.
-
-       ;;  update vctot
-       pfadd mm7, mm6
-       pfadd mm7, [esp + .vctot]
-       movq [esp + .vctot], mm7
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm4,mm4
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm4
-       pfmul mm1, mm4
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm3
-       pfmul mm6, mm3
-       pfmul mm7, mm3
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4 
-
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4 + 24]
-       movd mm3,  [edi + eax*4 + 32]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4 + 24], mm2
-       movd [edi + eax*4 + 32], mm3
-       
-       ;;  done  - one more?
-       dec dword [esp + .innerk]
-       jz  .updateouterdata
-       jmp .inner_loop 
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment iO force 
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]        ; 64-bit read, but only the low half is stored below
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       movq  mm0, [esp + .fixH]
-       movq  mm3, [esp + .fiyH]
-       movq  mm1, [esp + .fizH]
-       movq  mm2, mm0
-       punpckldq mm0, mm3      ;  mm0(l)=fxH1, mm0(h)=fyH1
-       punpckhdq mm2, mm3      ;  mm2(l)=fxH2, mm2(h)=fyH2
-       movq mm3, mm1
-       pswapd mm3,mm3          ;  bring fzH2 into the low half of mm3
-       ;;  mm1 is fzH1
-       ;;  mm3 is fzH2
-
-       movq  mm6, [edi + ecx*4 + 12]       ;  increment iH1 force
-       movd  mm7, [edi + ecx*4 + 20]   
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       movq  [edi + ecx*4 + 12],  mm6
-       movd  [edi + ecx*4 + 20],  mm7
-       
-       movq  mm6, [edi + ecx*4 + 24]       ;  increment iH2 force
-       movd  mm7, [edi + ecx*4 + 32]   
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [edi + ecx*4 + 24],  mm6
-       movd  [edi + ecx*4 + 32],  mm7
-
-       
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fixO]       ; fshift gets the sum of the O, H1 and H2 forces
-       pfadd mm7, [esp + .fizO]
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-       
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-       ;; finish if last
-       dec dword [ebp + %$nri]
-       jz  .end
-       ;;  not last, iterate once more!
-       jmp .outer
-.end:
-       femms                   ;   leave 3DNow!/MMX state before returning
-       add esp, 156            ;   release the local stack space
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-proc inl1100_3dnow
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg     
-       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.ix          equ     8
-.iy          equ    12
-.iz          equ    16
-.iq                 equ    20           ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    28           ; repeated (64bit) to fill 3dnow reg
-.vnbtot      equ    36           ; repeated (64bit) to fill 3dnow reg
-.c6          equ    44           ; repeated (64bit) to fill 3dnow reg
-.c12         equ    52           ; repeated (64bit) to fill 3dnow reg
-.six         equ    60           ; repeated (64bit) to fill 3dnow reg
-.twelve      equ    68           ; repeated (64bit) to fill 3dnow reg
-.ntia       equ    76
-.innerjjnr   equ    80
-.innerk      equ    84         
-.fix         equ    88
-.fiy         equ    92
-.fiz        equ    96
-.dx1        equ    100
-.dy1        equ    104
-.dz1        equ    108
-.dx2        equ    112
-.dy2        equ    116
-.dz2        equ    120                                         
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 124            ;   local stack space
-       femms
-       ; move data to local stack 
-       movq  mm0, [mm_six]
-       movq  mm1, [mm_twelve]
-       movq  [esp + .six],    mm0
-       movq  [esp + .twelve], mm1
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm0, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm3, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx
-       pfadd mm1, mm3
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-                               
-       ;; clear total potential and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot],  mm7
-       movq  [esp + .vnbtot], mm7
-       movq  [esp + .fix],    mm7
-       movd  [esp + .fiz],    mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 2
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .finish_inner
-.unroll_loop:
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-        punpckldq mm3, [ecx + ebx*4]     ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       movq mm1,mm0
-       pfmul mm0,mm0
-       ;; mm0 now contains invsq, and mm1 invsqrt
-       ;; do potential and fscal
-       movq mm4, mm0
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm3, mm1          ;  mm3 has vcoul for both interactions
-       movq  mm7, mm3          ;  use mm7 for sum to make fscal
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       pfsub mm7,mm4
-       pfadd mm7, mm5
-       pfmul mm0, mm7        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;; update vctot
-       pfadd mm3, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm3       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner: 
-       and [esp + .innerk], dword 1
-       jnz  .single_inner
-       jmp  .updateouterdata           
-.single_inner:
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]
-       movd mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]
-       pfmul mm3, mm5          ;  mm3=qq
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm0=rsq
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm1=invsqrt
-       movq  mm1, mm0
-       pfmul mm0, mm0          ;  mm0=invsq
-       ;;  calculate potentials and scalar force
-       movq mm4, mm0
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm3, mm1          ;  mm3 has vcoul for both interactions
-       movq  mm7, mm3          ;  use mm7 for sum to make fscal
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       pfsub mm7,mm4
-       pfadd mm7, mm5
-       pfmul mm0, mm7        ; mm0 is total fscal now
-
-       ;;  update vctot
-       pfadd mm3, [esp + .vctot]
-       movq [esp + .vctot], mm3
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnb[gid]
-
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms
-       add esp, 124
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-       
-
-
-
-proc inl1110_3dnow
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg             
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg                             
-%$nsatoms       arg                    
-       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.shX        equ     8
-.shY         equ    12 
-.shZ        equ    16  
-.ix          equ    20
-.iy          equ    24
-.iz          equ    28 
-.iq          equ    32          ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    40           ; repeated (64bit) to fill 3dnow reg
-.vnbtot      equ    48           ; repeated (64bit) to fill 3dnow reg
-.c6          equ    56           ; repeated (64bit) to fill 3dnow reg
-.c12         equ    64           ; repeated (64bit) to fill 3dnow reg
-.six         equ    72           ; repeated (64bit) to fill 3dnow reg
-.twelve      equ    80           ; repeated (64bit) to fill 3dnow reg
-.ntia       equ    88  
-.innerjjnr0  equ    92
-.innerk0     equ    96         
-.innerjjnr   equ    100
-.innerk      equ    104        
-.fix         equ    108
-.fiy         equ    112
-.fiz        equ    116
-.dx1        equ    120
-.dy1        equ    124
-.dz1        equ    128
-.dx2        equ    132
-.dy2        equ    136
-.dz2        equ    140                                                         
-.nsvdwc      equ    144
-.nscoul      equ    148
-.nsvdw       equ    152
-.solnr      equ    156         
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 160            ;  local stack space
-       femms
-       movq  mm0, [mm_six]
-       movq  mm1, [mm_twelve]
-       movq  [esp + .six],    mm0
-       movq  [esp + .twelve], mm1
-       ;; assume we have at least one i particle - start directly              
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-       movq  [esp + .shX], mm0
-       movd  [esp + .shZ], mm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       add   [ebp + %$nsatoms], dword 12
-       mov   ecx, [eax]        
-       mov   edx, [eax + 4]
-       mov   eax, [eax + 8]    
-       sub   ecx, eax
-       sub   eax, edx
-       
-       mov   [esp + .nsvdwc], edx
-       mov   [esp + .nscoul], eax
-       mov   [esp + .nsvdw], ecx
-               
-       ;; clear potential
-       pxor  mm7,mm7
-       movq  [esp + .vctot],  mm7
-       movq  [esp + .vnbtot], mm7
-       mov   [esp + .solnr],  ebx
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]
-       
-       mov   ecx, [esp + .nsvdwc]
-       cmp   ecx, dword 0
-       jnz   .mno_vdwc
-       jmp   .testcoul
-.mno_vdwc:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx        
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdwc_loop
-       jmp   .finish_vdwc_inner
-.unroll_vdwc_loop:     
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-        punpckldq mm3, [ecx + ebx*4]     ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       movq mm1,mm0
-       pfmul mm0,mm0
-       ;; mm0 now contains invsq, and mm1 invsqrt
-       ;; do potential and fscal
-       movq mm4, mm0
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm3, mm1          ;  mm3 has vcoul for both interactions
-       movq  mm7, mm3          ;  use mm7 for sum to make fscal
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       pfsub mm7,mm4
-       pfadd mm7, mm5
-       pfmul mm0, mm7        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;; update vctot
-       pfadd mm3, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm3       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_vdwc_inner
-       jmp   .unroll_vdwc_loop
-.finish_vdwc_inner:    
-       and [esp + .innerk], dword 1
-       jnz  .single_vdwc_inner
-       jmp  .updateouterdata_vdwc              
-.single_vdwc_inner:    
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]
-       movd mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]
-       pfmul mm3, mm5          ;  mm3=qq
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm0=rsq
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm1=invsqrt
-       movq  mm1, mm0
-       pfmul mm0, mm0          ;  mm0=invsq
-       ;;  calculate potentials and scalar force
-       movq mm4, mm0
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm3, mm1          ;  mm3 has vcoul for both interactions
-       movq  mm7, mm3          ;  use mm7 for sum to make fscal
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       pfsub mm7,mm4
-       pfadd mm7, mm5
-       pfmul mm0, mm7        ; mm0 is total fscal now
-
-       ;;  update vctot
-       pfadd mm3, [esp + .vctot]
-       movq [esp + .vctot], mm3
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_vdwc: 
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdwc]
-       jz  .testcoul
-       jmp .mno_vdwc
-.testcoul
-       mov  ecx, [esp + .nscoul]
-       cmp  ecx, byte 0
-       jnz  .mno_coul
-       jmp  .testvdw
-.mno_coul:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_coul_loop
-       jmp   .finish_coul_inner
-.unroll_coul_loop:     
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-       movd mm7, [ecx + ebx*4]          ; charge[jnr2] 
-       punpckldq mm3,mm7                ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       movq mm1,mm0
-       pfmul mm0,mm0
-       ;; mm0 now contains invsq, and mm1 invsqrt
-       ;; do potential and fscal
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-       
-       pfmul mm3,mm1           ;  6 has both vcoul
-       pfmul mm0,mm3           ;  0 has both fscal 
-
-       ;; update vctot
-
-       pfadd mm3, [esp + .vctot]      ; add the earlier value 
-       movq [esp + .vctot], mm3       ;  store the sum
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_coul_inner
-       jmp   .unroll_coul_loop
-.finish_coul_inner:    
-       and [esp + .innerk], dword 1
-       jnz  .single_coul_inner
-       jmp  .updateouterdata_coul              
-.single_coul_inner:    
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]
-       movd mm6, [esp + .iq]
-       movd mm7, [ecx + eax*4]
-       pfmul mm6, mm7          ;  mm6=qq
-       
-       lea   eax, [eax + eax*2]
-       
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm2, [esi + eax*4]
-       movd  mm3, [esi + eax*4 + 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq  [esp + .dx1], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dz1], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfacc mm0, mm1          ;  mm0=rsq
-       
-        pfrsqrt mm1,mm0
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       movq  mm4, mm1
-       pfmul mm4, mm4          ;  mm4=invsq
-       ;;  calculate potential and scalar force
-       pfmul mm6, mm1          ; mm6=vcoul
-       pfmul mm4, mm6          ;  mm4=fscalar
-       ;;  update vctot
-       movq mm5, [esp + .vctot]
-       pfadd mm5, mm6
-       movq [esp + .vctot], mm5
-       ;;  spread fscalar to both positions
-       punpckldq mm4,mm4
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm0,  [esp + .dx1]
-       movd mm1,  [esp + .dz1]
-       pfmul mm0, mm4
-       pfmul mm1, mm4
-       ;; update i particle force
-       movq mm2,  [esp + .fix]
-       movd mm3,  [esp + .fiz]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fix], mm2
-       movd [esp + .fiz], mm3
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax *4+ 8]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-       ;;  done!
-.updateouterdata_coul: 
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nscoul]
-       jz  .testvdw
-       jmp .mno_coul
-.testvdw
-       mov  ecx, [esp + .nsvdw]
-       cmp  ecx, byte 0
-       jnz  .mno_vdw
-       jmp  .last_mno
-.mno_vdw:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx        
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdw_loop
-       jmp   .finish_vdw_inner
-.unroll_vdw_loop:      
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best      
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       movq mm1,mm0
-       pfmul mm0,mm0
-       ;; mm0 now contains invsq, and mm1 invsqrt
-       ;; do potential and fscal
-       movq mm4, mm0
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       movq  mm7, mm5
-       pfsub mm7,mm4
-       pfmul mm0, mm7        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_vdw_inner
-       jmp   .unroll_vdw_loop
-.finish_vdw_inner:     
-       and [esp + .innerk], dword 1
-       jnz  .single_vdw_inner
-       jmp  .updateouterdata_vdw               
-.single_vdw_inner:     
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm0=rsq
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm1=invsqrt
-       movq  mm1, mm0
-       pfmul mm0, mm0          ;  mm0=invsq
-       ;;  calculate potentials and scalar force
-       movq mm4, mm0
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       movq  mm7, mm5
-       pfsub mm7,mm4
-       pfmul mm0, mm7        ; mm0 is total fscal now
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_vdw:  
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdw]
-       jz  .last_mno
-       jmp .mno_vdw
-       
-.last_mno:     
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms
-       add esp, 160
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-proc inl1120_3dnow             ; 3DNow! loop: Coulomb + Lennard-Jones between a 3-site i molecule (O,H1,H2) and single j atoms
-%$nri          arg             ; outer-loop count (by value); decremented once per i molecule
-%$iinr         arg             ; pointer into list of i-particle indices; advanced 4 bytes per outer iteration
-%$jindex       arg             ; pointer into jindex[]; [n] and [n+1] delimit the j list of this i molecule
-%$jjnr         arg             ; base of the j-particle index list
-%$shift                arg     ; pointer into list of shift indices; advanced 4 bytes per outer iteration
-%$shiftvec     arg             ; base of shift vectors, 3 floats per entry
-%$fshift       arg             ; base of shift forces, 3 floats per entry (accumulated into)
-%$gid          arg             ; pointer into list of energy-group indices; advanced 4 bytes per outer iteration
-%$pos          arg             ; base of coordinates, 3 floats per atom
-%$faction      arg             ; base of forces, 3 floats per atom (accumulated into)
-%$charge       arg             ; base of per-atom charges (floats)
-%$facel                arg     ; float passed by value; scales the i charges
-%$Vc           arg             ; base of Coulomb energies, indexed by gid (accumulated into)
-%$type         arg             ; base of per-atom type indices (dwords)
-%$ntype        arg             ; number of atom types (by value)
-%$nbfp         arg             ; base of (c6,c12) float pairs, indexed by ntia + 2*type[j]
-%$Vnb          arg             ; base of LJ energies, indexed by gid (accumulated into)
-                       ;; stack offsets for local variables (relative to esp after the 196-byte allocation below)
-.is3         equ     0         ; 3*shift index (float offset into shiftvec/fshift)
-.ii3         equ     4         ; 3*ii (float offset into pos/faction)
-.ixO         equ     8         ; shifted O x; stored together with .iyO as one 64-bit pair
-.iyO         equ    12         ; shifted O y
-.izO         equ    16         ; shifted O z; its 64-bit store also touches low .ixH, which is written afterwards
-.ixH         equ    20          ; H1 value in low dword, H2 value in high dword
-.iyH         equ    28          ; H1 value in low dword, H2 value in high dword
-.izH         equ    36          ; H1 value in low dword, H2 value in high dword
-.iqO         equ    44         ; facel*charge[ii] in low dword, high dword is zero
-.iqH         equ    52         ; facel*charge[ii+1], repeated (64bit) to fill 3dnow reg
-.vctot       equ    60          ; two partial Coulomb sums; halves are added with pfacc at the end
-.vnbtot      equ    68          ; LJ energy accumulator; halves are added with pfacc at the end
-.c6          equ    76          ; c6 in low dword (loaded with movd, so high dword is zero)
-.c12         equ    84          ; c12 in low dword (loaded with movd, so high dword is zero)
-.six         equ    92          ; 64-bit copy of mm_six
-.twelve      equ    100         ; 64-bit copy of mm_twelve
-.ntia        equ    108         ; integer 2*ntype*type[ii] (a plain dword, not a 3dnow value)
-.innerjjnr   equ    116        ; pointer to the next entry of jjnr[] to process
-.innerk      equ    120        ; j particles left in the inner loop
-.fixO        equ    124        ; accumulated force on O, x (paired with .fiyO for 64-bit access)
-.fiyO        equ    128        ; accumulated force on O, y
-.fizO        equ    132        ; accumulated force on O, z (always stored with a 32-bit movd)
-.fixH        equ    136         ; accumulated H force: H1 in low dword, H2 in high dword
-.fiyH        equ    144         ; accumulated H force: H1 in low dword, H2 in high dword
-.fizH        equ    152         ; accumulated H force: H1 in low dword, H2 in high dword
-.dxO        equ    160         ; O-j distance vector, x (paired with .dyO for 64-bit access)
-.dyO        equ    164         ; O-j distance vector, y
-.dzO        equ    168         ; O-j distance vector, z
-.dxH        equ    172         ; H-j distance: H1 in low dword, H2 in high dword
-.dyH        equ    180         ; H-j distance: H1 in low dword, H2 in high dword
-.dzH        equ    188         ; H-j distance: H1 in low dword, H2 in high dword
-        push eax               ; all six general registers used below are saved here and restored at .end
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 196            ;   local stack space (last slot .dzH occupies bytes 188..195)
-       femms                   ; enter 3DNow!/MMX state
-       ;; assume we have at least one i particle - start directly (nri == 0 is not checked)
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii (first i molecule only; the pointer is not advanced here)
-
-       mov   edx, [ebp + %$charge]
-       movd  mm1, [ebp + %$facel]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii0]
-       pfmul mm2, mm1          
-       movq  [esp + .iqO], mm2         ;  iqO = facel*charge[ii]
-       
-       movd  mm2, [edx + ebx*4 + 4]    ;  mm2=charge[ii0+1]
-       pfmul mm2, mm1
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iqH], mm2         ;  iqH = facel*charge[ii0+1]
-
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       imul  ecx, [ebp + %$ntype]      ; ecx = ntia = 2*ntype*type[ii0]
-       mov   [esp + .ntia], ecx        ; computed once; NOTE(review): presumably every i molecule shares charges/type - confirm with caller
-       
-       movq  mm3, [mm_six]             ; mm_six / mm_twelve are defined outside this proc
-       movq  mm4, [mm_twelve]
-       movq  [esp + .six],    mm3
-       movq  [esp + .twelve], mm4  
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm5, [eax + ebx*4]        ;  move shX/shY to mm5 and shZ to mm6.
-       movd  mm6, [eax + ebx*4 + 8]
-       movq  mm0, mm5
-       movq  mm1, mm5
-       movq  mm2, mm6
-       punpckldq mm0,mm0               ; also expand shX,Y,Z in mm0--mm2.
-       punpckhdq mm1,mm1
-       punpckldq mm2,mm2               
-       
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm5, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm7, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx         ; (use mm7 as temp. storage for iz.)
-       pfadd mm6, mm7
-       movq  [esp + .ixO], mm5 
-       movq  [esp + .izO], mm6         ; 64-bit store: also overwrites low .ixH, which is rewritten below
-
-       movd  mm3, [eax + ebx*4 + 12]
-       movd  mm4, [eax + ebx*4 + 16]
-       movd  mm5, [eax + ebx*4 + 20]
-       punpckldq  mm3, [eax + ebx*4 + 24]
-       punpckldq  mm4, [eax + ebx*4 + 28]
-       punpckldq  mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-       
-       pfadd mm0, mm3                  ; add the spread shift vector to both H atoms at once
-       pfadd mm1, mm4
-       pfadd mm2, mm5          
-       movq [esp + .ixH], mm0  
-       movq [esp + .iyH], mm1  
-       movq [esp + .izH], mm2  
-                                       
-       ;; clear vctot, vnbtot and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       movq  [esp + .vnbtot], mm7
-       movq  [esp + .fixO],   mm7      ; clears .fixO and .fiyO
-       movd  [esp + .fizO],   mm7
-       movq  [esp + .fixH],   mm7
-       movq  [esp + .fiyH],   mm7
-       movq  [esp + .fizH],   mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms (a zero count is not checked for)
-
-       mov   esi, [ebp + %$pos]         ; esi = base of pos[] for the inner loop
-       mov   edi, [ebp + %$faction]     ; edi = base of faction[] for the inner loop and the update below
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2                     ; byte offset of jjnr[jindex[n]]
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-.inner_loop:   
-       ;; one j particle per iteration (this loop is not unrolled).
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-        add   [esp + .innerjjnr], dword 4 ; advance pointer 
-       ;prefetch [ecx + 16]       ; prefetch data - trial and error says 16 is best
-
-       mov ecx, [ebp + %$charge]
-       movd mm7, [ecx + eax*4]
-       punpckldq mm7,mm7               ; charge[jnr] in both halves
-       movq mm6,mm7
-       pfmul mm6, [esp + .iqO]
-       pfmul mm7, [esp + .iqH] ;  mm6=qqO, mm7=qqH
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr]
-       mov ecx, [ebp + %$nbfp]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [ecx + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [ecx + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5  
-       
-       lea   eax, [eax + eax*2]        ; eax = 3*jnr from here on
-       
-       movq  mm0, [esi + eax*4]
-       movd  mm1, [esi + eax*4 + 8]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]        ; mm0 = (ixO-jx, iyO-jy)
-       pfsubr mm1, [esp + .izO]        ; low half = izO-jz; high half comes from low .ixH and is not used
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1                  ; low half = dx*dx + dy*dy
-       pfadd mm0, mm1          ;  mm0=rsqO (low half)
-       
-       punpckldq mm2, mm2              ; mm2-mm4 were already spread above, so these leave them unchanged
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH (H1 low, H2 high)
-
-        pfrsqrt mm1,mm0                ; approximate 1/sqrt(rsqO), refined by the next four instructions
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       movq  mm4, mm1
-       pfmul mm4, mm4          ;  mm4=invsq
-
-       movq  mm0, mm4
-       pfmul mm0, mm4
-       pfmul mm0, mm4          ;  mm0=rinvsix
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm2=rinvtwelve
-       
-       ;;  calculate potential and scalar force
-       pfmul mm6, mm1          ;  mm6=vcoul
-       movq  mm1, mm6          ;  use mm1 for fscal sum
-
-       ;; LJ for the oxygen
-       pfmul mm0, [esp + .c6]   
-       pfmul mm2, [esp + .c12]  
-
-       ;; calc nb potential
-       movq mm5, mm2
-       pfsub mm5, mm0                  ; mm5 = c12*rinvtwelve - c6*rinvsix
-
-       ;; calc nb force
-       pfmul mm0, [esp + .six]
-       pfmul mm2, [esp + .twelve]
-       
-       ;; increment scalar force
-       pfsub mm1, mm0
-       pfadd mm1, mm2
-       pfmul mm4, mm1          ;  total scalar force on oxygen.
-       
-       ;;  update nb potential
-       pfadd mm5, [esp + .vnbtot]
-       movq [esp + .vnbtot], mm5
-       
-       pfrsqrt mm5, mm3                ; seed from the low half of mm3 (H1)
-       pswapd mm3,mm3                  ; swap halves so the H2 value is in the low half
-       pfrsqrt mm2, mm3                ; seed for H2
-       pswapd mm3,mm3                  ; restore the original H1/H2 order
-       punpckldq mm5,mm2       ;  seeds are in mm5 now, and rsq in mm3.
-
-       movq mm2, mm5
-       pfmul mm5,mm5
-        pfrsqit1 mm5,mm3                               
-        pfrcpit2 mm5,mm2       ;  mm5=invsqrt
-       movq mm3,mm5
-       pfmul mm3,mm3           ; mm3=invsq
-       pfmul mm7, mm5          ;  mm7=vcoul
-       pfmul mm3, mm7          ;  mm3=fscal for the two H's.
-
-       ;;  update vctot
-       pfadd mm7, mm6                  ; add the O term (low half of mm6) to the two H terms
-       pfadd mm7, [esp + .vctot]
-       movq [esp + .vctot], mm7
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm4,mm4
-       ;;  calc vectorial force for O
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm4
-       pfmul mm1, mm4
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm3
-       pfmul mm6, mm3
-       pfmul mm7, mm3
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax*4 + 8]
-       pfsub mm2, mm0                  ; the j particle gets the opposite of the summed i forces
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-       
-       ;;  done  - one more?
-       dec dword [esp + .innerk]
-       jz  .updateouterdata
-       jmp .inner_loop
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment iO force 
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]        ; 64-bit read; only the low half (fizO) is stored below
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       movq  mm0, [esp + .fixH]
-       movq  mm3, [esp + .fiyH]
-       movq  mm1, [esp + .fizH]
-       movq  mm2, mm0
-       punpckldq mm0, mm3      ;  mm0(l)=fxH1, mm0(h)=fyH1
-       punpckhdq mm2, mm3      ;  mm2(l)=fxH2, mm2(h)=fyH2
-       movq mm3, mm1
-       pswapd mm3,mm3          
-       ;;  mm1 is fzH1 (low half)
-       ;;  mm3 is fzH2 (low half)
-       
-       movq  mm6, [edi + ecx*4 + 12]       ;  increment iH1 force
-       movd  mm7, [edi + ecx*4 + 20]   
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       movq  [edi + ecx*4 + 12],  mm6
-       movd  [edi + ecx*4 + 20],  mm7
-       
-       movq  mm6, [edi + ecx*4 + 24]       ;  increment iH2 force
-       movd  mm7, [edi + ecx*4 + 32]   
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [edi + ecx*4 + 24],  mm6
-       movd  [edi + ecx*4 + 32],  mm7
-
-       
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fixO]        ; fshift gets the sum of the O, H1 and H2 forces
-       pfadd mm7, [esp + .fizO]
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-       
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  same for Vnb.
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnb[gid]
-       ;; finish if last
-       dec dword [ebp + %$nri]
-       jz  .end
-       ;;  not last, iterate once more!
-       jmp .outer
-.end:
-       femms                   ; leave 3DNow!/MMX state before returning
-       add esp, 196            ; release the local stack space
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-       
-
-proc inl1130_3dnow
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg             
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                                             
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg                             
-                       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.ixO         equ     8
-.iyO         equ    12
-.izO         equ    16 
-.ixH         equ    20          ; repeated (64bit) to fill 3dnow reg
-.iyH         equ    28          ; repeated (64bit) to fill 3dnow reg
-.izH         equ    36          ; repeated (64bit) to fill 3dnow reg
-.qqOO        equ    44         ; repeated (64bit) to fill 3dnow reg
-.qqOH        equ    52         ; repeated (64bit) to fill 3dnow reg
-.qqHH        equ    60         ; repeated (64bit) to fill 3dnow reg
-.c6          equ    68         ; repeated (64bit) to fill 3dnow reg
-.c12         equ    76         ; repeated (64bit) to fill 3dnow reg
-.six         equ    84         ; repeated (64bit) to fill 3dnow reg
-.twelve             equ    92          ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    100         ; repeated (64bit) to fill 3dnow reg
-.vnbtot      equ    108         ; repeated (64bit) to fill 3dnow reg
-.innerjjnr   equ    116
-.innerk      equ    120        
-.fixO        equ    124
-.fiyO        equ    128
-.fizO        equ    132
-.fixH        equ    136         ; repeated (64bit) to fill 3dnow reg
-.fiyH        equ    144         ; repeated (64bit) to fill 3dnow reg
-.fizH        equ    152         ; repeated (64bit) to fill 3dnow reg
-.dxO        equ    160
-.dyO        equ    164
-.dzO        equ    168
-.dxH        equ    172         ; repeated (64bit) to fill 3dnow reg
-.dyH        equ    180         ; repeated (64bit) to fill 3dnow reg
-.dzH        equ    188         ; repeated (64bit) to fill 3dnow reg
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 196            ;   local stack space
-       femms
-       ;; assume we have at least one i particle - start directly      
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm1, [ebp + %$facel]      ;  mm1=facel
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii0] (O)
-       movd  mm3, [edx + ebx*4 + 4]    ;  mm2=charge[ii0+1] (H)
-       movq  mm4, mm2  
-       pfmul mm4, mm1
-       movq  mm6, mm3
-       pfmul mm6, mm1
-       movq  mm5, mm4
-       pfmul mm4, mm2                  ; mm4=qqOO*facel
-       pfmul mm5, mm3                  ; mm5=qqOH*facel
-       pfmul mm6, mm3                  ; mm6=qqHH*facel
-       punpckldq mm5,mm5               ;  spread to both halves
-       punpckldq mm6,mm6               ;  spread to both halves
-       movq  [esp + .qqOO], mm4
-       movq  [esp + .qqOH], mm5
-       movq  [esp + .qqHH], mm6
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       mov   edx, ecx
-       imul  ecx, [ebp + %$ntype]
-       add   edx, ecx
-       mov   eax, [ebp + %$nbfp]
-       movd  mm0, [eax + edx*4]          
-       movd  mm1, [eax + edx*4 + 4]
-       movq  [esp + .c6], mm0
-       movq  [esp + .c12], mm1
-       movq  mm2, [mm_six]
-       movq  mm3, [mm_twelve]
-       movq  [esp + .six], mm2
-       movq  [esp + .twelve], mm3
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm5, [eax + ebx*4]        ;  move shX/shY to mm5 and shZ to mm6.
-       movd  mm6, [eax + ebx*4 + 8]
-       movq  mm0, mm5
-       movq  mm1, mm5
-       movq  mm2, mm6
-       punpckldq mm0,mm0               ; also expand shX,Y,Z in mm0--mm2.
-       punpckhdq mm1,mm1
-       punpckldq mm2,mm2               
-       
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       pfadd mm5, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm7, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx         ; (use mm7 as temp. storage for iz.)
-       pfadd mm6, mm7
-       movq  [esp + .ixO], mm5 
-       movq  [esp + .izO], mm6
-
-       movd  mm3, [eax + ebx*4 + 12]
-       movd  mm4, [eax + ebx*4 + 16]
-       movd  mm5, [eax + ebx*4 + 20]
-       punpckldq  mm3, [eax + ebx*4 + 24]
-       punpckldq  mm4, [eax + ebx*4 + 28]
-       punpckldq  mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-       
-       pfadd mm0, mm3
-       pfadd mm1, mm4
-       pfadd mm2, mm5          
-       movq [esp + .ixH], mm0  
-       movq [esp + .iyH], mm1  
-       movq [esp + .izH], mm2  
-
-       ;; clear vctot and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       movq  [esp + .vnbtot], mm7
-       movq  [esp + .fixO],  mm7
-       movq  [esp + .fizO],  mm7
-       movq  [esp + .fixH],  mm7
-       movq  [esp + .fiyH],  mm7
-       movq  [esp + .fizH],  mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-.inner_loop:
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-        add   [esp + .innerjjnr], dword 4 ; advance pointer 
-
-       movd  mm6, [esp + .qqOO]
-       movq  mm7, [esp + .qqOH]
-
-       lea   eax, [eax + eax*2]
-       movq  mm0, [esi + eax*4]
-       movd  mm1, [esi + eax*4 + 8]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm0
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt OO
-       movq  mm4, mm1
-       pfmul mm4, mm4          ;  mm4=invsq OO
-
-       movq mm2, mm4
-       pfmul mm2, mm4
-       pfmul mm2, mm4
-       movq mm0, mm2
-       pfmul mm0,mm0
-       pfmul mm2, [esp + .c6]
-       pfmul mm0, [esp + .c12]
-       movq mm5, mm0
-       pfsub mm5, mm2          ; vnb
-
-       pfmul mm2, [esp + .six]
-       pfmul mm0, [esp + .twelve]
-
-       pfsub mm0, mm2
-       
-       ;;  calculate potential and scalar force
-       pfmul mm6, mm1          ; mm6=vcoul
-       pfadd mm0, mm6
-       pfmul mm4, mm0          ;  mm4=fscalar
-
-       ;;  update nb potential
-       pfadd mm5, [esp + .vnbtot]
-       movq [esp + .vnbtot], mm5
-
-       pfrsqrt mm5, mm3
-       pswapd mm3,mm3
-       pfrsqrt mm2, mm3
-       pswapd mm3,mm3
-       punpckldq mm5,mm2       ;  seeds are in mm5 now, and rsq in mm3.
-
-       movq mm2, mm5
-       pfmul mm5,mm5
-        pfrsqit1 mm5,mm3                               
-        pfrcpit2 mm5,mm2       ;  mm5=invsqrt
-       movq mm3,mm5
-       pfmul mm3,mm3           ; mm3=invsq
-       pfmul mm7, mm5          ;  mm7=vcoul
-       pfmul mm3, mm7          ;  mm3=fscal for the two H's.
-
-       ;;  update vctot
-       pfadd mm7, mm6
-       pfadd mm7, [esp + .vctot]
-       movq [esp + .vctot], mm7
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm4,mm4
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm4
-       pfmul mm1, mm4
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm3
-       pfmul mm6, mm3
-       pfmul mm7, mm3
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax*4 + 8]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-
-       ; interactions with j H1.
-       movq  mm0, [esi + eax*4 + 12]
-       movd  mm1, [esi + eax*4 + 20]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       movd mm6, [esp + .qqOH]
-       movq mm7, [esp + .qqHH]
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       movq  mm4, mm1
-       pfmul mm4, mm4          ;  mm4=invsq
-       ;;  calculate potential and scalar force
-       pfmul mm6, mm1          ; mm6=vcoul
-       pfmul mm4, mm6          ;  mm4=fscalar
-
-       pfrsqrt mm5, mm3
-       pswapd mm3,mm3
-       pfrsqrt mm2, mm3
-       pswapd mm3,mm3
-       punpckldq mm5,mm2       ;  seeds are in mm5 now, and rsq in mm3.
-
-       movq mm2, mm5
-       pfmul mm5,mm5
-        pfrsqit1 mm5,mm3                               
-        pfrcpit2 mm5,mm2       ;  mm5=invsqrt
-       movq mm3,mm5
-       pfmul mm3,mm3           ; mm3=invsq
-       pfmul mm7, mm5          ;  mm7=vcoul
-       pfmul mm3, mm7          ;  mm3=fscal for the two H's.
-
-       ;;  update vctot
-       pfadd mm7, mm6
-       pfadd mm7, [esp + .vctot]
-       movq [esp + .vctot], mm7
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm4,mm4
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm4
-       pfmul mm1, mm4
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm3
-       pfmul mm6, mm3
-       pfmul mm7, mm3
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4 + 12]
-       movd mm3,  [edi + eax*4 + 20]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4 + 12], mm2
-       movd [edi + eax*4 + 20], mm3
-
-       ; interactions with j H2
-       movq  mm0, [esi + eax*4 + 24]
-       movd  mm1, [esi + eax*4 + 32]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-
-       movd mm6, [esp + .qqOH]
-       movq mm7, [esp + .qqHH]
-
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       movq  mm4, mm1
-       pfmul mm4, mm4          ;  mm4=invsq
-       ;;  calculate potential and scalar force
-       pfmul mm6, mm1          ; mm6=vcoul
-       pfmul mm4, mm6          ;  mm4=fscalar
-
-       pfrsqrt mm5, mm3
-       pswapd mm3,mm3
-       pfrsqrt mm2, mm3
-       pswapd mm3,mm3
-       punpckldq mm5,mm2       ;  seeds are in mm5 now, and rsq in mm3.
-
-       movq mm2, mm5
-       pfmul mm5,mm5
-        pfrsqit1 mm5,mm3                               
-        pfrcpit2 mm5,mm2       ;  mm5=invsqrt
-       movq mm3,mm5
-       pfmul mm3,mm3           ; mm3=invsq
-       pfmul mm7, mm5          ;  mm7=vcoul
-       pfmul mm3, mm7          ;  mm3=fscal for the two H's.
-
-       ;;  update vctot
-       pfadd mm7, mm6
-       pfadd mm7, [esp + .vctot]
-       movq [esp + .vctot], mm7
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm4,mm4
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm4
-       pfmul mm1, mm4
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm3
-       pfmul mm6, mm3
-       pfmul mm7, mm3
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4 
-
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4 + 24]
-       movd mm3,  [edi + eax*4 + 32]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4 + 24], mm2
-       movd [edi + eax*4 + 32], mm3
-       
-       ;;  done  - one more?
-       dec dword [esp + .innerk]
-       jz  .updateouterdata
-       jmp .inner_loop 
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment iO force 
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       movq  mm0, [esp + .fixH]
-       movq  mm3, [esp + .fiyH]
-       movq  mm1, [esp + .fizH]
-       movq  mm2, mm0
-       punpckldq mm0, mm3      ;  mm0(l)=fxH1, mm0(h)=fyH1
-       punpckhdq mm2, mm3      ;  mm2(l)=fxH2, mm2(h)=fyH2
-       movq mm3, mm1
-       pswapd mm3,mm3          
-       ;;  mm1 is fzH1
-       ;;  mm3 is fzH2
-
-       movq  mm6, [edi + ecx*4 + 12]       ;  increment iH1 force
-       movd  mm7, [edi + ecx*4 + 20]   
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       movq  [edi + ecx*4 + 12],  mm6
-       movd  [edi + ecx*4 + 20],  mm7
-       
-       movq  mm6, [edi + ecx*4 + 24]       ;  increment iH2 force
-       movd  mm7, [edi + ecx*4 + 32]   
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [edi + ecx*4 + 24],  mm6
-       movd  [edi + ecx*4 + 32],  mm7
-
-       
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-       
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnbtot[gid]
-       ;; finish if last
-       dec dword [ebp + %$nri]
-       jz  .end
-       ;;  not last, iterate once more!
-       jmp .outer
-.end:
-       femms
-       add esp, 196
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-proc inl3000_3dnow             ; tabulated-Coulomb pair loop (3DNow!): for each i particle, walk its j list, accumulate pair forces and Vc
-%$nri          arg             ; outer-loop count; decremented once per i particle, loop ends at zero
-%$iinr         arg             ; pointer into the i-particle index list; advanced 4 bytes per outer iteration
-%$jindex       arg             ; pointer into jindex[]; entries n and n+1 bound this i particle's range in jjnr[]
-%$jjnr         arg             ; base of the j-particle index list
-%$shift                arg     ; pointer into the shift-index list; advanced 4 bytes per outer iteration
-%$shiftvec     arg             ; base of the shift vectors (three 4-byte values per entry)
-%$fshift       arg             ; base of the shift-force accumulators (updated in .updateouterdata)
-%$gid          arg             ; pointer into the group-index list; selects the Vc slot, advanced per i particle
-%$pos          arg             ; base of the coordinates (x,y,z per particle, 4 bytes each)
-%$faction      arg             ; base of the forces (same layout as pos)
-%$charge       arg             ; base of the per-particle charges
-%$facel                arg     ; scalar multiplied into charge[ii] to form iq
-%$Vc           arg                     ; base of the Coulomb energy accumulators, indexed by group
-%$tabscale      arg            ; scalar: r*tabscale is the table coordinate
-%$VFtab         arg            ; base of the table; four consecutive values are read per table point
-       ;; stack offsets for local variables
-.is3         equ     0         ; 3*shift index
-.ii3         equ     4         ; 3*i particle index
-.ix          equ     8
-.iy          equ    12
-.iz          equ    16
-.iq                 equ    20           ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    28           ; 64bit: two partial sums, added together in .updateouterdata
-.two         equ    36           ; repeated (64bit) to fill 3dnow reg
-.n1          equ    44           ; 64bit: integer table indices for the two pairs
-.tabscale    equ    52           ; repeated (64bit) to fill 3dnow reg
-.ntia       equ    60           ; not referenced in this routine
-.innerjjnr   equ    64
-.innerk      equ    68         
-.fix         equ    72
-.fiy         equ    76
-.fiz        equ    80
-.dx1        equ    84
-.dy1        equ    88
-.dz1        equ    92
-.dx2        equ    96
-.dy2        equ    100
-.dz2        equ    104                                         
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 108            ;   local stack space (offsets 0..107 defined above)
-       femms                   ; fast MMX/x87 state switch before using 3DNow!
-       ; move data to local stack 
-       movq  mm0, [mm_two]     ; mm_two is defined outside this routine (presumably packed 2.0 - confirm)
-       movd  mm3, [ebp + %$tabscale]
-       movq  [esp + .two],    mm0
-       punpckldq mm3,mm3       ; copy tabscale to both halves
-       movq  [esp + .tabscale], mm3    
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]      ;  64-bit operand; only the low product is kept by the punpckldq below
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm0, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm3, [eax + ebx*4 + 8]    ; can't use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx
-       pfadd mm1, mm3
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-                               
-       ;; clear total potential and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot],  mm7
-       movq  [esp + .fix],    mm7
-       movd  [esp + .fiz],    mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2                      ;  jindex[n]*4 = byte offset into jjnr[]
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 2
-       mov   [esp + .innerk], edx        ;  innerloop atoms minus 2 (counter for the unrolled loop)
-       jge   .unroll_loop                ;  flags from the sub: at least two j particles
-       jmp   .finish_inner
-.unroll_loop:
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-        punpckldq mm3, [ecx + ebx*4]     ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0                  ; rsq*invsqrt = r
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movq [esp + .n1], mm4           ; keep both integer table indices for the address computations below
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2      ; index*4; with the *4 scale below that is 16 bytes per table point
-       ; coulomb table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]        ; table index of the second pair
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3
-       pfmul mm1, [esp + .tabscale]    
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch the stored dr slots (on the stack) to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0       ; mm0 = fscal of pair 1 in both halves
-       punpckhdq mm1,mm1       ; mm1 = fscal of pair 2 in both halves
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_inner             ; fewer than two j particles left
-       jmp   .unroll_loop
-.finish_inner: 
-       and [esp + .innerk], dword 1    ; counter is -2 or -1 here; bit 0 set means one j particle remains
-       jnz  .single_inner
-       jmp  .updateouterdata           
-.single_inner:
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]
-       movd mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]
-       pfmul mm3, mm5          ;  mm3=qq
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  rsq in lower mm4
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm0=invsqrt
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-       
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm0, [esp + .tabscale]    ; invsqrt*tabscale (the unrolled loop scales -fijC instead)
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]        ;  64-bit operand; only the low result is stored by the movd below
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end                    ; ecx==0: every i particle has been processed
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms                         ; fast MMX/x87 state switch before returning
-       add esp, 108                  ; release the local stack space
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-       
-proc inl3010_3dnow
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg             
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg
-%$tabscale      arg                    
-%$VFtab                arg
-%$nsatoms       arg                    
-       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.shX        equ     8
-.shY         equ    12 
-.shZ        equ    16  
-.ix          equ    20
-.iy          equ    24
-.iz          equ    28 
-.iq          equ    32         ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    40          ; repeated (64bit) to fill 3dnow reg
-.two         equ    48         ; repeated (64bit) to fill 3dnow reg
-.n1          equ    56         ; repeated (64bit) to fill 3dnow reg
-.tabscale    equ    64         ; repeated (64bit) to fill 3dnow reg                    
-.innerjjnr0  equ    72
-.innerk0     equ    76         
-.innerjjnr   equ    80
-.innerk      equ    84         
-.fix         equ    88
-.fiy         equ    92
-.fiz        equ    96
-.dx1        equ    100
-.dy1        equ    104
-.dz1        equ    108
-.dx2        equ    112
-.dy2        equ    116
-.dz2        equ    120                                                         
-.nscoul      equ    124
-.solnr      equ    128         
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 132            ;  local stack space
-       femms
-       
-       add   [ebp + %$nsatoms], dword 8
-       movq  mm2, [mm_two]
-       movq  [esp + .two], mm2
-       movd  mm3, [ebp + %$tabscale]
-       punpckldq mm3,mm3
-       movq  [esp + .tabscale], mm3
-       
-       ;; assume we have at least one i particle - start directly              
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-       movq  [esp + .shX], mm0
-       movd  [esp + .shZ], mm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       mov   ecx, [eax]
-       add   [ebp + %$nsatoms], dword 12
-       mov   [esp + .nscoul], ecx
-               
-       ;; clear potential
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       mov   [esp + .solnr], ebx
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]
-       mov   ecx, [esp + .nscoul]
-       cmp   ecx, byte 0
-       jnz  .mno_coul
-       jmp  .last_mno
-.mno_coul:                             
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_coul_loop
-       jmp   .finish_coul_inner
-.unroll_coul_loop:     
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-        punpckldq mm3, [ecx + ebx*4]     ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ; coulomb table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3
-       pfmul mm1, [esp + .tabscale]    
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_coul_inner
-       jmp   .unroll_coul_loop
-.finish_coul_inner:    
-       and [esp + .innerk], dword 1
-       jnz  .single_coul_inner
-       jmp  .updateouterdata_coul              
-.single_coul_inner:    
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]
-       movd mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]
-       pfmul mm3, mm5          ;  mm3=qq
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm0=rsq
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm1=invsqrt
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-       
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm0, [esp + .tabscale]
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_coul: 
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nscoul]
-       jz  .last_mno
-       jmp .mno_coul
-.last_mno:     
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms
-       add esp, 132
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-       
-
-
-proc inl3020_3dnow             ; 3DNow! table-lookup Coulomb loop: three-site (O,H1,H2) i entry against single j atoms
-%$nri          arg             ; outer-loop count; this argument slot is decremented once per i entry
-%$iinr         arg             ; pointer into iinr[] (i atom indices); advanced 4 bytes per outer iteration
-%$jindex       arg             ; pointer into jindex[]; entries n and n+1 bound this i entry's range in jjnr[]
-%$jjnr         arg             ; base of jjnr[] (j atom indices, one dword each)
-%$shift                arg     ; pointer into shift[]; advanced 4 bytes per outer iteration
-%$shiftvec     arg             ; base of shiftvec[], read as 3 floats at index 3*shift
-%$fshift       arg             ; base of fshift[], updated as 3 floats at index 3*shift
-%$gid          arg             ; pointer into gid[]; advanced 4 bytes per outer iteration
-%$pos          arg             ; base of pos[], 3 floats per atom
-%$faction      arg             ; base of faction[], 3 floats per atom (forces are accumulated here)
-%$charge       arg             ; base of charge[], one float per atom
-%$facel                arg     ; float passed by value; multiplied into both i charges
-%$Vc           arg             ; base of Vc[], indexed by the group index read from gid[]
-%$tabscale      arg            ; float passed by value; r*tabscale is the table coordinate
-%$VFtab         arg            ; base of the table; four consecutive floats are read per table point
-                       ;; stack offsets for local variables (byte offsets from esp after "sub esp, 196")
-.is3         equ     0          ; 3 * shift index
-.ii3         equ     4          ; 3 * i atom index
-.ixO         equ     8          ; shifted oxygen x (x and y are written together with one movq)
-.iyO         equ    12          ; shifted oxygen y
-.izO         equ    16          ; shifted oxygen z
-.ixH         equ    20          ; 64-bit slot: H1 in the low half, H2 in the high half
-.iyH         equ    28          ; 64-bit slot: H1 in the low half, H2 in the high half
-.izH         equ    36          ; 64-bit slot: H1 in the low half, H2 in the high half
-.iqO         equ    44         ; 64-bit slot: facel*charge[ii] in the low half, zero in the high half
-.iqH         equ    52         ; 64-bit slot: facel*charge[ii+1] repeated in both halves
-.qqO         equ    60         ; 64-bit slot, but only the low dword is ever written (movd in the inner loop)
-.qqH         equ    68         ; 64-bit slot: iqH*charge[j] in both halves
-.vctot       equ    76          ; 64-bit slot: two partial potential sums, added together per outer iteration
-.two         equ    84          ; 64-bit slot: copy of the external constant mm_two
-.n1          equ    92          ; 64-bit slot: integer table indices (one for the O pass, two for the H pass)
-.tabscale    equ    100          ; 64-bit slot: tabscale repeated in both halves
-.innerjjnr   equ    108         ; pointer to the next jjnr[] entry
-.innerk      equ    112         ; inner iterations remaining
-.fixO        equ    116         ; oxygen force accumulator x (x and y are accessed together with movq)
-.fiyO        equ    120         ; oxygen force accumulator y
-.fizO        equ    124         ; oxygen force accumulator z
-.fixH        equ    128         ; 64-bit slot: H1 in the low half, H2 in the high half
-.fiyH        equ    136         ; 64-bit slot: H1 in the low half, H2 in the high half
-.fizH        equ    144         ; 64-bit slot: H1 in the low half, H2 in the high half
-.dxO        equ    152          ; oxygen-to-j distance vector x (i minus j)
-.dyO        equ    156          ; oxygen-to-j distance vector y
-.dzO        equ    160          ; oxygen-to-j distance vector z
-.dxH        equ    164         ; 64-bit slot: H1 in the low half, H2 in the high half
-.dyH        equ    172         ; 64-bit slot: H1 in the low half, H2 in the high half
-.dzH        equ    180         ; 64-bit slot: H1 in the low half, H2 in the high half
-.tmprsqH     equ    188                ; 64-bit slot: squared distance H1-j (low) and H2-j (high)
-        push eax               ; save every general register used below; restored in reverse order at .end
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 196            ;   local stack space
-       femms                   ; clear the MMX/x87 state before the mm registers are used
-       ;; One-time setup: both i charges are taken from the first iinr[] entry and reused for every outer iteration.
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm1, [ebp + %$facel]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii0]
-       pfmul mm2, mm1          
-       movq  [esp + .iqO], mm2         ;  iqO = facel*charge[ii]
-       
-       movd  mm2, [edx + ebx*4 + 4]    ;  mm2=charge[ii0+1]
-       pfmul mm2, mm1
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iqH], mm2         ;  iqH = facel*charge[ii0+1]
-
-       movq  mm3, [mm_two]             ;  mm_two is defined outside this procedure
-       movd  mm4, [ebp + %$tabscale]
-       punpckldq mm4,mm4               ;  spread to both halves
-       movq  [esp + .two],    mm3
-       movq  [esp + .tabscale], mm4          
-       ;; assume we have at least one i particle - start directly       
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm5, [eax + ebx*4]        ;  move shX/shY to mm5 and shZ to mm6.
-       movd  mm6, [eax + ebx*4 + 8]
-       movq  mm0, mm5
-       movq  mm1, mm5
-       movq  mm2, mm6
-       punpckldq mm0,mm0               ; also expand shX,Y,Z in mm0--mm2.
-       punpckhdq mm1,mm1
-       punpckldq mm2,mm2               
-       
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm5, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm7, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx         ; (use mm7 as temp. storage for iz.)
-       pfadd mm6, mm7
-       movq  [esp + .ixO], mm5 
-       movq  [esp + .izO], mm6         ; 8-byte store: the upper dword lands in .ixH, which is rewritten below
-       ;; the two atoms following ii in pos[] are loaded as H1 and H2
-       movd  mm3, [eax + ebx*4 + 12]
-       movd  mm4, [eax + ebx*4 + 16]
-       movd  mm5, [eax + ebx*4 + 20]
-       punpckldq  mm3, [eax + ebx*4 + 24]
-       punpckldq  mm4, [eax + ebx*4 + 28]
-       punpckldq  mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-       
-       pfadd mm0, mm3
-       pfadd mm1, mm4
-       pfadd mm2, mm5          
-       movq [esp + .ixH], mm0  
-       movq [esp + .iyH], mm1  
-       movq [esp + .izH], mm2  
-                                       
-       ;; clear vctot and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       movq  [esp + .fixO],   mm7
-       movd  [esp + .fizO],   mm7
-       movq  [esp + .fixH],   mm7
-       movq  [esp + .fiyH],   mm7
-       movq  [esp + .fizH],   mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   [esp + .innerk], edx        
-       ;; esi and edi keep the pos[] and faction[] bases for the whole inner loop
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2                      ;  jindex[n] * 4 = byte offset into jjnr[]
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-.inner_loop:   
-       ;; a single j particle iteration.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-        add   [esp + .innerjjnr], dword 4 ; advance pointer 
-       ;prefetch [ecx + 16]       ; prefetch data - trial and error says 16 is best
-
-       mov ecx, [ebp + %$charge]
-       movd mm7, [ecx + eax*4]
-       punpckldq mm7,mm7
-       movq mm6,mm7
-       pfmul mm6, [esp + .iqO]
-       pfmul mm7, [esp + .iqH] ;  mm6=qqO, mm7=qqH
-       movd [esp + .qqO], mm6  ;  writes the low dword of .qqO only
-       movq [esp + .qqH], mm7
-               
-       lea   eax, [eax + eax*2]        ;  eax = 3*jnr
-       
-       movq  mm0, [esi + eax*4]
-       movd  mm1, [esi + eax*4 + 8]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]        ;  mm0 = (ix-jx, iy-jy)
-       pfsubr mm1, [esp + .izO]        ;  low half = iz-jz; the 8-byte read also covers .ixH, only the low half is meaningful
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2      ;  mm2-mm4 were already spread above, so these three leave them unchanged
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-       ;; oxygen pass: reciprocal square root of rsqO (pfrsqrt estimate refined by pfrsqit1/pfrcpit2)
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-
-       pfmul mm0, mm1          ;  mm0=r
-       ;; split r*tabscale into an integer table index (stored in .n1) and the fraction eps
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2              ;  four floats per table point
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; Fp = (second table value) + Geps + Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-       ;; NOTE(review): both pfmul below read 8 bytes at .qqO although only its low dword is written; the upper lane of mm5/mm7 is zero here (movd loads) - confirm this is harmless
-       pfmul mm5, [esp + .qqO] ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqO] ; fijC=qq*FF
-       ;; update vctot directly, use mm3 for fscal sum.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       movq mm3, mm7   
-
-       ; change sign of fscal and multiply with rinv
-        pxor mm0,mm0
-       pfsubr mm3, mm0         ; mm3 = 0 - mm3
-       pfmul mm3, [esp + .tabscale]
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now         
-       
-       ;; Ready with the oxygen - potential is updated, fscal is in mm3.
-       ;; now do the two hydrogens.
-       movq mm0, [esp + .tmprsqH] ; mm0=rsqH
-       ;; pfrsqrt only uses the low half of its source, so the halves are swapped to seed H2 separately
-       pfrsqrt mm1, mm0
-       pswapd mm0,mm0
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4   ; both table indices: H1 at .n1, H2 at .n1+4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]        ; second index: its table values go into the high halves
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqH] ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqH] ; fijC=qq*FF
-
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7  
-       pfmul mm4, [esp + .tabscale]
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now      
-
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax*4 + 8]
-       pfsub mm2, mm0          ;  the j atom receives the opposite of the summed force
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 + 8], mm3
-       
-       ;;  done  - one more?
-       dec dword [esp + .innerk]
-       jz  .updateouterdata
-       jmp .inner_loop
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment iO force 
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]        ;  only the low half is stored back (movd below)
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-       ;; regroup the H accumulators from per-component (H1,H2) pairs into per-atom (x,y) and z
-       movq  mm0, [esp + .fixH]
-       movq  mm3, [esp + .fiyH]
-       movq  mm1, [esp + .fizH]
-       movq  mm2, mm0
-       punpckldq mm0, mm3      ;  mm0(l)=fxH1, mm0(h)=fyH1
-       punpckhdq mm2, mm3      ;  mm2(l)=fxH2, mm2(h)=fyH2
-       movq mm3, mm1
-       pswapd mm3, mm3 
-       ;; mm1 is fzH1
-       ;; mm3 is fzH2
-       
-       movq  mm6, [edi + ecx*4 + 12]       ;  increment iH1 force
-       movd  mm7, [edi + ecx*4 + 20]   
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       movq  [edi + ecx*4 + 12],  mm6
-       movd  [edi + ecx*4 + 20],  mm7
-       
-       movq  mm6, [edi + ecx*4 + 24]       ;  increment iH2 force
-       movd  mm7, [edi + ecx*4 + 32]   
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [edi + ecx*4 + 24],  mm6
-       movd  [edi + ecx*4 + 32],  mm7
-
-       ;; fshift[is3] receives the sum of the O, H1 and H2 forces
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-       
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       ;; finish if last
-       dec dword [ebp + %$nri]
-       jz  .end
-       ;;  not last, iterate once more!
-       jmp .outer
-.end:
-       femms                   ; leave the MMX state clean for the caller
-       add esp, 196            ; release the local stack space
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-       
-
-proc inl3030_3dnow
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$tabscale      arg
-%$VFtab         arg
-                       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.ixO         equ     8
-.iyO         equ    12
-.izO         equ    16 
-.ixH         equ    20          ; repeated (64bit) to fill 3dnow reg
-.iyH         equ    28          ; repeated (64bit) to fill 3dnow reg
-.izH         equ    36          ; repeated (64bit) to fill 3dnow reg
-.qqOO        equ    44         ; repeated (64bit) to fill 3dnow reg
-.qqOH        equ    52         ; repeated (64bit) to fill 3dnow reg
-.qqHH        equ    60         ; repeated (64bit) to fill 3dnow reg
-.two         equ    68         ; repeated (64bit) to fill 3dnow reg
-.n1         equ    76          ; repeated (64bit) to fill 3dnow reg
-.tabscale    equ    84         ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    92          ; repeated (64bit) to fill 3dnow reg
-.innerjjnr   equ    100
-.innerk      equ    104        
-.fixO        equ    108
-.fiyO        equ    112
-.fizO        equ    116
-.fixH        equ    120         ; repeated (64bit) to fill 3dnow reg
-.fiyH        equ    128         ; repeated (64bit) to fill 3dnow reg
-.fizH        equ    136         ; repeated (64bit) to fill 3dnow reg
-.dxO        equ    144
-.dyO        equ    148
-.dzO        equ    152
-.dxH        equ    156         ; repeated (64bit) to fill 3dnow reg
-.dyH        equ    164         ; repeated (64bit) to fill 3dnow reg
-.dzH        equ    172         ; repeated (64bit) to fill 3dnow reg
-.tmprsqH     equ    180         ; repeated (64bit) to fill 3dnow reg
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 188            ;   local stack space
-       femms
-       ;; assume we have at least one i particle - start directly      
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm1, [ebp + %$facel]      ;  mm1=facel
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii0] (O)
-       movd  mm3, [edx + ebx*4 + 4]    ;  mm2=charge[ii0+1] (H)
-       movq  mm4, mm2  
-       pfmul mm4, mm1
-       movq  mm6, mm3
-       pfmul mm6, mm1
-       movq  mm5, mm4
-       pfmul mm4, mm2                  ; mm4=qqOO*facel
-       pfmul mm5, mm3                  ; mm5=qqOH*facel
-       pfmul mm6, mm3                  ; mm6=qqHH*facel
-       punpckldq mm5,mm5               ;  spread to both halves
-       punpckldq mm6,mm6               ;  spread to both halves
-       movq  [esp + .qqOO], mm4
-       movq  [esp + .qqOH], mm5
-       movq  [esp + .qqHH], mm6
-       movq  mm2, [mm_two]
-       movq  [esp + .two], mm2
-       movd  mm3, [ebp + %$tabscale]
-       punpckldq mm3,mm3
-       movq  [esp + .tabscale], mm3
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm5, [eax + ebx*4]        ;  move shX/shY to mm5 and shZ to mm6.
-       movd  mm6, [eax + ebx*4 + 8]
-       movq  mm0, mm5
-       movq  mm1, mm5
-       movq  mm2, mm6
-       punpckldq mm0,mm0               ; also expand shX,Y,Z in mm0--mm2.
-       punpckhdq mm1,mm1
-       punpckldq mm2,mm2               
-       
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       pfadd mm5, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm7, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx         ; (use mm7 as temp. storage for iz.)
-       pfadd mm6, mm7
-       movq  [esp + .ixO], mm5 
-       movq  [esp + .izO], mm6
-
-       movd  mm3, [eax + ebx*4 + 12]
-       movd  mm4, [eax + ebx*4 + 16]
-       movd  mm5, [eax + ebx*4 + 20]
-       punpckldq  mm3, [eax + ebx*4 + 24]
-       punpckldq  mm4, [eax + ebx*4 + 28]
-       punpckldq  mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-       
-       pfadd mm0, mm3
-       pfadd mm1, mm4
-       pfadd mm2, mm5          
-       movq [esp + .ixH], mm0  
-       movq [esp + .iyH], mm1  
-       movq [esp + .izH], mm2  
-
-       ;; clear vctot and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       movq  [esp + .fixO],  mm7
-       movq  [esp + .fizO],  mm7
-       movq  [esp + .fixH],  mm7
-       movq  [esp + .fiyH],  mm7
-       movq  [esp + .fizH],  mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-.inner_loop:
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-        add   [esp + .innerjjnr], dword 4 ; advance pointer 
-
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esi + eax*4]
-       movd  mm1, [esi + eax*4 + 8]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm0
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt OO
-       pfmul mm0, mm1          ;  mm0=rsq OO
-
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOO]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOO]        ; fijC=qq*FF
-
-       ;; update vctot directly, use mm3 for fscal sum.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       movq mm3, mm7
-
-       ; change sign of fscal and multiply with rinv
-        pxor mm0,mm0
-       pfsubr mm3, mm0 
-       pfmul mm3, [esp + .tabscale]
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now 
-       
-       ;; Ready with the oxygen - potential is updated, fscal is in mm3.
-       ;; time for hydrogens!
-
-       movq mm0, [esp + .tmprsqH]
-
-       pfrsqrt mm1, mm0
-       pswapd mm0,mm0
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOH]        ; fijC=qq*FF
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7  
-       pfmul mm4, [esp + .tabscale]
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now      
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax*4 + 8]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-
-       ; interactions with j H1.
-
-       movq  mm0, [esi + eax*4 + 12]
-       movd  mm1, [esi + eax*4 + 20]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       pfmul mm0, mm1          ;  mm0=rsq 
-       
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOH]        ; fijC=qq*FF
-
-       ;; update vctot directly, force is moved to mm3.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       pxor mm3, mm3
-       pfsub mm3, mm7
-       pfmul mm3, [esp + .tabscale]
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now 
-
-       movq mm0, [esp + .tmprsqH]
-
-       pfrsqrt mm1, mm0
-       pswapd mm0,mm0
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqHH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqHH]        ; fijC=qq*FF
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7  
-       pfmul mm4, [esp + .tabscale]
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now              
-
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4 + 12]
-       movd mm3,  [edi + eax*4 + 20]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4 + 12], mm2
-       movd [edi + eax*4 + 20], mm3
-
-       ; interactions with j H2
-       movq  mm0, [esi + eax*4 + 24]
-       movd  mm1, [esi + eax*4 + 32]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       pfmul mm0, mm1
-
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOH]        ; fijC=qq*FF
-
-       ;; update vctot directly, use mm3 for fscal sum.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       pxor mm3,mm3
-       pfsub mm3, mm7
-       pfmul mm3, [esp + .tabscale]
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now 
-
-       movq mm0, [esp + .tmprsqH]
-
-       pfrsqrt mm1, mm0
-       pswapd mm0,mm0
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqHH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqHH]        ; fijC=qq*FF
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7  
-       pfmul mm4, [esp + .tabscale]
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now      
-
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4 
-
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4 + 24]
-       movd mm3,  [edi + eax*4 + 32]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4 + 24], mm2
-       movd [edi + eax*4 + 32], mm3
-       
-       ;;  done  - one more?
-       dec dword [esp + .innerk]
-       jz  .updateouterdata
-       jmp .inner_loop 
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment iO force 
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       movq  mm0, [esp + .fixH]
-       movq  mm3, [esp + .fiyH]
-       movq  mm1, [esp + .fizH]
-       movq  mm2, mm0
-       punpckldq mm0, mm3      ;  mm0(l)=fxH1, mm0(h)=fyH1
-       punpckhdq mm2, mm3      ;  mm2(l)=fxH2, mm2(h)=fyH2
-       movq mm3, mm1
-       pswapd mm3,mm3          
-       ;;  mm1 is fzH1
-       ;;  mm3 is fzH2
-
-       movq  mm6, [edi + ecx*4 + 12]       ;  increment iH1 force
-       movd  mm7, [edi + ecx*4 + 20]   
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       movq  [edi + ecx*4 + 12],  mm6
-       movd  [edi + ecx*4 + 20],  mm7
-       
-       movq  mm6, [edi + ecx*4 + 24]       ;  increment iH2 force
-       movd  mm7, [edi + ecx*4 + 32]   
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [edi + ecx*4 + 24],  mm6
-       movd  [edi + ecx*4 + 32],  mm7
-
-       
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-       
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       ;; finish if last
-       dec dword [ebp + %$nri]
-       jz  .end
-       ;;  not last, iterate once more!
-       jmp .outer
-.end:
-       femms
-       add esp, 188
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-proc inl3100_3dnow     ; tabulated Coulomb + 12-6 (c6/c12) force and energy loops, 3DNow! version
-%$nri          arg     ; outer-loop count: decremented per i particle, done at zero
-%$iinr         arg     ; pointer into iinr[]; advanced 4 bytes per outer pass
-%$jindex       arg     ; pointer into jindex[]; entries [n],[n+1] bound the j list
-%$jjnr         arg     ; base of jjnr[] (j particle numbers)
-%$shift                arg     ; pointer into shift[]; advanced 4 bytes per outer pass
-%$shiftvec     arg     ; base of shiftvec[], three floats per shift index
-%$fshift       arg     ; base of fshift[]; the summed i force is added here
-%$gid          arg     ; pointer into gid[]; picks the Vc/Vnb slot to update
-%$pos          arg             ; base of pos[], three floats per particle
-%$faction      arg     ; base of faction[] (forces, updated in place)
-%$charge       arg     ; base of charge[]
-%$facel                arg     ; float passed by value; multiplies charge[ii]
-%$Vc           arg                     ; base of Vc[]; vctot is added to Vc[gid]
-%$type         arg     ; base of type[] (one dword per particle)
-%$ntype        arg     ; dword passed by value; multiplied with type[ii]
-%$nbfp         arg     ; base of nbfp[] (c6,c12 float pairs)
-%$Vnb          arg     ; base of Vnb[]; vnbtot is added to Vnb[gid]
-%$tabscale      arg    ; float passed by value; scales r into table units
-%$VFtab         arg    ; base of the table; four floats are read per point
-       ;; stack offsets for local variables (all relative to esp after the sub below)
-.is3         equ     0          ; 3*shift index
-.ii3         equ     4          ; 3*i particle index
-.ix          equ     8          ; shifted i coordinates (ix,iy are stored as one qword)
-.iy          equ    12
-.iz          equ    16
-.iq                 equ    20           ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    28           ; repeated (64bit) to fill 3dnow reg
-.vnbtot      equ    36           ; repeated (64bit) to fill 3dnow reg
-.c6          equ    44           ; repeated (64bit) to fill 3dnow reg
-.c12         equ    52           ; repeated (64bit) to fill 3dnow reg
-.six         equ    60           ; repeated (64bit) to fill 3dnow reg
-.twelve      equ    68           ; repeated (64bit) to fill 3dnow reg
-.two         equ    76           ; repeated (64bit) to fill 3dnow reg
-.n1          equ    84           ; repeated (64bit) to fill 3dnow reg
-.tabscale    equ    92           ; repeated (64bit) to fill 3dnow reg
-.ntia       equ    100          ; 2*ntype*type[ii]
-.innerjjnr   equ    104         ; pointer to the next jjnr[] entry
-.innerk      equ    108         ; j particles still to do, minus 2 (pair-loop counter)
-.fix         equ    112         ; i force accumulator (fix,fiy are stored as one qword)
-.fiy         equ    116
-.fiz        equ    120
-.dx1        equ    124          ; dr to the 1st j particle
-.dy1        equ    128
-.dz1        equ    132
-.dx2        equ    136          ; dr to the 2nd j particle of a pair
-.dy2        equ    140
-.dz2        equ    144          ; last local: the frame is 148 bytes
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 148            ;   local stack space
-       femms                   ;   reset MMX/x87 state before the 3DNow! code
-       ; copy constants and tabscale to the local stack 
-       movq  mm0, [mm_two]     ; mm_two/mm_six/mm_twelve are defined outside this procedure
-       movq  mm1, [mm_six]
-       movq  mm2, [mm_twelve]
-       movd  mm3, [ebp + %$tabscale]
-       movq  [esp + .two],    mm0
-       movq  [esp + .six],    mm1
-       movq  [esp + .twelve],    mm2
-       punpckldq mm3,mm3       ; duplicate tabscale into both halves
-       movq  [esp + .tabscale], mm3    
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]      ;  low half = facel*charge[ii]; high half is overwritten next
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        ;  edx = type[ii]
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1                    ;  ntia = 2*ntype*type[ii]
-       mov   [esp + .ntia], edx
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm0, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm3, [eax + ebx*4 + 8]    ; can't use a direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx
-       pfadd mm1, mm3
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-                               
-       ;; clear total potential and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot],  mm7
-       movq  [esp + .vnbtot], mm7
-       movq  [esp + .fix],    mm7
-       movd  [esp + .fiz],    mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 2
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms, minus 2
-       jge   .unroll_loop                ;  tests the flags of the sub above (mov leaves flags alone)
-       jmp   .finish_inner
-.unroll_loop:
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-        punpckldq mm3, [ecx + ebx*4]     ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]         ;  esi was reused for nbfp above; restore pos base
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0                  ; rsq*invsqrt = r
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1                   ; integer part of rt for both pairs
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2              ; 4*n0; with the *4 scaling below that is 16 bytes per table point
-       ; coulomb table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]        ; table index of the second pair
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       movq mm1, mm0
-       pfmul mm1,mm1   ; mm1=invsq
-       movq mm2, mm1
-       pfmul mm2,mm1
-       pfmul mm2,mm1   ; mm2=rinvsix
-       movq  mm1,mm2
-       pfmul mm1,mm1   ; mm1=rinvtwelve
-       
-       pfmul mm3, [esp + .tabscale]    ; mm3 = fijC*tabscale
-       
-       pfmul mm1, [esp + .c12]         ; mm1 = vnb12
-
-       pfmul mm2, [esp + .c6]          ; mm2 = vnb6
-
-       movq mm4, mm1
-       pfsub mm4, mm2  ; mm4 = vnb12-vnb6
-
-       pfmul mm2, [esp + .six]
-       pfmul mm1, [esp + .twelve]
-
-       pfsub mm1, mm2
-       pfmul mm1, mm0  ; mm1 = (12*vnb12-6*vnb6)*rinv
-
-       pfsub mm1, mm3  ; subtract the tabulated Coulomb part (fijC*tabscale)
-
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch the stored dr slot on the stack
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0       ; mm0 = fscal of the 1st pair in both halves
-       punpckhdq mm1,mm1       ; mm1 = fscal of the 2nd pair in both halves
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm4, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm4       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces (both pair contributions are added)
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces (the same vectors are subtracted from each j particle)
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner: 
-       and [esp + .innerk], dword 1    ; low bit set => one j particle is left over
-       jnz  .single_inner
-       jmp  .updateouterdata           
-.single_inner:
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]
-       movd mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]
-       pfmul mm3, mm5          ;  mm3=qq
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5           ; movd cleared the high half, so it is stored as zero
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]        ;  replace jnr with j3
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm4=rsq
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm0=invsqrt
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-       
-       ; at this point mm5 contains vcoul and mm3 fijC.
-
-       movq mm1, mm0
-       pfmul mm1,mm1   ; mm1=invsq
-       movq mm2, mm1
-       pfmul mm2,mm1
-       pfmul mm2,mm1   ; mm2=rinvsix
-       movq  mm1,mm2
-       pfmul mm1,mm1   ; mm1=rinvtwelve
-       
-       pfmul mm3, [esp + .tabscale]
-       
-       pfmul mm1, [esp + .c12]
-
-       pfmul mm2, [esp + .c6]
-
-       movq mm4, mm1
-       pfsub mm4, mm2  ; mm4 = vnb12-vnb6
-
-       pfmul mm2, [esp + .six]
-       pfmul mm1, [esp + .twelve]
-
-       pfsub mm1, mm2
-       pfmul mm1, mm0  ; mm1 = (12*vnb12-6*vnb6)*rinv
-
-       pfsub mm1, mm3
-
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm4, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm4       ;  store the sum       
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done with the inner loops for this i particle
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]        ;  only the low dword of mm7 is stored below
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix] 
-       pfadd mm7, [esp + .fiz] 
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
- 
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vnb] 
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnb[gid]
-
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx        ;  write the decremented count back
-       jmp .outer
-.end:
-       femms                         ;  leave MMX/3DNow! state before returning
-       add esp, 148                  ;  release the local frame
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-
-
-
-proc inl3110_3dnow
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-%$tabscale      arg
-%$VFtab         arg
-%$nsatoms       arg                    
-       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.shX        equ     8
-.shY         equ    12 
-.shZ        equ    16  
-.ix          equ    20
-.iy          equ    24
-.iz          equ    28 
-.iq          equ    32          ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    40           ; repeated (64bit) to fill 3dnow reg
-.vnbtot      equ    48           ; repeated (64bit) to fill 3dnow reg
-.c6          equ    56           ; repeated (64bit) to fill 3dnow reg
-.c12         equ    64           ; repeated (64bit) to fill 3dnow reg
-.six         equ    72           ; repeated (64bit) to fill 3dnow reg
-.twelve      equ    80           ; repeated (64bit) to fill 3dnow reg
-.two         equ    88           ; repeated (64bit) to fill 3dnow reg
-.n1          equ    96           ; repeated (64bit) to fill 3dnow reg
-.tabscale    equ    104           ; repeated (64bit) to fill 3dnow reg
-.ntia       equ    112
-.innerjjnr0  equ    116
-.innerk0     equ    120        
-.innerjjnr   equ    124
-.innerk      equ    128        
-.fix         equ    132
-.fiy         equ    136
-.fiz        equ    140
-.dx1        equ    144
-.dy1        equ    148
-.dz1        equ    152
-.dx2        equ    156
-.dy2        equ    160
-.dz2        equ    164                                                         
-.nsvdwc      equ    168
-.nscoul      equ    172
-.nsvdw       equ    176
-.solnr      equ    180         
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 184            ;  local stack space
-       femms
-       movq  mm0, [mm_six]
-       movq  mm1, [mm_twelve]
-       movq  [esp + .six],    mm0
-       movq  [esp + .twelve], mm1
-       movq  mm2, [mm_two]
-       movd  mm3, [ebp + %$tabscale]
-       movq  [esp + .two],    mm2
-       punpckldq mm3,mm3
-       movq  [esp + .tabscale], mm3    
-       ;; assume we have at least one i particle - start directly              
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-       movq  [esp + .shX], mm0
-       movd  [esp + .shZ], mm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       add   [ebp + %$nsatoms], dword 12
-       mov   ecx, [eax]        
-       mov   edx, [eax + 4]
-       mov   eax, [eax + 8]    
-       sub   ecx, eax
-       sub   eax, edx
-       
-       mov   [esp + .nsvdwc], edx
-       mov   [esp + .nscoul], eax
-       mov   [esp + .nsvdw], ecx
-               
-       ;; clear potential
-       pxor  mm7,mm7
-       movq  [esp + .vctot],  mm7
-       movq  [esp + .vnbtot], mm7
-       mov   [esp + .solnr],  ebx
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]
-       
-       mov   ecx, [esp + .nsvdwc]
-       cmp   ecx, dword 0
-       jnz   .mno_vdwc
-       jmp   .testcoul
-.mno_vdwc:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx        
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdwc_loop
-       jmp   .finish_vdwc_inner
-.unroll_vdwc_loop:     
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-        punpckldq mm3, [ecx + ebx*4]     ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ; coulomb table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       movq mm1, mm0
-       pfmul mm1,mm1   ; mm1=invsq
-       movq mm2, mm1
-       pfmul mm2,mm1
-       pfmul mm2,mm1   ; mm2=rinvsix
-       movq  mm1,mm2
-       pfmul mm1,mm1   ; mm1=rinvtwelve
-       
-       pfmul mm3, [esp + .tabscale]
-       
-       pfmul mm1, [esp + .c12]
-
-       pfmul mm2, [esp + .c6]
-
-       movq mm4, mm1
-       pfsub mm4, mm2  ; mm4 = vnb12-vnb6
-
-       pfmul mm2, [esp + .six]
-       pfmul mm1, [esp + .twelve]
-
-       pfsub mm1, mm2
-       pfmul mm1, mm0  ; mm1=  (12*vnb12-6*vnb6)*rinv11
-
-       pfsub mm1, mm3
-
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm4, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm4       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7     
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_vdwc_inner
-       jmp   .unroll_vdwc_loop
-.finish_vdwc_inner:    
-       and [esp + .innerk], dword 1
-       jnz  .single_vdwc_inner
-       jmp  .updateouterdata_vdwc              
-.single_vdwc_inner:    
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]
-       movd mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]
-       pfmul mm3, mm5          ;  mm3=qq
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm4=rsq
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm1=invsqrt
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       movq mm1, mm0
-       pfmul mm1,mm1   ; mm1=invsq
-       movq mm2, mm1
-       pfmul mm2,mm1
-       pfmul mm2,mm1   ; mm2=rinvsix
-       movq  mm1,mm2
-       pfmul mm1,mm1   ; mm1=rinvtwelve
-       
-       pfmul mm3, [esp + .tabscale]
-       
-       pfmul mm1, [esp + .c12]
-
-       pfmul mm2, [esp + .c6]
-
-       movq mm4, mm1
-       pfsub mm4, mm2  ; mm4 = vnb12-vnb6
-
-       pfmul mm2, [esp + .six]
-       pfmul mm1, [esp + .twelve]
-
-       pfsub mm1, mm2
-       pfmul mm1, mm0  ; mm1=  (12*vnb12-6*vnb6)*rinv11
-
-       pfsub mm1, mm3
-
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm4, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm4       ;  store the sum       
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_vdwc: 
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdwc]
-       jz  .testcoul
-       jmp .mno_vdwc
-.testcoul
-       mov  ecx, [esp + .nscoul]
-       cmp  ecx, byte 0
-       jnz  .mno_coul
-       jmp  .testvdw
-.mno_coul:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_coul_loop
-       jmp   .finish_coul_inner
-.unroll_coul_loop:     
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-        punpckldq mm3, [ecx + ebx*4]     ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ; coulomb table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3
-       pfmul mm1, [esp + .tabscale]    
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_coul_inner
-       jmp   .unroll_coul_loop
-.finish_coul_inner:    
-       and [esp + .innerk], dword 1
-       jnz  .single_coul_inner
-       jmp  .updateouterdata_coul              
-.single_coul_inner:    
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]
-       movd mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]
-       pfmul mm3, mm5          ;  mm3=qq
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm0=rsq
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm1=invsqrt
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-       
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm0, [esp + .tabscale]
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_coul: 
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nscoul]
-       jz  .testvdw
-       jmp .mno_coul
-.testvdw
-       mov  ecx, [esp + .nsvdw]
-       cmp  ecx, byte 0
-       jnz  .mno_vdw
-       jmp  .last_mno
-.mno_vdw:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx        
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdw_loop
-       jmp   .finish_vdw_inner
-.unroll_vdw_loop:      
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best      
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrcp mm0, mm4                  ; lookup reciprocal seed 
-        pfrcp mm1, mm6
- 
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-                                       ; amd 3dnow N-R iteration to get full precision.
-        pfrcpit1 mm4,mm0                               
-        pfrcpit2 mm4,mm0       
-       ;; mm4 now contains invsq,
-       ;; do potential and fscal
-       movq  mm0, mm4
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       pfsub mm5,mm4
-       pfmul mm0, mm5        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_vdw_inner
-       jmp   .unroll_vdw_loop
-.finish_vdw_inner:     
-       and [esp + .innerk], dword 1
-       jnz  .single_vdw_inner
-       jmp  .updateouterdata_vdw               
-.single_vdw_inner:     
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm4=rsq
-       
-        pfrcp mm0,mm4
-        pfrcpit1 mm4,mm0                               
-        pfrcpit2 mm4,mm0       ;  mm4=invsq 
-       ;;  calculate potentials and scalar force
-       movq  mm0, mm4
-
-       pfmul mm4, mm0
-       pfmul mm4, mm0                  ; mm4=rinvsix
-       movq  mm5, mm4  
-       pfmul mm5, mm5                  ; mm5=rinvtwelve
-
-       pfmul mm5, [esp + .c12]
-       pfmul mm4, [esp + .c6]  
-       movq mm6, mm5   ; mm6 is vnb12-vnb6
-       pfsub mm6, mm4
-
-       pfmul mm4, [esp + .six]
-
-       pfmul mm5, [esp + .twelve]
-       pfsub mm5, mm4
-       pfmul mm0, mm5        ; mm0 is total fscal now
-
-       ;; update vnbtot
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_vdw:  
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdw]
-       jz  .last_mno
-       jmp .mno_vdw
-       
-.last_mno:     
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms
-       add esp, 184
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-       
-
-proc inl3120_3dnow              ; 3DNow! loop: 3-site i molecule (O,H1,H2) vs single j atoms; table Coulomb on all sites, analytical LJ on the O site only
-%$nri          arg              ; outer-loop count (decremented once per i molecule; must be >= 1)
-%$iinr         arg              ; pointer into the i-index list, advanced 4 bytes per outer iteration
-%$jindex       arg              ; pointer into jindex[]: [n] and [n+1] bound the j list of this i
-%$jjnr         arg              ; base of the j-index list
-%$shift                arg      ; pointer into shift[], advanced 4 bytes per outer iteration
-%$shiftvec     arg              ; base of shift vectors (3 floats per entry)
-%$fshift       arg              ; base of shift-force accumulators (3 floats per entry)
-%$gid          arg              ; pointer into group-index list, advanced 4 bytes per outer iteration
-%$pos          arg              ; base of coordinates (3 floats per atom)
-%$faction      arg              ; base of force array (3 floats per atom)
-%$charge       arg              ; base of per-atom charges (floats)
-%$facel                arg      ; float factor multiplied into the i charges
-%$Vc           arg              ; Coulomb energy accumulators, indexed by gid
-%$type         arg              ; per-atom type indices
-%$ntype        arg              ; number of types (used for ntia = 2*ntype*type[ii])
-%$nbfp         arg              ; c6/c12 pairs, indexed by ntia + 2*type[j]
-%$Vnb          arg              ; LJ energy accumulators, indexed by gid
-%$tabscale      arg             ; float table scale (table points per unit distance)
-%$VFtab         arg             ; base of the table, 4 floats (read below as Y,F,G,H) per point
-                       ;; stack offsets (bytes from esp) for local variables - 244 bytes in total
-.is3         equ     0          ; 3*shift index
-.ii3         equ     4          ; 3*i index
-.ixO         equ     8          ; shifted O coordinates x,y,z (three consecutive floats)
-.iyO         equ    12
-.izO         equ    16 
-.ixH         equ    20          ; repeated (64bit) to fill 3dnow reg
-.iyH         equ    28          ; repeated (64bit) to fill 3dnow reg
-.izH         equ    36          ; repeated (64bit) to fill 3dnow reg
-.iqO         equ    44         ; repeated (64bit) to fill 3dnow reg
-.iqH         equ    52         ; repeated (64bit) to fill 3dnow reg
-.qqO         equ    60         ; repeated (64bit) to fill 3dnow reg
-.qqH         equ    68         ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    76          ; repeated (64bit) to fill 3dnow reg
-.vnbtot      equ    84          ; repeated (64bit) to fill 3dnow reg
-.c6          equ    92          ; repeated (64bit) to fill 3dnow reg
-.c12         equ    100         ; repeated (64bit) to fill 3dnow reg
-.six         equ    108         ; repeated (64bit) to fill 3dnow reg
-.twelve      equ    116         ; repeated (64bit) to fill 3dnow reg
-.two         equ    124         ; repeated (64bit) to fill 3dnow reg
-.n1          equ    132         ; integer table index, one per 3dnow half (8-byte slot)
-.tabscale    equ    140         ; repeated (64bit) to fill 3dnow reg
-.ntia        equ    148         ; integer 2*ntype*type[ii] (8-byte slot, only low dword used)
-.innerjjnr   equ    156         ; running pointer into jjnr[]
-.innerk      equ    160         ; j atoms left in the inner loop
-.fixO        equ    164         ; O force accumulator x,y,z (three consecutive floats)
-.fiyO        equ    168
-.fizO        equ    172
-.fixH        equ    176         ; repeated (64bit) to fill 3dnow reg
-.fiyH        equ    184         ; repeated (64bit) to fill 3dnow reg
-.fizH        equ    192         ; repeated (64bit) to fill 3dnow reg
-.dxO        equ    200          ; O-j distance vector x,y,z (three consecutive floats)
-.dyO        equ    204
-.dzO        equ    208
-.dxH        equ    212         ; repeated (64bit) to fill 3dnow reg
-.dyH        equ    220         ; repeated (64bit) to fill 3dnow reg
-.dzH        equ    228         ; repeated (64bit) to fill 3dnow reg
-.tmprsqH     equ    236                ; repeated (64bit) to fill 3dnow reg
-        push eax                ; save every general register used below
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 244            ;   local stack space
-       femms                   ;   fast enter of MMX/3DNow! state
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm1, [ebp + %$facel]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii0]
-       pfmul mm2, mm1
-       movq  [esp + .iqO], mm2         ;  iqO = facel*charge[ii] (low half; high half is 0)
-       
-       movd  mm2, [edx + ebx*4 + 4]    ;  mm2=charge[ii0+1]
-       pfmul mm2, mm1
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iqH], mm2         ;  iqH = facel*charge[ii0+1]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]
-       shl   edx, 1
-       mov   ecx, edx                  
-       imul  ecx, [ebp + %$ntype]      ; ecx = ntia = 2*ntype*type[ii0]
-       mov   [esp + .ntia], ecx
-               
-       movq  mm3, [mm_two]             ; mm_two/mm_six/mm_twelve are constants defined elsewhere in the file
-       movq  mm4, [mm_six]
-       movq  mm5, [mm_twelve]
-       movq  mm6, [ebp + %$tabscale]   ;  64-bit load; only the low dword (tabscale) survives the unpack
-       punpckldq mm6,mm6               ;  spread to both halves
-       movq  [esp + .two], mm3
-       movq  [esp + .six], mm4
-       movq  [esp + .twelve], mm5
-       movq  [esp + .tabscale], mm6          
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm5, [eax + ebx*4]        ;  move shX/shY to mm5 and shZ to mm6.
-       movd  mm6, [eax + ebx*4 + 8]
-       movq  mm0, mm5
-       movq  mm1, mm5
-       movq  mm2, mm6
-       punpckldq mm0,mm0               ; also expand shX,Y,Z in mm0--mm2.
-       punpckhdq mm1,mm1
-       punpckldq mm2,mm2               
-       
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm5, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm7, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx         ; (use mm7 as temp. storage for iz.)
-       pfadd mm6, mm7
-       movq  [esp + .ixO], mm5 
-       movq  [esp + .izO], mm6         ; 8-byte store also touches .ixH, which is written below
-
-       movd  mm3, [eax + ebx*4 + 12]
-       movd  mm4, [eax + ebx*4 + 16]
-       movd  mm5, [eax + ebx*4 + 20]
-       punpckldq  mm3, [eax + ebx*4 + 24]
-       punpckldq  mm4, [eax + ebx*4 + 28]
-       punpckldq  mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-       
-       pfadd mm0, mm3
-       pfadd mm1, mm4
-       pfadd mm2, mm5          
-       movq [esp + .ixH], mm0  
-       movq [esp + .iyH], mm1  
-       movq [esp + .izH], mm2  
-                                       
-       ;; clear vctot, vnbtot and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       movq  [esp + .vnbtot], mm7
-       movq  [esp + .fixO],   mm7
-       movd  [esp + .fizO],   mm7
-       movq  [esp + .fixH],   mm7
-       movq  [esp + .fiyH],   mm7
-       movq  [esp + .fizH],   mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   [esp + .innerk], edx       ;  must be >= 1: the loop below tests only after decrementing
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-.inner_loop:   
-       ;; one j particle per iteration (this loop has no unrolled variant).
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-        add   [esp + .innerjjnr], dword 4 ; advance pointer 
-       ;prefetch [ecx + 16]       ; prefetch data - trial and error says 16 is best
-
-       mov ecx, [ebp + %$charge]
-       movd mm7, [ecx + eax*4]
-       punpckldq mm7,mm7
-       movq mm6,mm7
-       pfmul mm6, [esp + .iqO]
-       pfmul mm7, [esp + .iqH] ;  mm6=qqO, mm7=qqH
-       movq [esp + .qqO], mm6  ;  full 8-byte store: high half (0) must be defined, .qqO is read as 64 bits below
-       movq [esp + .qqH], mm7
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr]
-       mov ecx, [ebp + %$nbfp]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [ecx + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [ecx + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5  
-                       
-       lea   eax, [eax + eax*2]        ;  replace jnr with j3
-       
-       movq  mm0, [esi + eax*4]
-       movd  mm1, [esi + eax*4 + 8]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  mm0=rsqO (low half only is meaningful)
-       
-       punpckldq mm2, mm2      ;  (already spread above - these three are no-ops)
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-       
-        pfrsqrt mm1,mm0        ;  seed for 1/sqrt(rsqO)
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-
-       pfmul mm0, mm1          ;  mm0=r
-
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0          ;  integer part of r*tabscale
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2              ;  4 floats per table point
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqO] ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqO] ; fijC=qq*FF
-       ;; update vctot directly, use mm3 for fscal sum.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-
-       movq mm3, mm7
-       pfmul mm3, [esp + .tabscale]
-       
-       ; nontabulated LJ - mm1 is invsqrt. - keep mm1!
-       movq mm0, mm1
-       pfmul mm0, mm0          ;  mm0 is invsq
-       movq mm2, mm0
-       pfmul mm2, mm0
-       pfmul mm2, mm0          ; mm2 = rinvsix
-       movq mm4, mm2
-       pfmul mm4, mm4          ;  mm4=rinvtwelve
-
-       pfmul mm4, [esp + .c12]
-       pfmul mm2, [esp + .c6]
-       movq mm5, mm4
-       pfsub mm5, mm2          ; mm5=vnb12-vnb6
-
-       pfmul mm2, [esp + .six]
-       pfmul mm4, [esp + .twelve]
-       pfsub mm4, mm2
-       pfmul mm4, mm1        ; mm4=(12*vnb12-6*vnb6)*rinv
-
-       pfsubr mm3, mm4       ; mm3 = mm4 - fijC*tabscale
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now 
-       
-       ;; update vnbtot 
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-       
-       ;; Ready with the oxygen - potential is updated, fscal is in mm3.
-       ;; now do the two hydrogens.
-       movq mm0, [esp + .tmprsqH] ; mm0=rsqH
-
-       pfrsqrt mm1, mm0        ;  seed from the low half (H1)
-       pswapd mm0,mm0          ;  swap halves so the H2 value is in the low half
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0          ;  restore original order
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4   ; both table indices (H1 low, H2 high)
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]        ; second index (H2) goes into the high halves
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqH] ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqH] ; fijC=qq*FF
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7
-       pfmul mm4, [esp + .tabscale]    
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now      
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax*4 + 8]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-       
-       ;;  done  - one more?
-       dec dword [esp + .innerk]
-       jz  .updateouterdata
-       jmp .inner_loop
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment iO force 
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]       ;  64-bit read; only the low half is stored below
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       movq  mm0, [esp + .fixH]
-       movq  mm3, [esp + .fiyH]
-       movq  mm1, [esp + .fizH]
-       movq  mm2, mm0
-       punpckldq mm0, mm3      ;  mm0(l)=fxH1, mm0(h)=fyH1
-       punpckhdq mm2, mm3      ;  mm2(l)=fxH2, mm2(h)=fyH2
-       movq mm3, mm1
-       pswapd mm3,mm3          
-       ;;  mm1 is fzH1
-       ;;  mm3 is fzH2
-       
-       movq  mm6, [edi + ecx*4 + 12]       ;  increment iH1 force
-       movd  mm7, [edi + ecx*4 + 20]   
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       movq  [edi + ecx*4 + 12],  mm6
-       movd  [edi + ecx*4 + 20],  mm7
-       
-       movq  mm6, [edi + ecx*4 + 24]       ;  increment iH2 force
-       movd  mm7, [edi + ecx*4 + 32]   
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [edi + ecx*4 + 24],  mm6
-       movd  [edi + ecx*4 + 32],  mm7
-
-       
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-       
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  same for Vnb.
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnb[gid]
-       ;; finish if last
-       dec dword [ebp + %$nri]
-       jz  .end
-       ;;  not last, iterate once more!
-       jmp .outer
-.end:
-       femms                   ; leave MMX/3DNow! state before returning
-       add esp, 244            ; release the local frame
-       pop edi                 ; restore registers in reverse push order
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-
-proc inl3130_3dnow
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-%$tabscale      arg
-%$VFtab         arg
-                       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.ixO         equ     8
-.iyO         equ    12
-.izO         equ    16 
-.ixH         equ    20          ; repeated (64bit) to fill 3dnow reg
-.iyH         equ    28          ; repeated (64bit) to fill 3dnow reg
-.izH         equ    36          ; repeated (64bit) to fill 3dnow reg
-.qqOO        equ    44         ; repeated (64bit) to fill 3dnow reg
-.qqOH        equ    52         ; repeated (64bit) to fill 3dnow reg
-.qqHH        equ    60         ; repeated (64bit) to fill 3dnow reg
-.c6          equ    68         ; repeated (64bit) to fill 3dnow reg
-.c12         equ    76         ; repeated (64bit) to fill 3dnow reg
-.six         equ    84         ; repeated (64bit) to fill 3dnow reg
-.twelve      equ    92         ; repeated (64bit) to fill 3dnow reg
-.two         equ    100        ; repeated (64bit) to fill 3dnow reg
-.n1         equ    108         ; repeated (64bit) to fill 3dnow reg
-.tabscale    equ    116        ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    124         ; repeated (64bit) to fill 3dnow reg
-.vnbtot      equ    132         ; repeated (64bit) to fill 3dnow reg
-.innerjjnr   equ    140
-.innerk      equ    144        
-.fixO        equ    148
-.fiyO        equ    152
-.fizO        equ    156
-.fixH        equ    160         ; repeated (64bit) to fill 3dnow reg
-.fiyH        equ    168         ; repeated (64bit) to fill 3dnow reg
-.fizH        equ    176         ; repeated (64bit) to fill 3dnow reg
-.dxO        equ    184
-.dyO        equ    188
-.dzO        equ    192
-.dxH        equ    200         ; repeated (64bit) to fill 3dnow reg
-.dyH        equ    208         ; repeated (64bit) to fill 3dnow reg
-.dzH        equ    216         ; repeated (64bit) to fill 3dnow reg
-.tmprsqH     equ    224         ; repeated (64bit) to fill 3dnow reg
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 232            ;   local stack space
-       femms
-       ;; assume we have at least one i particle - start directly      
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm1, [ebp + %$facel]      ;  mm1=facel
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii0] (O)
-       movd  mm3, [edx + ebx*4 + 4]    ;  mm2=charge[ii0+1] (H)
-       movq  mm4, mm2  
-       pfmul mm4, mm1
-       movq  mm6, mm3
-       pfmul mm6, mm1
-       movq  mm5, mm4
-       pfmul mm4, mm2                  ; mm4=qqOO*facel
-       pfmul mm5, mm3                  ; mm5=qqOH*facel
-       pfmul mm6, mm3                  ; mm6=qqHH*facel
-       punpckldq mm5,mm5               ;  spread to both halves
-       punpckldq mm6,mm6               ;  spread to both halves
-       movq  [esp + .qqOO], mm4
-       movq  [esp + .qqOH], mm5
-       movq  [esp + .qqHH], mm6
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       mov   edx, ecx
-       imul  ecx, [ebp + %$ntype]
-       add   edx, ecx
-       mov   eax, [ebp + %$nbfp]
-       movd  mm0, [eax + edx*4]
-       movd  mm1, [eax + edx*4 + 4]
-       movq  [esp + .c6], mm0
-       movq  [esp + .c12], mm1
-       movq  mm2, [mm_two]
-       movq  mm3, [mm_six]
-       movq  mm4, [mm_twelve]
-       movq  [esp + .two], mm2
-       movq  [esp + .six], mm3
-       movq  [esp + .twelve], mm4
-       movd  mm5, [ebp + %$tabscale]
-       punpckldq mm5,mm5
-       movq  [esp + .tabscale], mm5
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm5, [eax + ebx*4]        ;  move shX/shY to mm5 and shZ to mm6.
-       movd  mm6, [eax + ebx*4 + 8]
-       movq  mm0, mm5
-       movq  mm1, mm5
-       movq  mm2, mm6
-       punpckldq mm0,mm0               ; also expand shX,Y,Z in mm0--mm2.
-       punpckhdq mm1,mm1
-       punpckldq mm2,mm2               
-       
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       pfadd mm5, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm7, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx         ; (use mm7 as temp. storage for iz.)
-       pfadd mm6, mm7
-       movq  [esp + .ixO], mm5 
-       movq  [esp + .izO], mm6
-
-       movd  mm3, [eax + ebx*4 + 12]
-       movd  mm4, [eax + ebx*4 + 16]
-       movd  mm5, [eax + ebx*4 + 20]
-       punpckldq  mm3, [eax + ebx*4 + 24]
-       punpckldq  mm4, [eax + ebx*4 + 28]
-       punpckldq  mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-       
-       pfadd mm0, mm3
-       pfadd mm1, mm4
-       pfadd mm2, mm5          
-       movq [esp + .ixH], mm0  
-       movq [esp + .iyH], mm1  
-       movq [esp + .izH], mm2  
-
-       ;; clear vctot and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       movq  [esp + .vnbtot], mm7
-       movq  [esp + .fixO],  mm7
-       movq  [esp + .fizO],  mm7
-       movq  [esp + .fixH],  mm7
-       movq  [esp + .fiyH],  mm7
-       movq  [esp + .fizH],  mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-.inner_loop:
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-        add   [esp + .innerjjnr], dword 4 ; advance pointer 
-
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esi + eax*4]
-       movd  mm1, [esi + eax*4 + 8]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm0
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt OO
-       pfmul mm0, mm1          ;  mm0=rsq OO
-
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOO]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOO]        ; fijC=qq*FF
-
-       ;; update vctot directly, use mm3 for fscal sum.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       movq mm3, mm7
-       pfmul mm3, [esp + .tabscale]
-       
-       movq mm5, mm1
-       pfmul mm5,mm5
-       movq mm4, mm5
-       pfmul mm4,mm5
-       pfmul mm4,mm5
-       movq mm5, mm4
-       pfmul mm5,mm5   ; mm4=rinvsix, mm5=rinvtwelve
-
-       pfmul mm4, [esp + .c6]
-       pfmul mm5, [esp + .c12]
-       movq mm6,mm5
-       pfsub mm6,mm4
-
-       pfmul mm4, [esp + .six]
-       pfmul mm5, [esp + .twelve]
-       pfsub mm5,mm4
-       pfmul mm5, mm1
-       pfsubr mm3, mm5
-
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now 
-       
-       ;; update vnbtot 
-       pfadd mm6, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm6       ;  store the sum       
-       
-       ;; Ready with the oxygen - potential is updated, fscal is in mm3.
-       ;; time for hydrogens!
-
-       movq mm0, [esp + .tmprsqH]
-
-       pfrsqrt mm1, mm0
-       pswapd mm0,mm0
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOH]        ; fijC=qq*FF
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7  
-       pfmul mm4, [esp + .tabscale]
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now      
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax*4 + 8]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-
-       ; interactions with j H1.
-
-       movq  mm0, [esi + eax*4 + 12]
-       movd  mm1, [esi + eax*4 + 20]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       pfmul mm0, mm1          ;  mm0=rsq 
-       
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOH]        ; fijC=qq*FF
-
-       ;; update vctot directly, force is moved to mm3.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       pxor mm3, mm3
-       pfsub mm3, mm7
-       pfmul mm3, [esp + .tabscale]
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now 
-
-       movq mm0, [esp + .tmprsqH]
-
-       pfrsqrt mm1, mm0
-       pswapd mm0,mm0
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqHH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqHH]        ; fijC=qq*FF
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7  
-       pfmul mm4, [esp + .tabscale]
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now              
-
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4 + 12]
-       movd mm3,  [edi + eax*4 + 20]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4 + 12], mm2
-       movd [edi + eax*4 + 20], mm3
-
-       ; interactions with j H2
-       movq  mm0, [esi + eax*4 + 24]
-       movd  mm1, [esi + eax*4 + 32]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       pfmul mm0, mm1
-
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOH]        ; fijC=qq*FF
-
-       ;; update vctot directly, use mm3 for fscal sum.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       pxor mm3,mm3
-       pfsub mm3, mm7
-       pfmul mm3, [esp + .tabscale]
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now 
-
-       movq mm0, [esp + .tmprsqH]
-
-       pfrsqrt mm1, mm0
-       pswapd mm0,mm0
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqHH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqHH]        ; fijC=qq*FF
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7  
-       pfmul mm4, [esp + .tabscale]
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now      
-
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4 
-
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4 + 24]
-       movd mm3,  [edi + eax*4 + 32]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4 + 24], mm2
-       movd [edi + eax*4 + 32], mm3
-       
-       ;;  done  - one more?
-       dec dword [esp + .innerk]
-       jz  .updateouterdata
-       jmp .inner_loop 
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment iO force 
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       movq  mm0, [esp + .fixH]
-       movq  mm3, [esp + .fiyH]
-       movq  mm1, [esp + .fizH]
-       movq  mm2, mm0
-       punpckldq mm0, mm3      ;  mm0(l)=fxH1, mm0(h)=fyH1
-       punpckhdq mm2, mm3      ;  mm2(l)=fxH2, mm2(h)=fyH2
-       movq mm3, mm1
-       pswapd mm3,mm3          
-       ;;  mm1 is fzH1
-       ;;  mm3 is fzH2
-
-       movq  mm6, [edi + ecx*4 + 12]       ;  increment iH1 force
-       movd  mm7, [edi + ecx*4 + 20]   
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       movq  [edi + ecx*4 + 12],  mm6
-       movd  [edi + ecx*4 + 20],  mm7
-       
-       movq  mm6, [edi + ecx*4 + 24]       ;  increment iH2 force
-       movd  mm7, [edi + ecx*4 + 32]   
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [edi + ecx*4 + 24],  mm6
-       movd  [edi + ecx*4 + 32],  mm7
-
-       
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-       
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnbtot[gid]
-       ;; finish if last
-       dec dword [ebp + %$nri]
-       jz  .end
-       ;;  not last, iterate once more!
-       jmp .outer
-.end:
-       femms
-       add esp, 232
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-proc inl3300_3dnow
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-%$tabscale      arg
-%$VFtab         arg
-       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.ix          equ     8
-.iy          equ    12
-.iz          equ    16
-.iq                 equ    20           ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    28           ; repeated (64bit) to fill 3dnow reg
-.vnbtot      equ    36           ; repeated (64bit) to fill 3dnow reg
-.c6          equ    44           ; repeated (64bit) to fill 3dnow reg
-.c12         equ    52           ; repeated (64bit) to fill 3dnow reg
-.two         equ    60           ; repeated (64bit) to fill 3dnow reg
-.n1          equ    68           ; repeated (64bit) to fill 3dnow reg
-.tabscale    equ    76           ; repeated (64bit) to fill 3dnow reg
-.ntia       equ    84
-.innerjjnr   equ    88
-.innerk      equ    92         
-.fix         equ    96
-.fiy         equ    100
-.fiz        equ    104
-.dx1        equ    108
-.dy1        equ    112
-.dz1        equ    116
-.dx2        equ    120
-.dy2        equ    124
-.dz2        equ    128                                         
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 132            ;   local stack space
-       femms
-       ; move data to local stack 
-       movq  mm0, [mm_two]
-       movd  mm3, [ebp + %$tabscale]
-       movq  [esp + .two],    mm0
-       punpckldq mm3,mm3
-       movq  [esp + .tabscale], mm3    
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm0, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm3, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx
-       pfadd mm1, mm3
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-                               
-       ;; clear total potential and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot],  mm7
-       movq  [esp + .vnbtot], mm7
-       movq  [esp + .fix],    mm7
-       movd  [esp + .fiz],    mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 2
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .finish_inner
-.unroll_loop:
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-        punpckldq mm3, [ecx + ebx*4]     ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-
-       ; dispersion table
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4 + 16]
-       punpckldq mm5, [edx + ecx*4 + 20]
-       punpckldq mm6, [edx + ecx*4 + 24]
-       punpckldq mm7, [edx + ecx*4 + 28]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       pfadd mm3, mm7  ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 32]
-       movd mm5, [edx + ecx*4 + 36]
-       movd mm6, [edx + ecx*4 + 40]
-       movd mm7, [edx + ecx*4 + 44]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4 + 32]
-       punpckldq mm5, [edx + ecx*4 + 36]
-       punpckldq mm6, [edx + ecx*4 + 40]
-       punpckldq mm7, [edx + ecx*4 + 44]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijC+fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm0, [esp + .tabscale]
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner:                         ; reached once the "sub innerk, 2" above went negative
-       and [esp + .innerk], dword 1    ; keep only bit 0 of innerk (written back); ZF clear => one unpaired j left
-       jnz  .single_inner              ; odd remainder: do one single-particle iteration
-       jmp  .updateouterdata           ; nothing left for this i particle
-.single_inner:
-       ;; one leftover j particle: same sequence as the unrolled loop, but for a single jnr.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]       ; base of charge[]
-       movd mm5, [esp + .iq]           ; low dword of iq
-       movd mm3, [ecx + eax*4]         ; charge[jnr]
-       pfmul mm3, mm5          ;  mm3=qq
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type[jnr]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = c6 = nbfp[tja] (movd clears the upper half)
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = c12 = nbfp[tja+1]
-       movq [esp + .c12], mm5
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]        ; eax = 3*jnr (index of the j coordinates / forces)
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0                 ; mm4 = mm0 - mm4, i.e. dr = ir - jr (x,y)
-       pfsubr mm5, mm1                 ; same for z
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5          ;  mm4 = (dx*dx+dy*dy, dz*dz)
-       pfacc mm4, mm5          ;  rsq now in the lower half of mm4
-       
-        pfrsqrt mm0,mm4        ; inverse square root seed
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm0=invsqrt
-       pfmul mm4, mm0          ;  rsq*invsqrt = r
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4           ; integer table index n0
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]  ; ecx = 3*n0
-       shl ecx, 2              ; ecx = 12*n0; with the *4 scale below each table point spans 12 floats
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]         ; value added to eps*Fp below to form VV
-       movd mm5, [edx + ecx*4 + 4]     ; F
-       movd mm6, [edx + ecx*4 + 8]     ; G
-       movd mm7, [edx + ecx*4 + 12]    ; H
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-       
-       ; dispersion table (ecx still holds 12*n0; entries at byte offsets 16..28)
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       pfadd mm3, mm7  ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table (entries at byte offsets 32..44 of the same table point)
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 32]
-       movd mm5, [edx + ecx*4 + 36]
-       movd mm6, [edx + ecx*4 + 40]
-       movd mm7, [edx + ecx*4 + 44]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijC+fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3          ; mm1 = 0 - mm3
-       pfmul mm0, [esp + .tabscale]    ; invsqrt*tabscale
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;; update vnbtot (mm5 still holds vnb12)
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-
-       pfmul mm2, mm0          ; (fx,fy) = fscal*(dx,dy)
-       pfmul mm3, mm0          ; fz = fscal*dz
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force (subtract the same vector at index eax = 3*jnr)
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done! falls through into the per-i-particle update below
-.updateouterdata:                      ; flush the accumulated i force and potentials, then loop or exit
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]        ; 64-bit operand, but only the low dword is stored below
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnb[gid]
-
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end                    ; ecx reached zero: no i particles left
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx        ; store the decremented count back into the argument slot
-       jmp .outer
-.end:                                  ; common exit path of this procedure
-       femms                           ; leave MMX/3DNow! state before returning
-       add esp, 132                    ; release the local stack area (matching sub is above this view)
-       pop edi                         ; restore saved registers; the pushes are outside this view
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-
-proc inl3310_3dnow
-%$nri          arg
-%$iinr         arg             ; pointer into iinr[]; advanced 4 bytes per outer iteration
-%$jindex       arg             ; pointer into jindex[]; advanced 4 bytes per outer iteration
-%$jjnr         arg             ; base of jjnr[]
-%$shift                arg             ; pointer into shift[]; advanced 4 bytes per outer iteration
-%$shiftvec     arg             ; base of shiftvec[]
-%$fshift       arg             ; base of fshift[]
-%$gid          arg
-%$pos          arg             ; base of pos[]
-%$faction      arg             ; base of faction[] (kept in edi)
-%$charge       arg             ; base of charge[]
-%$facel                arg             ; float factor multiplied into charge[ii]
-%$Vc           arg                     
-%$type         arg             ; base of type[]
-%$ntype        arg             ; multiplied with type[ii] to form ntia
-%$nbfp         arg             ; base of the c6/c12 parameter array
-%$Vnb          arg
-%$tabscale      arg            ; float, spread to both halves of .tabscale below
-%$VFtab         arg            ; base of the potential/force table
-%$nsatoms       arg            ; pointer to 3 ints per i entry; advanced 12 bytes per outer iteration
-       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.shX        equ     8
-.shY         equ    12 
-.shZ        equ    16  
-.ix          equ    20
-.iy          equ    24
-.iz          equ    28 
-.iq          equ    32          ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    40           ; repeated (64bit) to fill 3dnow reg
-.vnbtot      equ    48           ; repeated (64bit) to fill 3dnow reg
-.c6          equ    56           ; 64bit: c6 of 1st / 2nd j particle
-.c12         equ    64           ; 64bit: c12 of 1st / 2nd j particle
-.two         equ    72           ; repeated (64bit) to fill 3dnow reg
-.n1          equ    80           ; 64bit: table index of 1st / 2nd j particle
-.tabscale    equ    88           ; repeated (64bit) to fill 3dnow reg
-.ntia       equ    96  
-.innerjjnr0  equ    100
-.innerk0     equ    104        
-.innerjjnr   equ    108
-.innerk      equ    112        
-.fix         equ    116
-.fiy         equ    120
-.fiz        equ    124
-.dx1        equ    128
-.dy1        equ    132
-.dz1        equ    136
-.dx2        equ    140
-.dy2        equ    144
-.dz2        equ    148                                                         
-.nsvdwc      equ    152
-.nscoul      equ    156
-.nsvdw       equ    160
-.solnr      equ    164         
-        push eax               ; save the six general registers used below
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 168            ;  local stack space (covers offsets 0..167 defined above)
-       femms
-       movq  mm0, [mm_two]     ; mm_two is defined outside this view
-       movd  mm3, [ebp + %$tabscale]
-       movq  [esp + .two],    mm0
-       punpckldq mm3,mm3       ; tabscale in both halves
-       movq  [esp + .tabscale], mm3    
-       ;; assume we have at least one i particle - start directly              
-.outer:                                ; per-entry setup: shift vector, atom counts, neighbour range
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm0, [eax + ebx*4]        ;  move shX/shY to mm0 and shZ to mm1.
-       movd  mm1, [eax + ebx*4 + 8]
-       movq  [esp + .shX], mm0
-       movd  [esp + .shZ], mm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       add   [ebp + %$nsatoms], dword 12       ; three ints consumed per outer iteration
-       mov   ecx, [eax]                ; first int
-       mov   edx, [eax + 4]            ; second int
-       mov   eax, [eax + 8]            ; third int
-       sub   ecx, eax                  ; ecx = first - third
-       sub   eax, edx                  ; eax = third - second
-       
-       mov   [esp + .nsvdwc], edx      ; nsvdwc = second
-       mov   [esp + .nscoul], eax      ; nscoul = third - second
-       mov   [esp + .nsvdw], ecx       ; nsvdw  = first - third
-               
-       ;; clear potential
-       pxor  mm7,mm7
-       movq  [esp + .vctot],  mm7
-       movq  [esp + .vnbtot], mm7
-       mov   [esp + .solnr],  ebx      ; solnr starts at ii and is incremented per atom handled
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2                     ;  byte offset of jjnr[nj0]
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]
-       
-       mov   ecx, [esp + .nsvdwc]
-       cmp   ecx, dword 0
-       jnz   .mno_vdwc                  ;  at least one atom for the vdwc loop
-       jmp   .testcoul                  ;  none: skip to the coulomb-count test
-.mno_vdwc:                             ; set up one i atom (charge, type row, position) and start its j loop
-       mov   ebx,  [esp + .solnr]      ;  ebx = current atom index ii
-       inc   dword [esp + .solnr]      ;  next pass uses the following atom
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]      ;  only the low half matters; it is duplicated next
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        ;  type[ii]
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx        ;  ntia = 2*ntype*type[ii]
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]         ;  shifted i position (x,y)
-       pfadd mm1, [esp + .shZ]         ;  z; only the low dword is stored below
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx   ;  restart the neighbour pointer for this atom
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms minus 2 (mov keeps the flags of sub)
-       jge   .unroll_vdwc_loop         ;  at least two j particles: run the paired loop
-       jmp   .finish_vdwc_inner
-.unroll_vdwc_loop:     
-       ;; paired innerloop here: two j particles per pass, one in each half of the mm registers.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-        punpckldq mm3, [ecx + ebx*4]     ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0                  ;  rsq*invsqrt = r
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movq [esp + .n1], mm4           ; both integer table indices: .n1 (1st pair) and .n1+4 (2nd pair)
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]  ; ecx = 3*n0
-       shl ecx, 2              ; ecx = 12*n0; with the *4 scale below each table point spans 12 floats
-       ; load all the table values we need (coulomb part: byte offsets 0..12)
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]        ; index for the second pair
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]    ; second pair's values go to the high halves
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-
-       ; dispersion table (byte offsets 16..28 of each table point)
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4 + 16]
-       punpckldq mm5, [edx + ecx*4 + 20]
-       punpckldq mm6, [edx + ecx*4 + 24]
-       punpckldq mm7, [edx + ecx*4 + 28]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       pfadd mm3, mm7  ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table (byte offsets 32..44 of each table point)
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 32]
-       movd mm5, [edx + ecx*4 + 36]
-       movd mm6, [edx + ecx*4 + 40]
-       movd mm7, [edx + ecx*4 + 44]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4 + 32]
-       punpckldq mm5, [edx + ecx*4 + 36]
-       punpckldq mm6, [edx + ecx*4 + 40]
-       punpckldq mm7, [edx + ecx*4 + 44]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijC+fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3          ; mm1 = 0 - mm3
-       pfmul mm0, [esp + .tabscale]    ; invsqrt*tabscale
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch the stack line at .dx1 (the i force slots .fix/.fiz sit just below it)
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0       ; mm0 = fscal of 1st pair in both halves
-       punpckhdq mm1,mm1       ; mm1 = fscal of 2nd pair in both halves
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot (mm5 still holds vnb12)
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_vdwc_inner        ; fewer than two j particles left
-       jmp   .unroll_vdwc_loop
-.finish_vdwc_inner:                    ; innerk is negative here (count-2 went below zero)
-       and [esp + .innerk], dword 1    ; keep only bit 0 of innerk (written back); ZF clear => one unpaired j left
-       jnz  .single_vdwc_inner         ; odd remainder: do one single-particle iteration
-       jmp  .updateouterdata_vdwc      ; nothing left for this i atom
-.single_vdwc_inner:    
-       ;; one leftover j particle: same sequence as the unrolled loop, but for a single jnr.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]       ; base of charge[]
-       movd mm5, [esp + .iq]           ; low dword of iq
-       movd mm3, [ecx + eax*4]         ; charge[jnr]
-       pfmul mm3, mm5          ;  mm3=qq
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type[jnr]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = c6 = nbfp[tja] (movd clears the upper half)
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = c12 = nbfp[tja+1]
-       movq [esp + .c12], mm5
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]        ; eax = 3*jnr (index of the j coordinates / forces)
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0                 ; mm4 = mm0 - mm4, i.e. dr = ir - jr (x,y)
-       pfsubr mm5, mm1                 ; same for z
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5          ;  mm4 = (dx*dx+dy*dy, dz*dz)
-       pfacc mm4, mm5          ;  rsq now in the lower half of mm4
-       
-        pfrsqrt mm0,mm4        ; inverse square root seed
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm0=invsqrt
-       pfmul mm4, mm0          ;  rsq*invsqrt = r
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4           ; integer table index n0
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]  ; ecx = 3*n0
-       shl ecx, 2              ; ecx = 12*n0; with the *4 scale below each table point spans 12 floats
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]         ; value added to eps*Fp below to form VV
-       movd mm5, [edx + ecx*4 + 4]     ; F
-       movd mm6, [edx + ecx*4 + 8]     ; G
-       movd mm7, [edx + ecx*4 + 12]    ; H
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-       
-       ; dispersion table (ecx still holds 12*n0; entries at byte offsets 16..28)
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       pfadd mm3, mm7  ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table (entries at byte offsets 32..44 of the same table point)
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 32]
-       movd mm5, [edx + ecx*4 + 36]
-       movd mm6, [edx + ecx*4 + 40]
-       movd mm7, [edx + ecx*4 + 44]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijC+fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3          ; mm1 = 0 - mm3
-       pfmul mm0, [esp + .tabscale]    ; invsqrt*tabscale
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;; update vnbtot (mm5 still holds vnb12)
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-
-       pfmul mm2, mm0          ; (fx,fy) = fscal*(dx,dy)
-       pfmul mm3, mm0          ; fz = fscal*dz
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force (subtract the same vector at index eax = 3*jnr)
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done! falls through into the per-atom force update below
-.updateouterdata_vdwc:                 ; add this i atom's accumulated force to faction[] and fshift[]
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]        ; 64-bit operand (.fiz and .dx1), but only the low dword is stored below
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdwc]      ; one fewer atom left for this loop
-       jz  .testcoul                  ; count exhausted: move on to the coulomb-count test
-       jmp .mno_vdwc
-.testcoul:                             ; entered when the vdwc atom count is (or has become) zero
-       mov  ecx, [esp + .nscoul]       ; count stored by .outer (third - second nsatoms entry)
-       cmp  ecx, byte 0
-       jnz  .mno_coul                  ; at least one atom for the coulomb loop
-       jmp  .testvdw                   ; none: continue at .testvdw (defined past this view)
-.mno_coul:                             ; set up one i atom (charge and position only) and start its j loop
-       mov   ebx,  [esp + .solnr]      ;  ebx = current atom index ii
-       inc   dword [esp + .solnr]      ;  next pass uses the following atom
-       mov   edx, [ebp + %$charge]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii]
-       pfmul mm2, [ebp + %$facel]      ;  only the low half matters; it is duplicated next
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iq], mm2          ;  iq =facel*charge[ii]
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]         ;  shifted i position (x,y)
-       pfadd mm1, [esp + .shZ]         ;  z; only the low dword is stored below
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx   ;  restart the neighbour pointer for this atom
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms minus 2 (mov keeps the flags of sub)
-       jge   .unroll_coul_loop         ;  at least two j particles: run the paired loop
-       jmp   .finish_coul_inner
-.unroll_coul_loop:     
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$charge]        ; base of charge[]
-       movq mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]          ; charge[jnr1]
-        punpckldq mm3, [ecx + ebx*4]     ; move charge 2 to high part of mm3 
-       pfmul mm3,mm5                    ;  mm3 now has qq for both particles
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ; coulomb table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3
-       pfmul mm1, [esp + .tabscale]    
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_coul_inner
-       jmp   .unroll_coul_loop
-.finish_coul_inner:    
-       and [esp + .innerk], dword 1
-       jnz  .single_coul_inner
-       jmp  .updateouterdata_coul              
-.single_coul_inner:    
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov ecx, [ebp + %$charge]
-       movd mm5, [esp + .iq]
-       movd mm3, [ecx + eax*4]
-       pfmul mm3, mm5          ;  mm3=qq
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm0=rsq
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm1=invsqrt
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, mm3  ; vcoul=qq*VV
-       pfmul mm3, mm7  ; fijC=FF*qq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       pfadd mm5, [esp + .vctot]      ; add the earlier value
-       movq [esp + .vctot], mm5       ;  store the sum       
-       
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm0, [esp + .tabscale]
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_coul: 
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nscoul]
-       jz  .testvdw
-       jmp .mno_coul
-.testvdw
-       mov  ecx, [esp + .nsvdw]
-       cmp  ecx, byte 0
-       jnz  .mno_vdw
-       jmp  .last_mno
-.mno_vdw:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-       mov   edx, [ebp + %$type]       
-       mov   edx, [edx + ebx*4]        
-       imul  edx, [ebp + %$ntype]
-       shl   edx, 1
-       mov   [esp + .ntia], edx        
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-       
-       movq  mm0, [eax + ebx*4]
-       movd  mm1, [eax + ebx*4 + 8]
-       pfadd mm0, [esp + .shX]
-       pfadd mm1, [esp + .shZ]
-       movq  [esp + .ix], mm0  
-       movd  [esp + .iz], mm1  
-
-       ;; clear forces.
-       pxor  mm7,mm7
-       movq  [esp + .fix],   mm7
-       movd  [esp + .fiz],   mm7
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 2
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdw_loop
-       jmp   .finish_vdw_inner
-.unroll_vdw_loop:      
-       ;; paired innerloop here.
-       mov   ecx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]             ; eax/ebx=jnr 
-       add   [esp + .innerjjnr], dword 8 ; advance pointer (unrolled 2) 
-       prefetch [ecx + 16]              ; prefetch data - trial and error says 16 is best
-       
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       mov ecx, [ecx + ebx*4]           ; type [jnr2]
-
-       mov esi, [ebp + %$nbfp]          ; base of nbfp 
-       shl edx, 1
-       shl ecx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       add ecx, [esp + .ntia]
-
-       movq mm5, [esi + edx*4]         ; mm5 = 1st c6 / c12            
-       movq mm7, [esi + ecx*4]         ; mm7 = 2nd c6 / c12    
-       movq mm6,mm5                    
-       punpckldq mm5,mm7               ; mm5 = 1st c6 / 2nd c6  
-       punpckhdq mm6,mm7               ; mm6 = 1st c12 / 2nd c12
-       movq [esp + .c6], mm5
-       movq [esp + .c12], mm6
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]                
-
-       mov   esi, [ebp + %$pos]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]          
-       movq  mm4, [esi + eax*4]         ;  fetch first j coordinates 
-       movd  mm5, [esi + eax*4 + 8]            
-       pfsubr mm4,mm0                   ;  dr = ir - jr 
-       pfsubr mm5,mm1
-       movq  [esp + .dx1], mm4          ;  store dr
-       movd  [esp + .dz1], mm5
-       pfmul mm4,mm4                    ;  square dx,dy,dz                      
-       pfmul mm5,mm5           
-       pfacc mm4, mm5                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm4, mm5                   ;  first rsq in lower mm4
-
-       movq  mm6, [esi + ebx*4]         ;  fetch second j coordinates 
-       movd  mm7, [esi + ebx*4 + 8]
-       
-       pfsubr mm6,mm0                   ;  dr = ir - jr 
-       pfsubr mm7,mm1
-       movq  [esp + .dx2], mm6          ;  store dr
-       movd  [esp + .dz2], mm7
-       pfmul mm6,mm6                    ;  square dx,dy,dz
-       pfmul mm7,mm7
-       pfacc mm6, mm7                   ;  accumulate to get dx*dx+dy*dy+dz*dz
-       pfacc mm6, mm7                   ;  second rsq in lower mm6
-
-        pfrsqrt mm0, mm4                ; lookup inverse square root seed 
-        pfrsqrt mm1, mm6
- 
-
-       punpckldq mm0,mm1
-       punpckldq mm4,mm6               ;  now 4 has rsq and 0 the seed for both pairs.
-        movq mm2,mm0                   ; amd 3dnow N-R iteration to get full precision.
-       pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-       ;; do potential and fscal
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-       
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       ; dispersion table
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]  
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       movq mm3, mm7   ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4 + 16]
-       punpckldq mm5, [edx + ecx*4 + 20]
-       punpckldq mm6, [edx + ecx*4 + 24]
-       punpckldq mm7, [edx + ecx*4 + 28]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm1, [esp + .tabscale]
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       prefetchw [esp + .dx1]  ;  prefetch i forces to cache
-
-       ;;  spread fscalar to both positions
-       movq mm1,mm0
-       punpckldq mm0,mm0
-       punpckhdq mm1,mm1
-
-       ;; calc vector force
-       prefetchw [edi + eax*4] ; prefetch the 1st faction to cache
-       movq mm2,  [esp + .dx1] ; fetch dr
-       movd mm3,  [esp + .dz1]
-
-       ;; update vnbtot
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       prefetchw [edi + ebx*4] ; prefetch the 2nd faction to cache
-       pfmul mm2, mm0          ; mult by fs 
-       pfmul mm3, mm0
-
-       movq mm4,  [esp + .dx2]         ; fetch dr
-       movd mm5,  [esp + .dz2]
-       pfmul mm4, mm1          ; mult by fs 
-       pfmul mm5, mm1
-       ;; update i forces
-
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-
-       pfadd mm0, mm4
-       pfadd mm1, mm5
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j forces
-
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax*4 + 8]
-       movq mm6,  [edi + ebx*4]
-       movd mm7,  [edi + ebx*4 + 8]
-       
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       pfsub mm6, mm4
-       pfsub mm7, mm5
-       
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       movq [edi + ebx*4], mm6
-       movd [edi + ebx*4 + 8], mm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 2
-       jl    .finish_vdw_inner
-       jmp   .unroll_vdw_loop
-.finish_vdw_inner:     
-       and [esp + .innerk], dword 1
-       jnz  .single_vdw_inner
-       jmp  .updateouterdata_vdw               
-.single_vdw_inner:     
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-
-       mov esi, [ebp + %$nbfp]
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr1]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [esi + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [esi + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5
-
-       mov   esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esp + .ix]
-       movd  mm1, [esp + .iz]
-       movq  mm4, [esi + eax*4]
-       movd  mm5, [esi + eax*4 + 8]
-       pfsubr mm4, mm0
-       pfsubr mm5, mm1
-       movq  [esp + .dx1], mm4
-       pfmul mm4,mm4
-       movd  [esp + .dz1], mm5 
-       pfmul mm5,mm5
-       pfacc mm4, mm5
-       pfacc mm4, mm5          ;  mm0=rsq
-       
-        pfrsqrt mm0,mm4
-        movq mm2,mm0
-        pfmul mm0,mm0
-        pfrsqit1 mm0,mm4                               
-        pfrcpit2 mm0,mm2       ;  mm1=invsqrt
-       pfmul mm4, mm0
-       movq mm1, mm4
-       ;; mm0 is invsqrt, and mm1 r.
-
-       ;;  calculate potentials and scalar force
-       pfmul mm1, [esp + .tabscale]    ; mm1=rt
-       pf2iw mm4,mm1
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm1, mm4                   ; now mm1 is eps and mm4 n0.
-
-       movq mm2,mm1
-       pfmul mm2,mm2   ; mm1 is eps, mm2 is eps2
-       
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]  
-       shl ecx, 2
-       ; dispersion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       movq mm3, mm7   ; add to fscal
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-
-       pfmul mm6, mm1  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm1  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijC+fijD+fijR
-
-       ; change sign of mm3
-        pxor mm1,mm1
-       pfsub mm1, mm3  
-       pfmul mm0, [esp + .tabscale]
-       pfmul mm0, mm1        ; mm0 is total fscal now  
-
-       ;; update vnbtot
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ;;  spread fscalar to both positions
-       punpckldq mm0,mm0
-       ;;  calc vectorial force
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm2,  [esp + .dx1]
-       movd mm3,  [esp + .dz1]
-
-       pfmul mm2, mm0
-       pfmul mm3, mm0
-
-       ;; update i particle force
-       movq mm0,  [esp + .fix]
-       movd mm1,  [esp + .fiz]
-       pfadd mm0, mm2
-       pfadd mm1, mm3
-       movq [esp + .fix], mm0
-       movd [esp + .fiz], mm1
-       ;; update j particle force
-       movq mm0,  [edi + eax*4]
-       movd mm1,  [edi + eax *4+ 8]
-       pfsub mm0, mm2
-       pfsub mm1, mm3
-       movq [edi + eax*4], mm0
-       movd [edi + eax*4 +8], mm1
-       ;;  done!
-.updateouterdata_vdw:  
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment i force
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fix]
-       pfadd mm7, [esp + .fiz]
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdw]
-       jz  .last_mno
-       jmp .mno_vdw
-       
-.last_mno:     
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       femms
-       add esp, 168
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-proc inl3320_3dnow
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-%$tabscale      arg
-%$VFtab         arg
-                       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.ixO         equ     8
-.iyO         equ    12
-.izO         equ    16 
-.ixH         equ    20          ; repeated (64bit) to fill 3dnow reg
-.iyH         equ    28          ; repeated (64bit) to fill 3dnow reg
-.izH         equ    36          ; repeated (64bit) to fill 3dnow reg
-.iqO         equ    44         ; repeated (64bit) to fill 3dnow reg
-.iqH         equ    52         ; repeated (64bit) to fill 3dnow reg
-.qqO         equ    60         ; repeated (64bit) to fill 3dnow reg
-.qqH         equ    68         ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    76          ; repeated (64bit) to fill 3dnow reg
-.vnbtot      equ    84          ; repeated (64bit) to fill 3dnow reg
-.c6          equ    92          ; repeated (64bit) to fill 3dnow reg
-.c12         equ    100          ; repeated (64bit) to fill 3dnow reg
-.two         equ    108         ; repeated (64bit) to fill 3dnow reg
-.n1          equ    116         ; repeated (64bit) to fill 3dnow reg
-.tabscale    equ    124         ; repeated (64bit) to fill 3dnow reg
-.ntia        equ    132         ; repeated (64bit) to fill 3dnow reg
-.innerjjnr   equ    140
-.innerk      equ    144        
-.fixO        equ    148
-.fiyO        equ    152
-.fizO        equ    156
-.fixH        equ    160         ; repeated (64bit) to fill 3dnow reg
-.fiyH        equ    168         ; repeated (64bit) to fill 3dnow reg
-.fizH        equ    176         ; repeated (64bit) to fill 3dnow reg
-.dxO        equ    184
-.dyO        equ    188
-.dzO        equ    192
-.dxH        equ    196         ; repeated (64bit) to fill 3dnow reg
-.dyH        equ    204         ; repeated (64bit) to fill 3dnow reg
-.dzH        equ    212         ; repeated (64bit) to fill 3dnow reg
-.tmprsqH     equ    220                ; repeated (64bit) to fill 3dnow reg
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 228            ;   local stack space
-       femms
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm1, [ebp + %$facel]
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii0]
-       pfmul mm2, mm1          
-       movq  [esp + .iqO], mm2         ;  iqO = facel*charge[ii]
-       
-       movd  mm2, [edx + ebx*4 + 4]    ;  mm2=charge[ii0+1]
-       pfmul mm2, mm1
-       punpckldq mm2,mm2               ;  spread to both halves
-       movq  [esp + .iqH], mm2         ;  iqH = facel*charge[ii0+1]
-
-       mov   edx, [ebp + %$type]       
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1                    
-       imul  ecx, [ebp + %$ntype]      ; ecx = ntia = 2*ntype*type[ii0]
-       mov   [esp + .ntia], ecx
-               
-       movq  mm3, [mm_two]
-       movq  mm4, [ebp + %$tabscale]
-       punpckldq mm4,mm4               ;  spread to both halves
-       movq  [esp + .two],    mm3
-       movq  [esp + .tabscale], mm4          
-       ;; assume we have at least one i particle - start directly       
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm5, [eax + ebx*4]        ;  move shX/shY to mm5 and shZ to mm6.
-       movd  mm6, [eax + ebx*4 + 8]
-       movq  mm0, mm5
-       movq  mm1, mm5
-       movq  mm2, mm6
-       punpckldq mm0,mm0               ; also expand shX,Y,Z in mm0--mm2.
-       punpckhdq mm1,mm1
-       punpckldq mm2,mm2               
-       
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       
-       pfadd mm5, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm7, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx         ; (use mm7 as temp. storage for iz.)
-       pfadd mm6, mm7
-       movq  [esp + .ixO], mm5 
-       movq  [esp + .izO], mm6
-
-       movd  mm3, [eax + ebx*4 + 12]
-       movd  mm4, [eax + ebx*4 + 16]
-       movd  mm5, [eax + ebx*4 + 20]
-       punpckldq  mm3, [eax + ebx*4 + 24]
-       punpckldq  mm4, [eax + ebx*4 + 28]
-       punpckldq  mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-       
-       pfadd mm0, mm3
-       pfadd mm1, mm4
-       pfadd mm2, mm5          
-       movq [esp + .ixH], mm0  
-       movq [esp + .iyH], mm1  
-       movq [esp + .izH], mm2  
-                                       
-       ;; clear vctot and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       movq  [esp + .vnbtot], mm7
-       movq  [esp + .fixO],   mm7
-       movd  [esp + .fizO],   mm7
-       movq  [esp + .fixH],   mm7
-       movq  [esp + .fiyH],   mm7
-       movq  [esp + .fizH],   mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   [esp + .innerk], edx        
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-.inner_loop:   
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-        add   [esp + .innerjjnr], dword 4 ; advance pointer 
-       ;prefetch [ecx + 16]       ; prefetch data - trial and error says 16 is best
-
-       mov ecx, [ebp + %$charge]
-       movd mm7, [ecx + eax*4]
-       punpckldq mm7,mm7
-       movq mm6,mm7
-       pfmul mm6, [esp + .iqO]
-       pfmul mm7, [esp + .iqH] ;  mm6=qqO, mm7=qqH
-       movd [esp + .qqO], mm6
-       movq [esp + .qqH], mm7
-
-       mov ecx, [ebp + %$type]
-       mov edx, [ecx + eax*4]           ; type [jnr]
-       mov ecx, [ebp + %$nbfp]
-       shl edx, 1
-       add edx, [esp + .ntia]           ; tja = ntia + 2*type
-       movd mm5, [ecx + edx*4]         ; mm5 = 1st c6          
-       movq [esp + .c6], mm5
-       movd mm5, [ecx + edx*4 + 4]     ; mm5 = 1st c12                 
-       movq [esp + .c12], mm5  
-                       
-       lea   eax, [eax + eax*2]
-       
-       movq  mm0, [esi + eax*4]
-       movd  mm1, [esi + eax*4 + 8]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-       
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-
-       pfmul mm0, mm1          ;  mm0=r
-
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqO] ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqO] ; fijC=qq*FF
-
-       ;; update vctot directly, use mm3 for fscal sum.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       movq mm3, mm7
-       
-       ; dispersion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       pfadd mm3, mm7  ; add to fscal  
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 32]
-       movd mm5, [edx + ecx*4 + 36]
-       movd mm6, [edx + ecx*4 + 40]
-       movd mm7, [edx + ecx*4 + 44]
-
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijC+fijD+fijR
-
-       ; change sign of fscal and multiply with rinv
-        pxor mm0,mm0
-       pfsubr mm3, mm0 
-       pfmul mm3, [esp + .tabscale]
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now 
-       
-       ;; update vnbtot 
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-       
-       ;; Ready with the oxygen - potential is updated, fscal is in mm3.
-       ;; now do the two hydrogens.
-       movq mm0, [esp + .tmprsqH] ; mm0=rsqH
-
-       pfrsqrt mm1, mm0
-       pswapd mm0,mm0
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqH] ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqH] ; fijC=qq*FF
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7  
-       pfmul mm4, [esp + .tabscale]
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now      
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       prefetchw [edi + eax*4] ; prefetch faction to cache
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax*4 + 8]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-       
-       ;;  done  - one more?
-       dec dword [esp + .innerk]
-       jz  .updateouterdata
-       jmp .inner_loop
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment iO force 
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       movq  mm0, [esp + .fixH]
-       movq  mm3, [esp + .fiyH]
-       movq  mm1, [esp + .fizH]
-       movq  mm2, mm0
-       punpckldq mm0, mm3      ;  mm0(l)=fxH1, mm0(h)=fyH1
-       punpckhdq mm2, mm3      ;  mm2(l)=fxH2, mm2(h)=fyH2
-       movq mm3, mm1
-       pswapd mm3,mm3          
-       ;;  mm1 is fzH1
-       ;;  mm3 is fzH2
-       
-       movq  mm6, [edi + ecx*4 + 12]       ;  increment iH1 force
-       movd  mm7, [edi + ecx*4 + 20]   
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       movq  [edi + ecx*4 + 12],  mm6
-       movd  [edi + ecx*4 + 20],  mm7
-       
-       movq  mm6, [edi + ecx*4 + 24]       ;  increment iH2 force
-       movd  mm7, [edi + ecx*4 + 32]   
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [edi + ecx*4 + 24],  mm6
-       movd  [edi + ecx*4 + 32],  mm7
-
-       
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-       
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-       
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  same for Vnb.
-       
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnb[gid]
-       ;; finish if last
-       dec dword [ebp + %$nri]
-       jz  .end
-       ;;  not last, iterate once more!
-       jmp .outer
-.end:
-       femms
-       add esp, 228
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-       
-
-proc inl3330_3dnow
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-%$tabscale      arg
-%$VFtab         arg
-                       ;; stack offsets for local variables
-.is3         equ     0
-.ii3         equ     4
-.ixO         equ     8
-.iyO         equ    12
-.izO         equ    16 
-.ixH         equ    20          ; repeated (64bit) to fill 3dnow reg
-.iyH         equ    28          ; repeated (64bit) to fill 3dnow reg
-.izH         equ    36          ; repeated (64bit) to fill 3dnow reg
-.qqOO        equ    44         ; repeated (64bit) to fill 3dnow reg
-.qqOH        equ    52         ; repeated (64bit) to fill 3dnow reg
-.qqHH        equ    60         ; repeated (64bit) to fill 3dnow reg
-.c6          equ    68         ; repeated (64bit) to fill 3dnow reg
-.c12         equ    76         ; repeated (64bit) to fill 3dnow reg
-.two         equ    84         ; repeated (64bit) to fill 3dnow reg
-.n1         equ    92          ; repeated (64bit) to fill 3dnow reg
-.tabscale    equ    100        ; repeated (64bit) to fill 3dnow reg
-.vctot       equ    108         ; repeated (64bit) to fill 3dnow reg
-.vnbtot      equ    116         ; repeated (64bit) to fill 3dnow reg
-.innerjjnr   equ    124
-.innerk      equ    128        
-.fixO        equ    132
-.fiyO        equ    136
-.fizO        equ    140
-.fixH        equ    144         ; repeated (64bit) to fill 3dnow reg
-.fiyH        equ    152         ; repeated (64bit) to fill 3dnow reg
-.fizH        equ    160         ; repeated (64bit) to fill 3dnow reg
-.dxO        equ    168
-.dyO        equ    172
-.dzO        equ    176
-.dxH        equ    180         ; repeated (64bit) to fill 3dnow reg
-.dyH        equ    188         ; repeated (64bit) to fill 3dnow reg
-.dzH        equ    196         ; repeated (64bit) to fill 3dnow reg
-.tmprsqH     equ    204         ; repeated (64bit) to fill 3dnow reg
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 212            ;   local stack space
-       femms
-       ;; assume we have at least one i particle - start directly      
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movd  mm1, [ebp + %$facel]      ;  mm1=facel
-       movd  mm2, [edx + ebx*4]        ;  mm2=charge[ii0] (O)
-       movd  mm3, [edx + ebx*4 + 4]    ;  mm3=charge[ii0+1] (H)
-       movq  mm4, mm2  
-       pfmul mm4, mm1
-       movq  mm6, mm3
-       pfmul mm6, mm1
-       movq  mm5, mm4
-       pfmul mm4, mm2                  ; mm4=qqOO*facel
-       pfmul mm5, mm3                  ; mm5=qqOH*facel
-       pfmul mm6, mm3                  ; mm6=qqHH*facel
-       punpckldq mm5,mm5               ;  spread to both halves
-       punpckldq mm6,mm6               ;  spread to both halves
-       movq  [esp + .qqOO], mm4
-       movq  [esp + .qqOH], mm5
-       movq  [esp + .qqHH], mm6
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       mov   edx, ecx
-       imul  ecx, [ebp + %$ntype]
-       add   edx, ecx
-       mov   eax, [ebp + %$nbfp]
-       movd  mm0, [eax + edx*4]
-       movd  mm1, [eax + edx*4 + 4]
-       movq  [esp + .c6], mm0
-       movq  [esp + .c12], mm1
-       movq  mm2, [mm_two]
-       movq  [esp + .two], mm2
-       movd  mm3, [ebp + %$tabscale]
-       punpckldq mm3,mm3
-       movq  [esp + .tabscale], mm3
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-       
-       movq  mm5, [eax + ebx*4]        ;  move shX/shY to mm5 and shZ to mm6.
-       movd  mm6, [eax + ebx*4 + 8]
-       movq  mm0, mm5
-       movq  mm1, mm5
-       movq  mm2, mm6
-       punpckldq mm0,mm0               ; also expand shX,Y,Z in mm0--mm2.
-       punpckhdq mm1,mm1
-       punpckldq mm2,mm2               
-       
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       pfadd mm5, [eax + ebx*4]        ; ix = shX + posX (and iy too)
-       movd  mm7, [eax + ebx*4 + 8]    ; cant use direct memory add for 4 bytes (iz)
-       mov   [esp + .ii3], ebx         ; (use mm7 as temp. storage for iz.)
-       pfadd mm6, mm7
-       movq  [esp + .ixO], mm5 
-       movq  [esp + .izO], mm6
-
-       movd  mm3, [eax + ebx*4 + 12]
-       movd  mm4, [eax + ebx*4 + 16]
-       movd  mm5, [eax + ebx*4 + 20]
-       punpckldq  mm3, [eax + ebx*4 + 24]
-       punpckldq  mm4, [eax + ebx*4 + 28]
-       punpckldq  mm5, [eax + ebx*4 + 32] ; coords of H1 in low mm3-mm5, H2 in high.
-       
-       pfadd mm0, mm3
-       pfadd mm1, mm4
-       pfadd mm2, mm5          
-       movq [esp + .ixH], mm0  
-       movq [esp + .iyH], mm1  
-       movq [esp + .izH], mm2  
-
-       ;; clear vctot, vnbtot and i forces.
-       pxor  mm7,mm7
-       movq  [esp + .vctot], mm7
-       movq  [esp + .vnbtot], mm7
-       movq  [esp + .fixO],  mm7
-       movq  [esp + .fizO],  mm7
-       movq  [esp + .fixH],  mm7
-       movq  [esp + .fiyH],  mm7
-       movq  [esp + .fizH],  mm7
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-.inner_loop:
-       ;; a single j particle iteration here - compare with the unrolled code for comments.
-       mov   eax, [esp + .innerjjnr]
-       mov   eax, [eax]        ;  eax=jnr offset
-        add   [esp + .innerjjnr], dword 4 ; advance pointer 
-
-       lea   eax, [eax + eax*2]
-
-       movq  mm0, [esi + eax*4]
-       movd  mm1, [esi + eax*4 + 8]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm0
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt OO
-       pfmul mm0, mm1          ;  mm0=r OO (rsq*invsqrt)
-
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOO]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOO]        ; fijC=qq*FF
-
-       ;; update vctot directly, use mm3 for fscal sum.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       movq mm3, mm7
-
-       ; dispersion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 16]
-       movd mm5, [edx + ecx*4 + 20]
-       movd mm6, [edx + ecx*4 + 24]
-       movd mm7, [edx + ecx*4 + 28]
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV       
-
-       movq mm4, [esp + .c6]
-       pfmul mm7, mm4  ; fijD
-       pfmul mm5, mm4  ; vnb6           
-       pfadd mm3, mm7  ; add to fscal  
-
-       ;; update vnbtot to release mm5!
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-
-       ; repulsion table
-       ; load all the table values we need
-       movd mm4, [edx + ecx*4 + 32]
-       movd mm5, [edx + ecx*4 + 36]
-       movd mm6, [edx + ecx*4 + 40]
-       movd mm7, [edx + ecx*4 + 44]
-
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       movq mm6, [esp + .c12]
-       pfmul mm7, mm6  ; fijR
-       pfmul mm5, mm6  ; vnb12
-       pfadd mm3, mm7  ; total fscal fijC+fijD+fijR
-
-       ; change sign of fscal and multiply with rinv
-        pxor mm0,mm0
-       pfsubr mm3, mm0 
-       pfmul mm3, [esp + .tabscale]
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now 
-       
-       ;; update vnbtot 
-       pfadd mm5, [esp + .vnbtot]      ; add the earlier value
-       movq [esp + .vnbtot], mm5       ;  store the sum       
-       
-       ;; Ready with the oxygen - potential is updated, fscal is in mm3.
-       ;; time for hydrogens!
-
-       movq mm0, [esp + .tmprsqH]
-
-       pfrsqrt mm1, mm0
-       pswapd mm0,mm0
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOH]        ; fijC=qq*FF
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7  
-       pfmul mm4, [esp + .tabscale]
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now      
-       
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4]
-       movd mm3,  [edi + eax*4 + 8]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4], mm2
-       movd [edi + eax*4 +8], mm3
-
-       ; interactions with j H1.
-
-       movq  mm0, [esi + eax*4 + 12]
-       movd  mm1, [esi + eax*4 + 20]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-       
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       pfmul mm0, mm1          ;  mm0=r (rsq*invsqrt)
-       
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOH]        ; fijC=qq*FF
-
-       ;; update vctot directly, force is moved to mm3.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       pxor mm3, mm3
-       pfsub mm3, mm7
-       pfmul mm3, [esp + .tabscale]
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now 
-
-       movq mm0, [esp + .tmprsqH]
-
-       pfrsqrt mm1, mm0
-       pswapd mm0,mm0
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqHH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqHH]        ; fijC=qq*FF
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7  
-       pfmul mm4, [esp + .tabscale]
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now              
-
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4
-       
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4 + 12]
-       movd mm3,  [edi + eax*4 + 20]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4 + 12], mm2
-       movd [edi + eax*4 + 20], mm3
-
-       ; interactions with j H2
-       movq  mm0, [esi + eax*4 + 24]
-       movd  mm1, [esi + eax*4 + 32]
-       ;;  copy & expand to mm2-mm4 for the H interactions
-       movq  mm2, mm0
-       movq  mm3, mm0
-       movq  mm4, mm1
-       punpckldq mm2,mm2
-       punpckhdq mm3,mm3
-       punpckldq mm4,mm4
-
-       pfsubr mm0, [esp + .ixO]
-       pfsubr mm1, [esp + .izO]
-               
-       movq  [esp + .dxO], mm0
-       pfmul mm0,mm0
-       movd  [esp + .dzO], mm1 
-       pfmul mm1,mm1
-       pfacc mm0, mm1
-       pfadd mm0, mm1          ;  mm0=rsqO
-       
-       punpckldq mm2, mm2
-       punpckldq mm3, mm3
-       punpckldq mm4, mm4  ; mm2-mm4 is jx-jz
-       pfsubr mm2, [esp + .ixH]
-       pfsubr mm3, [esp + .iyH]
-       pfsubr mm4, [esp + .izH] ;  mm2-mm4 is dxH-dzH
-       
-       movq [esp + .dxH], mm2
-       movq [esp + .dyH], mm3
-       movq [esp + .dzH], mm4
-       pfmul mm2,mm2
-       pfmul mm3,mm3
-       pfmul mm4,mm4
-
-       pfadd mm3,mm2
-       pfadd mm3,mm4           ;  mm3=rsqH
-       movq [esp + .tmprsqH], mm3
-
-        pfrsqrt mm1,mm0
-
-        movq mm2,mm1
-        pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       pfmul mm0, mm1
-
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movd [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqOH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqOH]        ; fijC=qq*FF
-
-       ;; update vctot directly, use mm3 for fscal sum.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       pxor mm3,mm3
-       pfsub mm3, mm7
-       pfmul mm3, [esp + .tabscale]
-       pfmul mm3, mm1        ; mm3 is total fscal (for the oxygen) now 
-
-       movq mm0, [esp + .tmprsqH]
-
-       pfrsqrt mm1, mm0
-       pswapd mm0,mm0
-       pfrsqrt mm2, mm0
-       pswapd mm0,mm0
-       punpckldq mm1,mm2       ;  seeds are in mm1 now, and rsq in mm0.
-
-       movq mm2, mm1
-       pfmul mm1,mm1
-        pfrsqit1 mm1,mm0                               
-        pfrcpit2 mm1,mm2       ;  mm1=invsqrt
-       
-       pfmul mm0,mm1           ; mm0=r
-       pfmul mm0, [esp + .tabscale]
-       pf2iw mm4, mm0
-       movq [esp + .n1], mm4
-       pi2fd mm4,mm4
-       pfsub mm0, mm4                   ; now mm0 is eps and mm4 n0.
-       movq  mm2, mm0
-       pfmul mm2, mm2          ;  mm0 is eps, mm2 eps2
-       
-       ; coulomb table
-       mov edx, [ebp + %$VFtab]
-       mov ecx, [esp + .n1]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       ;;  load all values we need
-       movd mm4, [edx + ecx*4]
-       movd mm5, [edx + ecx*4 + 4]
-       movd mm6, [edx + ecx*4 + 8]
-       movd mm7, [edx + ecx*4 + 12]
-       mov ecx, [esp + .n1 + 4]
-       lea ecx, [ecx + ecx*2]
-       shl ecx, 2
-       punpckldq mm4, [edx + ecx*4]
-       punpckldq mm5, [edx + ecx*4 + 4]
-       punpckldq mm6, [edx + ecx*4 + 8]
-       punpckldq mm7, [edx + ecx*4 + 12]
-
-       
-       pfmul mm6, mm0  ; mm6 = Geps            
-       pfmul mm7, mm2  ; mm7 = Heps2
-       ;; 
-       pfadd mm5, mm6
-       pfadd mm5, mm7  ; mm5 = Fp
-
-       pfmul mm7, [esp + .two] ; two*Heps2
-       pfadd mm7, mm6
-       pfadd mm7, mm5  ; mm7=FF
-
-       pfmul mm5, mm0  ; mm5=eps*Fp
-       pfadd mm5, mm4  ; mm5= VV
-
-       pfmul mm5, [esp + .qqHH]        ; vcoul=qq*VV
-       pfmul mm7, [esp + .qqHH]        ; fijC=qq*FF
-       ;;  update vctot.
-       pfadd mm5, [esp + .vctot]
-       movq [esp + .vctot], mm5
-       
-       ;; change sign of fijC and multiply by rinv
-        pxor mm4,mm4
-       pfsub mm4, mm7  
-       pfmul mm4, [esp + .tabscale]
-       pfmul mm4, mm1        ; mm4 is total fscal (for the hydrogens) now      
-
-       ;;  spread oxygen fscalar to both positions
-       punpckldq mm3,mm3
-       ;;  calc vectorial force for O
-       movq mm0,  [esp + .dxO]
-       movd mm1,  [esp + .dzO]
-       pfmul mm0, mm3
-       pfmul mm1, mm3
-
-       ;;  calc vectorial force for H's
-       movq mm5, [esp + .dxH]
-       movq mm6, [esp + .dyH]
-       movq mm7, [esp + .dzH]
-       pfmul mm5, mm4
-       pfmul mm6, mm4
-       pfmul mm7, mm4
-       
-       ;; update iO particle force
-       movq mm2,  [esp + .fixO]
-       movd mm3,  [esp + .fizO]
-       pfadd mm2, mm0
-       pfadd mm3, mm1
-       movq [esp + .fixO], mm2
-       movd [esp + .fizO], mm3
-
-       ;; update iH forces 
-       movq mm2, [esp + .fixH]
-       movq mm3, [esp + .fiyH]
-       movq mm4, [esp + .fizH]
-       pfadd mm2, mm5
-       pfadd mm3, mm6
-       pfadd mm4, mm7
-       movq [esp + .fixH], mm2
-       movq [esp + .fiyH], mm3
-       movq [esp + .fizH], mm4 
-
-       ;; pack j forces from H in the same form as the oxygen force.
-       pfacc mm5, mm6          ; mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
-       pfacc mm7, mm7          ; mm7(l)=fjz(H1+H2) 
-       
-       pfadd mm0, mm5          ;  add up total force on j particle.
-       pfadd mm1, mm7
-
-       ;; update j particle force
-       movq mm2,  [edi + eax*4 + 24]
-       movd mm3,  [edi + eax*4 + 32]
-       pfsub mm2, mm0
-       pfsub mm3, mm1
-       movq [edi + eax*4 + 24], mm2
-       movd [edi + eax*4 + 32], mm3
-       
-       ;;  done  - one more?
-       dec dword [esp + .innerk]
-       jz  .updateouterdata
-       jmp .inner_loop 
-.updateouterdata:      
-       mov   ecx, [esp + .ii3]
-
-       movq  mm6, [edi + ecx*4]       ;  increment iO force 
-       movd  mm7, [edi + ecx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       movq  [edi + ecx*4],    mm6
-       movd  [edi + ecx*4 +8], mm7
-
-       movq  mm0, [esp + .fixH]
-       movq  mm3, [esp + .fiyH]
-       movq  mm1, [esp + .fizH]
-       movq  mm2, mm0
-       punpckldq mm0, mm3      ;  mm0(l)=fxH1, mm0(h)=fyH1
-       punpckhdq mm2, mm3      ;  mm2(l)=fxH2, mm2(h)=fyH2
-       movq mm3, mm1
-       pswapd mm3,mm3          
-       ;;  mm1 is fzH1
-       ;;  mm3 is fzH2
-
-       movq  mm6, [edi + ecx*4 + 12]       ;  increment iH1 force
-       movd  mm7, [edi + ecx*4 + 20]   
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       movq  [edi + ecx*4 + 12],  mm6
-       movd  [edi + ecx*4 + 20],  mm7
-       
-       movq  mm6, [edi + ecx*4 + 24]       ;  increment iH2 force
-       movd  mm7, [edi + ecx*4 + 32]   
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [edi + ecx*4 + 24],  mm6
-       movd  [edi + ecx*4 + 32],  mm7
-
-       
-       mov   ebx, [ebp + %$fshift]    ; increment fshift force
-       mov   edx, [esp + .is3]
-
-       movq  mm6, [ebx + edx*4]        
-       movd  mm7, [ebx + edx*4 + 8]    
-       pfadd mm6, [esp + .fixO]
-       pfadd mm7, [esp + .fizO]
-       pfadd mm6, mm0
-       pfadd mm7, mm1
-       pfadd mm6, mm2
-       pfadd mm7, mm3
-       movq  [ebx + edx*4],     mm6
-       movd  [ebx + edx*4 + 8], mm7
-       
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       movq  mm7, [esp + .vctot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-
-       mov   eax, [ebp + %$Vc]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vc[gid]
-
-       movq  mm7, [esp + .vnbtot]     
-       pfacc mm7,mm7                 ;  get and sum the two parts of total potential
-
-       mov   eax, [ebp + %$Vnb]
-       movd  mm6, [eax + edx*4] 
-       pfadd mm6, mm7
-       movd  [eax + edx*4], mm6              ; increment vnbtot[gid]
-       ;; finish if last
-       dec dword [ebp + %$nri]
-       jz  .end
-       ;;  not last, iterate once more!
-       jmp .outer
-.end:
-       femms
-       add esp, 212
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
- 
diff --git a/src/gmxlib/x86_cpuid.S b/src/gmxlib/x86_cpuid.S

new file mode 100644 (file)

index 0000000..6dfb03a
--- /dev/null
+++ b/src/gmxlib/x86_cpuid.S
@@ -0,0 +1,77 @@
+/*
+ *                 This source code is part of
+ *
+ *                  G   R   O   M   A   C   S
+ *
+ *           GROningen MAchine for Chemical Simulations
+ *
+ *                         VERSION 3.0
+ *
+ *  Copyright (c) 1991-2001
+ *  BIOSON Research Institute, Dept. of Biophysical Chemistry
+ *  University of Groningen, The Netherlands
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  If you want to redistribute modifications, please consider that
+ *  scientific software is very special. Version control is crucial -
+ *  bugs must be traceable. We will be happy to consider code for
+ *  inclusion in the official distribution, but derived work must not
+ *  be called official GROMACS. Details are found in the README & COPYING
+ *  files - if they are missing, get the official version at www.gromacs.org.
+ *
+ *  To help us fund GROMACS development, we humbly ask that you cite
+ *  the papers on the package - you can find them in the top README file.
+ *
+ *  Do check out http://www.gromacs.org , or mail us at gromacs@gromacs.org .
+ *
+ *  And Hey:
+ *  GROup of MAchos and Cynical Suckers
+ *
+ * This file requires GNU binutils 2.10 or later, since we
+ * use intel syntax for portability.           
+ */
+
+.intel_syntax noprefix
+               
+.text
+       
+.globl x86_cpuid       /* issues the cpuid instruction with supplied args */
+       .type x86_cpuid,@function
+x86_cpuid:     /* stack args: [ebp+8] = value placed in eax before cpuid; [ebp+12],[ebp+16],[ebp+20],[ebp+24] = pointers that receive eax, ebx, ecx, edx */
+       push ebp
+       mov  ebp,esp
+       push edi        /* edi is the scratch register for the four output pointers */
+       push ebx        /* ebx, ecx and edx are overwritten by cpuid, so they are saved here */
+       push ecx        /* NOTE(review): ecx is not loaded before cpuid - it holds whatever the caller left in it */
+       push edx
+       mov  eax, [ebp+8]       /* eax = first stack argument, the cpuid input value */
+       cpuid
+       mov  edi, [ebp+12]      /* write the resulting eax through the first pointer */
+       mov  [edi],eax
+       mov  edi, [ebp+16]      /* write the resulting ebx through the second pointer */
+       mov  [edi],ebx
+       mov  edi, [ebp+20]      /* write the resulting ecx through the third pointer */
+       mov  [edi],ecx
+       mov  edi, [ebp+24]      /* write the resulting edx through the fourth pointer */
+       mov  [edi],edx
+       pop edx         /* restore the saved registers in reverse push order */
+       pop ecx
+       pop ebx
+       pop edi
+       mov esp, ebp
+       pop ebp
+       ret             /* eax is not restored: it still holds the value cpuid left in it */
+
+
+
+
+
+
+
+
+
+
diff --git a/src/gmxlib/x86_cpuid.asm b/src/gmxlib/x86_cpuid.asm

deleted file mode 100644 (file)

index 4297f1c..0000000
--- a/src/gmxlib/x86_cpuid.asm
+++ /dev/null
@@ -1,64 +0,0 @@
-;;
-;;                 This source code is part of
-;;
-;;                  G   R   O   M   A   C   S
-;;
-;;           GROningen MAchine for Chemical Simulations
-;;
-;;                         VERSION 3.0
-;;
-;;  Copyright (c) 1991-2001
-;;  BIOSON Research Institute, Dept. of Biophysical Chemistry
-;;  University of Groningen, The Netherlands
-;;
-;;  This program is free software; you can redistribute it and/or
-;;  modify it under the terms of the GNU General Public License
-;;  as published by the Free Software Foundation; either version 2
-;;  of the License, or (at your option) any later version.
-;;
-;;  If you want to redistribute modifications, please consider that
-;;  scientific software is very special. Version control is crucial -
-;;  bugs must be traceable. We will be happy to consider code for
-;;  inclusion in the official distribution, but derived work must not
-;;  be called official GROMACS. Details are found in the README & COPYING
-;;  files - if they are missing, get the official version at www.gromacs.org.
-;;
-;;  To help us fund GROMACS development, we humbly ask that you cite
-;;  the papers on the package - you can find them in the top README file.
-;;
-;;  Do check out http: //www.gromacs.org , or mail us at gromacs@gromacs.org .
-;;
-;;  And Hey:
-;;  GROup of MAchos and Cynical Suckers
-
-;; this file must be processed with a version
-;; of nasm that supports the extended 3dnow instructions.
-;; you can find a binary of such a version on the
-;; gromacs homepage.
-
-segment .text
-global x86_cpuid               ;  issues the cpuid instruction with supplied args
-x86_cpuid:     
-       push ebp
-       mov  ebp,esp
-       push edi
-       push ebx
-       push ecx
-       push edx
-       mov  eax, [ebp+8]       
-       cpuid
-       mov  edi, [ebp+12]
-       mov  [edi],eax
-       mov  edi, [ebp+16]
-       mov  [edi],ebx
-       mov  edi, [ebp+20]
-       mov  [edi],ecx
-       mov  edi, [ebp+24]
-       mov  [edi],edx
-       pop edx
-       pop ecx
-       pop ebx
-       pop edi
-       mov esp, ebp
-       pop ebp
-       ret
diff --git a/src/gmxlib/x86_sse.S b/src/gmxlib/x86_sse.S

new file mode 100644 (file)

index 0000000..3feac18
--- /dev/null
+++ b/src/gmxlib/x86_sse.S
@@ -0,0 +1,37757 @@
+/*
+ *                This source code is part of
+ *                             
+ *                 G   R   O   M   A   C   S
+ *
+ *          GROningen MAchine for Chemical Simulations
+ *
+ *                        VERSION 3.0
+ *
+ * Copyright (c) 1991-2001
+ * Dept. of Biophysical Chemistry
+ * University of Groningen, The Netherlands
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * Do check out http://www.gromacs.org , or mail us at gromacs@gromacs.org .
+ *
+ * And Hey:
+ * GROup of MAchos and Cynical Suckers
+ */
+.intel_syntax noprefix  /* Intel operand order (destination first), registers written without a % prefix */
+       
+.text                   /* the constant tables below are emitted into the text section, ahead of the code */
+.align 16               /* alignment request for the constant tables that follow */
+       
+sse_minushalf:          /* four copies of 0xbf000000 = -0.5 as IEEE-754 single precision */
+       .long 0xbf000000
+       .long 0xbf000000
+       .long 0xbf000000
+       .long 0xbf000000
+sse_half:               /* four copies of 0x3f000000 = 0.5 */
+       .long 0x3f000000
+       .long 0x3f000000
+       .long 0x3f000000
+       .long 0x3f000000
+sse_two:                /* four copies of 0x40000000 = 2.0 */
+       .long 0x40000000
+       .long 0x40000000
+       .long 0x40000000
+       .long 0x40000000
+sse_three:              /* four copies of 0x40400000 = 3.0 */
+       .long 0x40400000
+       .long 0x40400000
+       .long 0x40400000
+       .long 0x40400000
+sse_six:                /* four copies of 0x40c00000 = 6.0 */
+       .long 0x40c00000
+       .long 0x40c00000
+       .long 0x40c00000
+       .long 0x40c00000
+sse_twelve:             /* four copies of 0x41400000 = 12.0 */
+       .long 0x41400000
+       .long 0x41400000
+       .long 0x41400000
+       .long 0x41400000
+
+
+
+.globl checksse         /* try to issue a SSE instruction */
+       .type checksse,@function
+checksse:       /* reads no arguments and sets no return value; presumably the caller catches the fault raised when SSE is unavailable - TODO confirm against caller */
+       emms
+       xorps xmm0,xmm0         /* the SSE instruction being tried; it zeroes xmm0 */
+       emms
+       ret
+
+.align 16
+       
+.globl vecinvsqrt_sse   /* out[i] = 1/sqrt(in[i]): rsqrt lookup refined by one Newton-Raphson step */
+       .type vecinvsqrt_sse,@function
+vecinvsqrt_sse:         /* stack args: [ebp+8] = input float array, [ebp+12] = output float array, [ebp+16] = element count */
+       push ebp
+       mov ebp,esp     
+       push eax        /* eax-edx are all restored before returning, so no value is returned in eax */
+       push ebx
+       push ecx
+       push edx
+
+       mov eax, [ebp + 8]      /* eax = read pointer */
+       mov ebx, [ebp + 12]     /* ebx = write pointer */
+       mov ecx, [ebp + 16]     /* ecx = element count */
+        mov edx, ecx           /* edx keeps the full count for the remainder tests (bits 2, 1, 0) below */
+       movups xmm6,[sse_three]
+       movups xmm7,[sse_minushalf]     /* -0.5, not +0.5: the code forms (x*y*y - 3), and -0.5*y*(x*y*y - 3) = 0.5*y*(3 - x*y*y) */
+        shr ecx, 3             /* ecx = number of 8-element passes */
+        jecxz .vecinvsqrt_iter4
+        emms   
+.vecinvsqrt_loop8:      /* two independent 4-float streams per pass: x in xmm0/xmm2, seed y in xmm1/xmm3 */
+       movaps xmm0,[eax]       /* movaps: the arrays must be 16-byte aligned */
+       add eax,  16
+       rsqrtps xmm1,xmm0       /* y = approximate 1/sqrt(x) */
+       movaps xmm2,[eax]
+       add eax,  16
+       rsqrtps xmm3,xmm2
+       mulps xmm0,xmm1         /* x*y */
+        mulps xmm2,xmm3
+       mulps xmm0,xmm1         /* x*y*y */
+        mulps xmm2,xmm3
+       subps xmm0,xmm6         /* x*y*y - 3 */
+       subps xmm2,xmm6
+       mulps xmm0,xmm1         /* y*(x*y*y - 3) */
+       mulps xmm2,xmm3
+       mulps xmm0,xmm7         /* times -0.5 gives the refined, positive 1/sqrt(x) */
+       mulps xmm2,xmm7
+       movaps [ebx],xmm0
+       add ebx,  16
+       movaps [ebx],xmm2
+       add ebx,  16
+        dec ecx
+        jecxz .vecinvsqrt_iter4
+        jmp .vecinvsqrt_loop8
+.vecinvsqrt_iter4:      /* remainder: one group of four elements if bit 2 of the count is set */
+        mov ecx,edx
+        and ecx,4
+        jecxz .vecinvsqrt_iter2
+       movaps xmm0,[eax]
+       add eax,  16
+       rsqrtps xmm1,xmm0
+       mulps xmm0,xmm1
+       mulps xmm0,xmm1
+       subps xmm0,xmm6
+       mulps xmm0,xmm1
+       mulps xmm0,xmm7
+       movaps [ebx],xmm0
+       add ebx,  16        
+.vecinvsqrt_iter2:      /* remainder: two elements if bit 1 of the count is set */
+        mov ecx,edx
+        and ecx,2
+        jecxz .vecinvsqrt_iter1
+       movlps xmm0,[eax]       /* loads only the low two floats; the upper half of xmm0 is stale */
+       add eax,  8
+       rsqrtps xmm1,xmm0
+       mulps xmm0,xmm1
+       mulps xmm0,xmm1
+       subps xmm0,xmm6
+       mulps xmm0,xmm1
+       mulps xmm0,xmm7
+       movlps [ebx],xmm0       /* only the low two results are stored */
+       add ebx,  8     
+.vecinvsqrt_iter1:      /* remainder: a single element if bit 0 of the count is set */
+        mov ecx,edx
+        and ecx,1
+        jecxz .vecinvsqrt_end
+       movss xmm0,[eax]
+       rsqrtss xmm1,xmm0
+       mulss xmm0,xmm1
+       mulss xmm0,xmm1
+       subss xmm0,xmm6
+       mulss xmm0,xmm1
+       mulss xmm0,xmm7
+       movss [ebx],xmm0        
+.vecinvsqrt_end:       
+       emms
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+       
+.globl vecrecip_sse     /* out[i] = 1/in[i]: rcp lookup refined by one Newton-Raphson step, y*(2 - x*y) */
+       .type vecrecip_sse,@function
+vecrecip_sse:           /* stack args: [ebp+8] = input float array, [ebp+12] = output float array, [ebp+16] = element count */
+       push ebp
+       mov ebp,esp     
+       push eax        /* eax-edx are all restored before returning, so no value is returned in eax */
+       push ebx
+       push ecx
+       push edx
+
+       mov eax, [ebp + 8]      /* eax = read pointer */
+       mov ebx, [ebp + 12]     /* ebx = write pointer */
+       mov ecx, [ebp + 16]     /* ecx = element count */
+        mov edx, ecx           /* edx keeps the full count for the remainder tests (bits 2, 1, 0) below */
+       movups xmm6,[sse_two]   /* xmm6 = constant loaded from sse_two, copied into a temporary before each subtraction */
+        shr ecx, 3             /* ecx = number of 8-element passes */
+        jecxz .vecrecip_iter4
+        emms   
+.vecrecip_loop8:        /* two interleaved 4-float streams: x in xmm0/xmm3, seed y in xmm1/xmm4, result in xmm2/xmm5 */
+       movaps xmm0,[eax]       /* movaps: the arrays must be 16-byte aligned */
+       add eax,  16
+       rcpps xmm1,xmm0         /* y = approximate 1/x */
+       movaps xmm3,[eax]
+       add eax,  16
+       rcpps xmm4,xmm3
+       movaps xmm2,xmm6        /* xmm2 = copy of xmm6 */
+       mulps xmm0,xmm1         /* x*y */
+       movaps xmm5,xmm6        
+       subps xmm2,xmm0         /* xmm6 - x*y */
+       mulps xmm3,xmm4
+       mulps xmm2,xmm1         /* y*(xmm6 - x*y) = refined 1/x */
+       subps xmm5,xmm3 
+       movaps [ebx],xmm2
+       mulps xmm5,xmm4
+       add ebx,  16
+       movaps [ebx],xmm5
+       add ebx,  16
+        dec ecx
+        jecxz .vecrecip_iter4
+        jmp .vecrecip_loop8
+.vecrecip_iter4:        /* remainder: one group of four elements if bit 2 of the count is set */
+        mov ecx,edx
+        and ecx,4
+        jecxz .vecrecip_iter2
+       movaps xmm0,[eax]
+       add eax,  16
+       rcpps xmm1,xmm0
+       movaps xmm2,xmm6
+       mulps xmm0,xmm1         
+       subps xmm2,xmm0
+       mulps xmm2,xmm1
+       movaps [ebx],xmm2
+       add ebx,  16        
+.vecrecip_iter2:        /* remainder: two elements if bit 1 of the count is set */
+        mov ecx,edx
+        and ecx,2
+        jecxz .vecrecip_iter1
+       movlps xmm0,[eax]       /* loads only the low two floats; the upper half of xmm0 is stale */
+       add eax,  8
+       rcpps xmm1,xmm0
+       movaps xmm2,xmm6
+       mulps xmm0,xmm1         
+       subps xmm2,xmm0
+       mulps xmm2,xmm1
+       movlps [ebx],xmm2       /* only the low two results are stored */
+       add ebx,  8     
+.vecrecip_iter1:        /* remainder: a single element if bit 0 of the count is set */
+        mov ecx,edx
+        and ecx,1
+        jecxz .vecrecip_end
+       movss xmm0,[eax]
+       rcpss xmm1,xmm0
+       movss xmm2,xmm6
+       mulss xmm0,xmm1         
+       subss xmm2,xmm0
+       mulss xmm2,xmm1
+       movss [ebx],xmm2        
+.vecrecip_end: 
+       emms
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+       
+       
+.globl inl0100_sse
+       .type inl0100_sse,@function
+inl0100_sse:   
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           type,           48
+.equ           ntype,          52
+.equ           nbfp,           56      
+.equ           Vnb,            60      
+       /* stack offsets for local variables */ 
+       /* bottom of stack is cache-aligned for sse use */
+.equ           ix,             0
+.equ           iy,             16
+.equ           iz,             32
+.equ           dx,             48
+.equ           dy,             64
+.equ           dz,             80
+.equ           two,            96              
+.equ           c6,             112
+.equ           c12,            128
+.equ           six,            144
+.equ           twelve,         160              
+.equ           vnbtot,         176
+.equ           fix,            192
+.equ           fiy,            208
+.equ           fiz,            224
+.equ           half,           240
+.equ           three,          256
+.equ           is3,            272
+.equ           ii3,            276
+.equ           ntia,           280     
+.equ           innerjjnr,      284
+.equ           innerk,         288
+.equ           salign,         292                                                             
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 296            /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_six]
+       movups xmm3, [sse_twelve]
+       movaps [esp + two], xmm1
+       movaps [esp + six],  xmm2
+       movaps [esp + twelve], xmm3
+
+       /* assume we have at least one i particle - start directly */   
+.i0100_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+       
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear vnbtot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i0100_unroll_loop
+       jmp   .i0100_finish_inner
+.i0100_unroll_loop:    
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm6, xmm7, 0b11011101
+       
+       movd  eax, mm0          
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       mulps xmm3, xmm2
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000
+       
+       shufps xmm0, xmm5, 0b10001000
+       shufps xmm1, xmm5, 0b11011101           
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5 */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3-xmm5 contains fjx, fjy, fjz */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  4
+       jl    .i0100_finish_inner
+       jmp   .i0100_unroll_loop
+.i0100_finish_inner:
+       /* check if at least two particles remain */
+       add   [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   .i0100_dopair
+       jmp   .i0100_checksingle
+.i0100_dopair: 
+
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   [esp + innerjjnr],  8
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]
+       movhps xmm6, [esi + edx*4]
+       mov edi, [ebp + pos]    
+       xorps  xmm7,xmm7
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       
+       shufps xmm6, xmm6, 0b1101
+       movlhps xmm4, xmm7
+       movlhps xmm6, xmm7
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+
+       movlhps xmm3, xmm7
+       
+       shufps xmm2, xmm0, 0
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000
+       
+       shufps xmm0, xmm0, 0b10001000
+       shufps xmm1, xmm1, 0b11011101
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5 */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i0100_checksingle:                            /* is there one last unpaired j particle? */
+       mov   edx, [esp + innerk]
+       and   edx, 1                    /* bit 0 of innerk set -> one more j particle */
+       jnz    .i0100_dosingle
+       jmp    .i0100_updateouterdata
+.i0100_dosingle:                       /* LJ interaction with a single j particle; only lane 0 carries real data */
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]                /* eax = jnr, read through the innerjjnr pointer */
+       /* fetch the two parameters stored at nbfp[2*type[jnr] + ntia] */
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]
+       xorps  xmm6, xmm6
+       movlps xmm6, [esi + ecx*4]      /* xmm6 = (c6, c12, 0, 0) */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* xmm4 = (c6, 0, 0, 0) */
+       shufps xmm6, xmm6, 0b11111101   /* xmm6 = (c12, 0, 0, 0) */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr, index of the j x coordinate */
+       
+       /* move coordinates to lane 0 of xmm0-xmm2 (movss from memory zeroes lanes 1-3) */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]   
+       /* lanes 1-3 see j = (0,0,0) but have zero c6/c12, so they add 0 as long as their values stay finite */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr = i - j */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr; it is reloaded below to build the force vector */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq = dx*dx + dy*dy + dz*dz in xmm4 */
+       /* 1/rsq: rcpps estimate refined by one Newton-Raphson step, x1 = x0*(2 - rsq*x0) */
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5 */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* 12*vnb12 - 6*vnb6 */
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj: faction[3*jnr .. 3*jnr+2] -= (tx, ty, tz), lane 0 only */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i0100_updateouterdata:
+       /* Finish one i particle: reduce the four-lane force accumulators  */
+       /* fix/fiy/fiz to scalars, add them to faction[ii3..] and to       */
+       /* fshift[is3..], reduce vnbtot and add it to Vnb[gid], then either */
+       /* start the next i particle or leave through .i0100_end.          */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       /* fold the high pair onto the low pair: lane0 += lane2, lane1 += lane3 */
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* partial sums now sit in lanes 0 and 1 of xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1    /* bring lane 1 down to lane 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* edx = current position in gid[] */
+       mov   edx, [edx]            /* edx = group index of this i particle */
+       /* explicit operand size: a memory/immediate add has no register to infer it from */
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* same two-step horizontal sum as for the forces */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6       
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last: the nri argument slot is used as a down-counter */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i0100_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i0100_outer
+.i0100_end:                     /* common exit: release the frame and return */
+       emms                    /* reset the MMX/x87 tag state before returning */
+       mov eax, [esp + salign] /* presumably the alignment padding saved by the prologue (not visible here) */
+       add esp, eax
+       add esp, 296            /* presumably the size of the local area reserved by the prologue - confirm */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax                 /* eax is restored too, so nothing is returned in it */
+       leave
+       ret
+
+       
+.globl inl0110_sse
+       .type inl0110_sse,@function
+inl0110_sse:   /* entry: every argument is read from the caller's stack as [ebp + name] */
+.equ           nri,            8
+.equ           iinr,           12      /* pointer, advanced by 4 per outer iteration */
+.equ           jindex,         16      /* pointer, advanced by 4 per outer iteration */
+.equ           jjnr,           20
+.equ           shift,          24      /* pointer, advanced by 4 per outer iteration */
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           type,           48
+.equ           ntype,          52      /* used by value (imul operand), not dereferenced */
+.equ           nbfp,           56      
+.equ           Vnb,            60      
+.equ           nsatoms,        64      /* pointer, advanced by 12 (three ints) per outer iteration */
+       /* stack offsets for local variables, relative to the aligned esp */ 
+       /* esp is rounded down to a 16-byte boundary below so movaps works on the 16-byte slots */
+.equ           ix,              0      /* offsets 0-256: 16-byte vector slots */
+.equ           iy,             16
+.equ           iz,             32
+.equ           dx,             48
+.equ           dy,             64
+.equ           dz,             80
+.equ           two,            96              
+.equ           c6,            112
+.equ           c12,           128
+.equ           six,           144
+.equ           twelve,        160               
+.equ           vnbtot,        176
+.equ           fix,           192
+.equ           fiy,           208
+.equ           fiz,           224
+.equ           half,          240
+.equ           three,         256
+.equ           is3,           272      /* offsets 272-328: 4-byte scalar slots */
+.equ           ii3,           276
+.equ           shX,           280
+.equ           shY,           284
+.equ           shZ,           288
+.equ           ntia,          292      
+.equ           innerjjnr0,    296
+.equ           innerjjnr,     300
+.equ           innerk0,       304
+.equ           innerk,        308
+.equ           salign,        312                                              
+.equ           nsvdwc,        316
+.equ           nscoul,        320
+.equ           nsvdw,         324
+.equ           solnr,         328      /* last slot: 328 + 4 = 332 bytes in total */
+       push ebp
+       mov ebp,esp             
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 332            /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf           /* eax = esp mod 16 */
+       sub esp, eax            /* round esp down to a multiple of 16 */
+       mov [esp + salign], eax /* remember how many padding bytes were inserted */
+       /* MMX registers are used as scratch in the inner loops; start from a clean state */
+       emms
+       /* copy the constants from the sse_* symbols (defined outside this block) into aligned slots */
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_six]
+       movups xmm3, [sse_twelve]
+       movaps [esp + two], xmm1
+       movaps [esp + six],  xmm2
+       movaps [esp + twelve], xmm3
+
+       /* assume we have at least one i particle - start directly */   
+.i0110_outer:
+       /* Per-outer-iteration setup: fetch the shift vector, the first atom */
+       /* index, the three nsatoms counters and the j-list bounds, then     */
+       /* dispatch to the vdwc atoms or straight to .i0110_testvdw.        */
+       /* The pointer arguments are advanced in place in their stack slots; */
+       /* the adds carry an explicit dword ptr because a memory/immediate   */
+       /* operation has no register to infer the operand size from.         */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movlps xmm0, [eax + ebx*4]      /* getting the shiftvector */
+       movss xmm1, [eax + ebx*4 + 8] 
+       movlps [esp + shX], xmm0        /* 8-byte store fills shX and shY */
+       movss [esp + shZ], xmm1
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       /* read three consecutive ints a, b, c from nsatoms */
+       mov   eax, [ebp + nsatoms]
+       add   dword ptr [ebp + nsatoms],  12
+       mov   ecx, [eax]                /* ecx = a */
+       mov   edx, [eax + 4]            /* edx = b */
+       mov   eax, [eax + 8]            /* eax = c */
+       sub   ecx, eax                  /* ecx = a - c */
+       sub   eax, edx                  /* eax = c - b */
+       
+       mov   [esp + nsvdwc], edx       /* nsvdwc = b     */
+       mov   [esp + nscoul], eax       /* nscoul = c - b */
+       mov   [esp + nsvdw], ecx        /* nsvdw  = a - c */
+
+       /* clear vnbtot */
+       xorps xmm4, xmm4
+       movaps [esp + vnbtot], xmm4
+       mov   [esp + solnr],  ebx       /* solnr starts at ii */
+               
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2                     /* byte offset of jjnr[nj0] */
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+
+       /* skip the vdwc section entirely when nsvdwc is zero */
+       mov   ecx, [esp + nsvdwc]
+       cmp   ecx,  0
+       jnz   .i0110_mno_vdwc
+       jmp   .i0110_testvdw
+.i0110_mno_vdwc:                /* set up one i atom, then run the vdwc inner loop over the j list */
+       mov   ebx,  [esp + solnr]       /* ebx = index of this i atom */
+       inc   dword ptr [esp + solnr]   /* the next pass through here takes the following atom */
+       /* ntia = 2 * ntype * type[ii], the offset added to 2*type[j] when indexing nbfp */
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx
+       /* i coordinate = shift vector + pos[ii3 .. ii3+2] */
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+       
+       /* clear i forces */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+       /* broadcast lane 0 to all four lanes */
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+       /* rewind the j list: every i atom walks the same innerk0 entries from innerjjnr0 */
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* number of innerloop atoms, minus 4 */
+       jge   .i0110_unroll_vdwc_loop   /* flags still come from the sub; mov leaves them alone */
+       jmp   .i0110_finish_vdwc_inner
+.i0110_unroll_vdwc_loop:       
+       /* quad-unroll innerloop here */
+       /* Lennard-Jones interaction of the current i atom with four j       */
+       /* particles per pass, one per SSE lane.  Reads ix/iy/iz and ntia,   */
+       /* accumulates into vnbtot and fix/fiy/fiz, and subtracts the pair   */
+       /* forces from faction[] of the four j particles.                    */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       /* explicit dword ptr: a memory/immediate op has no register to infer the size from */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+       
+       /* nbfp index for each j: 2*type[jnr] + ntia */
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]      /* xmm6 = (c6_1, c12_1, c6_2, c12_2) */
+       movlps xmm7, [esi + ecx*4]      /* xmm7 = (c6_3, c12_3, c6_4, c12_4) */
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000   /* even lanes: c6 for j1-j4  */
+       shufps xmm6, xmm7, 0b11011101   /* odd lanes:  c12 for j1-j4 */
+       
+       movd  eax, mm0          /* get jnr1-4 back */
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]      /* xmm4 = (x1, y1, x2, y2) */
+       movlps xmm5, [esi + ecx*4]      /* xmm5 = (x3, y3, x4, y4) */
+       movss xmm2, [esi + eax*4 + 8]   /* z1 */
+       movss xmm6, [esi + ecx*4 + 8]   /* z3 */
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]   /* z2 */
+       movss xmm1, [esi + edx*4 + 8]   /* z4 */
+
+       shufps xmm2, xmm0, 0            /* (z1, z1, z2, z2) */
+       shufps xmm6, xmm1, 0            /* (z3, z3, z4, z4) */
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = (z1, z2, z3, z4) */
+       
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = (x1, x2, x3, x4) */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = (y1, y2, y3, y4) */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* 1/rsq: rcpps estimate plus one Newton-Raphson step, x1 = x0*(2 - rsq*x0) */
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5 */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* 12*vnb12 - 6*vnb6 */
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3/xmm4 contain fjx/fjy for j1-j4; fjz is handled one scalar at a time below */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* lane 0 = tz of j2 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* lane 0 = tz of j3 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* lane 0 = tz of j4 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i0110_finish_vdwc_inner
+       jmp   .i0110_unroll_vdwc_loop
+.i0110_finish_vdwc_inner:
+       /* check if at least two particles remain */
+       /* innerk is 4 below the number of leftover j particles here; adding */
+       /* 4 back gives the leftover count, whose bit 1 selects the pair path */
+       /* (explicit dword ptr: memory/immediate op, no register to size it) */
+       add   dword ptr [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   .i0110_dopair_vdwc
+       jmp   .i0110_checksingle_vdwc
+.i0110_dopair_vdwc:    
+       /* LJ interaction with two j particles.  The pair occupies lanes 0-1; */
+       /* lanes 2-3 repeat the same coordinates but get zero c6/c12.         */
+
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   dword ptr [esp + innerjjnr],  8
+
+       /* nbfp index for each j: 2*type[jnr] + ntia */
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]      /* xmm6 = (c6_1, c12_1, c6_2, c12_2) */
+       movhps xmm6, [esi + edx*4]
+       mov edi, [ebp + pos]    
+       xorps  xmm7,xmm7
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       /* lanes 0-1 = c6_1, c6_2   */
+       shufps xmm6, xmm6, 0b1101       /* lanes 0-1 = c12_1, c12_2 */
+       movlhps xmm4, xmm7              /* zero lanes 2-3 */
+       movlhps xmm6, xmm7
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]      /* xmm1 = (x1, y1, x2, y2) */
+       movss xmm2, [edi + eax*4 + 8]   /* z1 */
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   /* z2 */
+
+       shufps xmm2, xmm0, 0            /* (z1, z1, z2, z2) */
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* xmm2 = (z1, z2, z1, z2) */
+       
+       shufps xmm0, xmm0, 0b10001000   /* xmm0 = (x1, x2, x1, x2) */
+       shufps xmm1, xmm1, 0b11011101   /* xmm1 = (y1, y2, y1, y2) */
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 (xmm7 is still zero from the xorps above) */
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* 1/rsq: rcpps estimate plus one Newton-Raphson step, x1 = x0*(2 - rsq*x0) */
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5 */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* 12*vnb12 - 6*vnb6 */
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: first j1 from lane 0 */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       /* swap lanes 0 and 1 so the j2 force sits in lane 0 */
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i0110_checksingle_vdwc:                               /* is there one last unpaired j particle? */
+       mov   edx, [esp + innerk]
+       and   edx, 1                    /* bit 0 of the leftover count -> one more j particle */
+       jnz    .i0110_dosingle_vdwc
+       jmp    .i0110_updateouterdata_vdwc
+.i0110_dosingle_vdwc:                  /* LJ interaction with a single j particle; only lane 0 carries real data */
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]                /* eax = jnr, read through the innerjjnr pointer */
+       /* fetch the two parameters stored at nbfp[2*type[jnr] + ntia] */
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]
+       xorps  xmm6, xmm6
+       movlps xmm6, [esi + ecx*4]      /* xmm6 = (c6, c12, 0, 0) */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* xmm4 = (c6, 0, 0, 0) */
+       shufps xmm6, xmm6, 0b11111101   /* xmm6 = (c12, 0, 0, 0) */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr, index of the j x coordinate */
+       
+       /* move coordinates to lane 0 of xmm0-xmm2 (movss from memory zeroes lanes 1-3) */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]   
+       /* lanes 1-3 see j = (0,0,0) but have zero c6/c12, so they add 0 as long as their values stay finite */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr = i - j */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr; it is reloaded below to build the force vector */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq = dx*dx + dy*dy + dz*dz in xmm4 */
+       /* 1/rsq: rcpps estimate refined by one Newton-Raphson step, x1 = x0*(2 - rsq*x0) */
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5 */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* 12*vnb12 - 6*vnb6 */
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj: faction[3*jnr .. 3*jnr+2] -= (tx, ty, tz), lane 0 only */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i0110_updateouterdata_vdwc:    /* write back the force on the i atom just finished; vnbtot is not touched here */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+       /* fold the high pair onto the low pair: lane0 += lane2, lane1 += lane3 */
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* partial sums now sit in lanes 0 and 1 of xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1    /* bring lane 1 down to lane 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force: faction[ii3 .. ii3+2] += (fx, fy, fz) */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force: fshift[is3 .. is3+2] += (fx, fy, fz) */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* loop back to mno while nsvdwc, used as a down-counter, is non-zero */
+       dec dword ptr [esp + nsvdwc]
+       jz  .i0110_testvdw
+       jmp .i0110_mno_vdwc
+.i0110_testvdw:                 /* move on from the vdwc atoms to the vdw atoms */
+       mov  ebx,  [esp + nscoul]
+       add  [esp + solnr],  ebx        /* step solnr past nscoul atoms; presumably these need no LJ work here - confirm */
+
+       mov  ecx, [esp + nsvdw]
+       cmp  ecx,  0
+       jnz  .i0110_mno_vdw
+       jmp  .i0110_last_mno            /* no vdw atoms: go to the label defined further down the file */
+.i0110_mno_vdw:                 /* set up one i atom, then run the vdw inner loop over the j list */
+       mov   ebx,  [esp + solnr]       /* ebx = index of this i atom */
+       inc   dword ptr [esp + solnr]   /* the next pass through here takes the following atom */
+       /* ntia = 2 * ntype * type[ii], the offset added to 2*type[j] when indexing nbfp */
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx
+       /* i coordinate = shift vector + pos[ii3 .. ii3+2] */
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+       /* clear i forces */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+       /* broadcast lane 0 to all four lanes */
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+       /* rewind the j list: every i atom walks the same innerk0 entries from innerjjnr0 */
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* number of innerloop atoms, minus 4 */
+       jge   .i0110_unroll_vdw_loop    /* flags still come from the sub; mov leaves them alone */
+       jmp   .i0110_finish_vdw_inner
+.i0110_unroll_vdw_loop:        
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm6, xmm7, 0b11011101
+       
+       movd  eax, mm0          
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       mulps xmm3, xmm2
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000
+       
+       shufps xmm0, xmm5, 0b10001000
+       shufps xmm1, xmm5, 0b11011101           
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5 */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3-xmm5 contains fjx, fjy, fjz */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  4
+       jl    .i0110_finish_vdw_inner
+       jmp   .i0110_unroll_vdw_loop
+.i0110_finish_vdw_inner:
+       /* The unrolled loop leaves via `sub innerk, 4 / jl`, so innerk is
+        * 4 too low here; add it back to get the leftover neighbour count.
+        * The size is spelled out because a memory destination with an
+        * immediate source gives the assembler no register to infer it from. */
+       add   dword ptr [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2                    /* bit 1 set => at least two remain */
+       jnz   .i0110_dopair_vdw
+       jmp   .i0110_checksingle_vdw
+.i0110_dopair_vdw:     
+       /* Two leftover j particles.  Lanes 0-1 carry j1/j2.  Lanes 2-3 get
+        * the same coordinates again but c6 = c12 = 0, so for finite inputs
+        * they add nothing to vnbtot or to the i-particle force sums. */
+
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]                /* eax = jnr1 */
+       mov   ebx, [ecx + 4]            /* ebx = jnr2 */
+       /* explicit dword: mem destination + immediate has no implied size */
+       add   dword ptr [esp + innerjjnr],  8
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]          /* type[jnr1] */
+       mov edx, [esi + edx*4]          /* type[jnr2] */
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi                    /* index of the (c6,c12) pair in nbfp */
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]      /* xmm6 = c6_1 c12_1 .  .  */
+       movhps xmm6, [esi + edx*4]      /* xmm6 = c6_1 c12_1 c6_2 c12_2 */
+       mov edi, [ebp + pos]    
+       xorps  xmm7,xmm7
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       /* lanes 0-1 = c6_1, c6_2 */
+       shufps xmm6, xmm6, 0b1101       /* lanes 0-1 = c12_1, c12_2 */
+       movlhps xmm4, xmm7              /* zero lanes 2-3 of c6 */
+       movlhps xmm6, xmm7              /* zero lanes 2-3 of c12 */
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]        /* j3 = 3*jnr */
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]      /* x1 y1 */
+       movss xmm2, [edi + eax*4 + 8]   /* z1 */
+       movhps xmm1, [edi + ebx*4]      /* x1 y1 x2 y2 */
+       movss xmm0, [edi + ebx*4 + 8]   /* z2 */
+
+
+       movlhps xmm3, xmm7              /* xmm3 is reloaded from fix before its next read */
+       
+       shufps xmm2, xmm0, 0            /* z1 z1 z2 z2 */
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* z1 z2 z1 z2 */
+       
+       shufps xmm0, xmm0, 0b10001000   /* x1 x2 x1 x2 */
+       shufps xmm1, xmm1, 0b11011101   /* y1 y2 y1 y2 */
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5; one Newton-Raphson step: lu*(2-rsq*lu) */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* 12*vnb12 - 6*vnb6 */
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: lane 0 holds the force on j1 */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       /* swap lanes 0 and 1 so lane 0 now holds the force on j2 */
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i0110_checksingle_vdw:                                
+       /* bit 0 of innerk set => one unpaired j particle is still left */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz   .i0110_dosingle_vdw
+       jmp   .i0110_updateouterdata_vdw
+.i0110_dosingle_vdw:                   
+       /* One leftover j particle, handled in lane 0 only.  c6/c12 are
+        * zero in lanes 1-3 and the j coordinates there are 0 (movss from
+        * memory clears the upper lanes).
+        * NOTE(review): lanes 1-3 therefore use dr = (ix,iy,iz).  If all
+        * three are exactly 0, rsq is 0 there, rcpps returns inf and the
+        * resulting NaN is not cancelled by the zero c6/c12 - confirm that
+        * callers cannot reach this case. */
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]                /* eax = jnr */
+
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]          /* type[jnr] */
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]           /* index of the (c6,c12) pair in nbfp */
+       xorps  xmm6, xmm6
+       movlps xmm6, [esi + ecx*4]      /* xmm6 = c6 c12 0 0 */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* xmm4 = c6  0 0 0 */
+       shufps xmm6, xmm6, 0b11111101   /* xmm6 = c12 0 0 0 */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]        /* j3 = 3*jnr */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]   
+       
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5; one Newton-Raphson step: lu*(2-rsq*lu) */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* 12*vnb12 - 6*vnb6 */
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj: subtract the lane-0 force from faction[j3 .. j3+2] */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i0110_updateouterdata_vdw:
+       /* Reduce the four-lane i-force accumulators fix/fiy/fiz to scalars
+        * and add them to faction[ii3..] and to fshift[is3..]. */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       /* fold lanes 2-3 onto lanes 0-1 */
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       /* bring lane 1 down to lane 0 and add it in */
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+       
+       /* loop back to mno: nsvdw counts down; zero means this was the last pass */
+       dec dword ptr [esp + nsvdw]
+       jz  .i0110_last_mno
+       jmp .i0110_mno_vdw
+       
+.i0110_last_mno:       
+       /* Per-i-particle wrap-up: sum the four vnbtot lanes, add the result
+        * to Vnb[group], then either start the next i particle or leave. */
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       /* explicit dword: mem destination + immediate has no implied size */
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1    /* lane 1 -> lane 0 */
+       addss  xmm7, xmm6       /* lane 0 = sum of all four lanes */
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last: nri on the caller's stack is the remaining count */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i0110_end        /* short-range only; target follows directly */
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i0110_outer
+.i0110_end:
+       /* Epilogue: leave MMX state, undo the frame, restore registers. */
+       emms                            /* mm0-mm3 were used as scratch above */
+       mov eax, [esp + salign]         /* presumably the alignment padding stored by the prologue (not visible here) */
+       add esp, eax
+       add esp, 332                    /* NOTE(review): must equal the prologue's `sub esp` - confirm */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+/* inl0300_sse - entry and prologue.
+ * Arguments are read from the caller's stack through ebp (offsets 8..68
+ * below); several of the pointer arguments are advanced in place by the
+ * loops that follow.  Locals live in a frame addressed from esp whose
+ * base is rounded down to a 16-byte boundary so that movaps can be used
+ * on it.  The loops look up dispersion and repulsion values in VFtab,
+ * indexed by r*tabscale. */
+.globl inl0300_sse
+       .type inl0300_sse,@function
+inl0300_sse:   
+       /* argument offsets relative to ebp (after push ebp / mov ebp,esp) */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           type,           48
+.equ           ntype,          52
+.equ           nbfp,           56      
+.equ           Vnb,            60
+.equ           tabscale,       64
+.equ           VFtab,          68
+       /* stack offsets for local variables */ 
+       /* bottom of stack is cache-aligned for sse use */
+       /* 16-byte slots (four floats each) */
+.equ           ix,               0
+.equ           iy,              16
+.equ           iz,              32
+.equ           dx,              48
+.equ           dy,              64
+.equ           dz,              80
+.equ           two,             96
+.equ           tsc,            112
+.equ           c6,             128
+.equ           c12,            144
+.equ           fscal,          160
+.equ           vnbtot,         176
+.equ           fix,            192
+.equ           fiy,            208
+.equ           fiz,            224
+.equ           half,           240
+.equ           three,          256
+       /* 4-byte integer slots */
+.equ           is3,            272
+.equ           ii3,            276
+.equ           ntia,           280     
+.equ           innerjjnr,      284
+.equ           innerk,         288
+.equ           salign,         292                                                             
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 296            /* local stack space */
+       /* round esp down to a multiple of 16 and remember the padding */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       /* sse_half/sse_two/sse_three are defined outside this view;
+        * presumably four-float constants 0.5, 2.0 and 3.0 - confirm */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movss xmm3, [ebp + tabscale]
+       movaps [esp + half],  xmm0
+       movaps [esp + two], xmm1
+       movaps [esp + three],  xmm2
+       shufps xmm3, xmm3, 0            /* broadcast tabscale to all four lanes */
+       movaps [esp + tsc], xmm3
+
+       /* assume we have at least one i particle - start directly */   
+.i0300_outer:
+       /* Set-up for one i particle: shifted i coordinates broadcast into
+        * ix/iy/iz, accumulators cleared, and the j-neighbour range taken
+        * from jindex.  The pointer arguments on the caller's stack are
+        * advanced in place; those adds spell out `dword ptr` because a
+        * memory destination with an immediate source has no implied size. */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx         /* ntia = 2*ntype*type[ii] */
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       /* i position plus shift vector */
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+       
+       /* broadcast lane 0 to all four lanes */
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear tot potential and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms, minus 4 */
+       /* mov does not touch flags: jge tests the `sub edx, 4` above */
+       jge   .i0300_unroll_loop
+       jmp   .i0300_finish_inner
+.i0300_unroll_loop:    
+       /* quad-unroll innerloop here: four j particles per pass, one per lane */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       /* explicit dword: mem destination + immediate has no implied size */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi            /* index of each (c6,c12) pair in nbfp */
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       /* transpose: xmm4 = four c6 values, xmm6 = four c12 values */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm6, xmm7, 0b11011101
+       
+       movd  eax, mm0          /* restore jnr1-4 */
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* z1 z2 z3 z4 */
+       
+       shufps xmm0, xmm5, 0b10001000   /* x1 x2 x3 x4 */
+       shufps xmm1, xmm5, 0b11011101   /* y1 y2 y3 y4 */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5; one Newton-Raphson step:
+        * rinv = 0.5*lu*(3.0 - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* r*tabscale */
+
+       /* split r*tabscale into integer table index and fraction eps */
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 3            /* index*8: eight floats per table point */
+       pslld mm7, 3
+
+       movd mm0, eax           /* park j3 values while eax-edx hold table offsets */
+       movd mm1, ebx
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6
+       psrlq mm6, 32
+       movd ecx, mm7
+       psrlq mm7, 32
+       movd ebx, mm6
+       movd edx, mm7
+
+       /* dispersion */
+       movlps xmm5, [esi + eax*4 + 0]
+       movlps xmm7, [esi + ecx*4 + 0]
+       movhps xmm5, [esi + ebx*4 + 0]
+       movhps xmm7, [esi + edx*4 + 0] /* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+       
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */      
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm5, [esi + eax*4 + 16]
+       movlps xmm7, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + ebx*4 + 16]
+       movhps xmm7, [esi + edx*4 + 16] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 24]
+       movlps xmm3, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + ebx*4 + 24]
+       movhps xmm3, [esi + edx*4 + 24] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* table ready, in xmm4-xmm7 */ 
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4       /* fijR */
+       mulps  xmm5, xmm4       /* vnb12 */
+       addps  xmm7, [esp + fscal] 
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -(fijD+fijR)*tabscale*rinv */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movd eax, mm0           /* restore j3 values */
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3/xmm4 contain fjx, fjy (z is handled separately below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces: rotate each lane of xmm2 into lane 0 in turn */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       /* explicit dword: mem destination + immediate has no implied size */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i0300_finish_inner
+       jmp   .i0300_unroll_loop
+.i0300_finish_inner:
+       /* innerk reaches this point 4 too low (both the outer set-up and
+        * the unrolled loop subtract 4 before testing); add it back to get
+        * the number of leftover neighbours.  The size is spelled out
+        * because a memory destination with an immediate source gives the
+        * assembler no register to infer it from. */
+       add   dword ptr [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2                    /* bit 1 set => at least two remain */
+       jnz   .i0300_dopair
+       jmp   .i0300_checksingle
+.i0300_dopair: 
+       /* Two leftover j particles.  Lanes 0-1 carry j1/j2.  Lanes 2-3 are
+        * padding: c6 and c12 are zero there, so for finite intermediate
+        * values they add nothing to vnbtot or to the i-force sums.  Only
+        * lanes 0-1 are converted to table indices. */
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]                /* eax = jnr1 */
+       mov   ebx, [ecx + 4]            /* ebx = jnr2 */
+       /* explicit dword: mem destination + immediate has no implied size */
+       add   dword ptr [esp + innerjjnr],  8
+       xorps xmm7, xmm7
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]      /* xmm6 = c6_1 c12_1 .  .  */
+       movhps xmm6, [esi + edx*4]      /* xmm6 = c6_1 c12_1 c6_2 c12_2 */
+       mov edi, [ebp + pos]    
+       
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       /* lanes 0-1 = c6_1, c6_2 */
+       shufps xmm6, xmm6, 0b1101       /* lanes 0-1 = c12_1, c12_2 */
+       movlhps xmm4, xmm7              /* zero lanes 2-3 of c6 */
+       movlhps xmm6, xmm7              /* zero lanes 2-3 of c12 */
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]        /* j3 = 3*jnr */
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       shufps xmm2, xmm0, 0            /* z1 z1 z2 z2 */
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* z1 z2 z1 z2 */
+       
+       shufps xmm0, xmm0, 0b10001000   /* x1 x2 x1 x2 */
+       shufps xmm1, xmm1, 0b11011101   /* y1 y2 y1 y2 */
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5; one Newton-Raphson step:
+        * rinv = 0.5*lu*(3.0 - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* r*tabscale */
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6      /* writes lanes 0-1 only */
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 3            /* index*8: eight floats per table point */
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6
+       psrlq mm6, 32
+       movd edx, mm6
+
+       /* dispersion */
+       movlps xmm5, [esi + ecx*4 + 0]
+       movhps xmm5, [esi + edx*4 + 0]/* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+       
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */      
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion - transposed the same way as the dispersion half:
+        * each register is shuffled with itself, so lanes 2-3 repeat the
+        * table values of lanes 0-1 instead of picking up stale data */
+       movlps xmm5, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 16] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+
+       movlps xmm7, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + edx*4 + 24] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready, in xmm4-xmm7 */ 
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal] 
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -(fijD+fijR)*tabscale*rinv */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: lane 0 holds the force on j1 */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       /* swap lanes 0 and 1 so lane 0 now holds the force on j2 */
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i0300_checksingle:
+       /* bit 0 of the remaining inner count says whether one j particle is left */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jz     .i0300_updateouterdata   /* even count: nothing left for this i particle */
+       /* odd count: fall through into .i0300_dosingle, which follows immediately */
+.i0300_dosingle:      /* tabulated dispersion+repulsion for one j particle; only lane 0 carries real data */
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        /* eax=jnr */
+       xorps  xmm6, xmm6       /* zero, so the unused lanes of c6/c12 become 0 */
+
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  /* ecx=type[jnr] */
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]   /* ecx=ntia+2*type[jnr], index of the c6/c12 pair in nbfp */
+       movlps xmm6, [esi + ecx*4]      /* xmm6 = c6, c12, 0, 0 */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* xmm4 = c6, 0, 0, 0 */
+       shufps xmm6, xmm6, 0b11111101   /* xmm6 = c12, 0, 0, 0 */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]        /* eax=3*jnr */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5; refined below as half*lu*(three-rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        /* lu*(three-rsq*lu*lu) */
+       mulps xmm0, xmm1        /* xmm0=rinv */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* r*tabscale */
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        /* fractional part of r*tabscale */
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 3            /* index*8: eight floats per table point (4 dispersion + 4 repulsion) */
+
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6           /* only the first index is used here */
+       
+       /* dispersion */
+       movlps xmm4, [esi + ebx*4 + 0]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1    /* bring the second float of the pair to lane 0 */
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm4, [esi + ebx*4 + 16]
+       movlps xmm6, [esi + ebx*4 + 24]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal] /* fijD+fijR */
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -(fijD+fijR)*tabscale*rinv */
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       /* eax still holds 3*jnr; only lane 0 is subtracted from faction */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i0300_updateouterdata:
+       /*
+        * Finish one i particle: reduce the four-lane force accumulators
+        * fix/fiy/fiz to scalars, add them to faction[ii3] and fshift[is3],
+        * reduce vnbtot and add it to Vnb[gid], then either loop back to
+        * .i0300_outer or fall out to .i0300_end when nri reaches zero.
+        */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0      /* upper two lanes -> lower two lanes of xmm3 */
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 now hold the two partial sums */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1    /* lane 1 -> lane 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* get group index for this i particle */
+       mov   edx, [ebp + gid]
+       mov   edx, [edx]
+       /* explicit operand size: a memory destination with an immediate is ambiguous otherwise */
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last: nri is counted down in place in the argument slot */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i0300_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i0300_outer
+.i0300_end:   /* epilogue: release the locals and restore the saved registers */
+       emms                    /* leave MMX state so x87 code can run again */
+       mov eax, [esp + salign] /* alignment adjustment saved in the salign slot */
+       add esp, eax
+       add esp, 296            /* NOTE(review): presumably matches the prologue's local stack size (not visible here) - confirm */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax                /* eax is restored too, so no value is returned in eax */
+       leave
+       ret
+
+       
+.globl inl0310_sse
+       .type inl0310_sse,@function
+inl0310_sse:   /* entry point; arguments are read from the stack through ebp at the offsets below */
+.equ           nri,            8       /* ebp-relative argument offsets; 8 is the first slot above saved ebp and the return address */
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           type,           48
+.equ           ntype,          52
+.equ           nbfp,           56      
+.equ           Vnb,            60      
+.equ           tabscale,       64
+.equ           VFtab,          68
+.equ           nsatoms,        72
+       /* stack offsets for local variables (esp-relative) */ 
+        /* bottom of stack is 16-byte aligned for sse use; offsets 0-256 are 16-byte slots accessed with movaps */
+.equ           ix,              0
+.equ           iy,             16
+.equ           iz,             32
+.equ           dx,             48
+.equ           dy,             64
+.equ           dz,             80
+.equ           two,            96   
+.equ           tsc,           112
+.equ           c6,            128
+.equ           c12,           144
+.equ           fscal,         160
+.equ           vnbtot,        176
+.equ           fix,           192
+.equ           fiy,           208
+.equ           fiz,           224
+.equ           half,          240
+.equ           three,         256
+.equ           is3,           272      /* from here on: 4-byte scalar slots */
+.equ           ii3,           276
+.equ           shX,           280
+.equ           shY,           284
+.equ           shZ,           288
+.equ           ntia,          292      
+.equ           innerjjnr0,    296
+.equ           innerjjnr,     300
+.equ           innerk0,       304
+.equ           innerk,        308
+.equ           salign,        312                                              
+.equ           nsvdwc,        316
+.equ           nscoul,        320
+.equ           nsvdw,         324
+.equ           solnr,         328      /* last slot: 328+4 = 332 bytes of locals */
+       push ebp
+       mov ebp,esp     
+        push eax      
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 332            /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf           /* eax = esp mod 16 */
+       sub esp, eax            /* round esp down to a 16-byte boundary */
+       mov [esp + salign], eax /* keep the adjustment; presumably undone in the epilogue (not visible here) */
+
+       emms                    /* MMX registers are used as scratch in the loops below */
+
+       movups xmm0, [sse_half] /* sse_half/sse_two/sse_three are defined elsewhere in the file */
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movss xmm3, [ebp + tabscale]
+       movaps [esp + half],  xmm0
+       movaps [esp + two], xmm1
+       movaps [esp + three], xmm2
+       shufps xmm3, xmm3, 0    /* broadcast tabscale to all four lanes */
+       movaps [esp + tsc], xmm3
+
+       /* assume we have at least one i particle - start directly */   
+.i0310_outer:
+       /*
+        * Per-outer-iteration setup: fetch the shift vector, the first atom
+        * index (stored in solnr), the three nsatoms-derived counts and the
+        * j-list bounds. The argument slots shift, iinr, nsatoms and jindex
+        * are used as running pointers and advanced in place; the operand
+        * size of those memory/immediate adds is spelled out as dword ptr.
+        */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movlps xmm0, [eax + ebx*4]      /* getting the shiftvector: x and y */
+       movss xmm1, [eax + ebx*4 + 8]   /* z */
+       movlps [esp + shX], xmm0        /* 8-byte store fills both shX and shY */
+       movss [esp + shZ], xmm1
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   eax, [ebp + nsatoms]
+       add   dword ptr [ebp + nsatoms],  12   /* three ints are consumed per iteration */
+       mov   ecx, [eax]        
+       mov   edx, [eax + 4]
+       mov   eax, [eax + 8]    
+       sub   ecx, eax                  /* ecx = nsatoms[0]-nsatoms[2] */
+       sub   eax, edx                  /* eax = nsatoms[2]-nsatoms[1] */
+       
+       mov   [esp + nsvdwc], edx
+       mov   [esp + nscoul], eax
+       mov   [esp + nsvdw], ecx
+
+       /* clear vnbtot */
+       xorps xmm4, xmm4
+       movaps [esp + vnbtot], xmm4
+       mov   [esp + solnr],  ebx
+               
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+
+       /* skip the vdwc section when its atom count is zero */
+       mov   ecx, [esp + nsvdwc]
+       cmp   ecx,  0
+       jnz   .i0310_mno_vdwc
+       jmp   .i0310_testvdw
+.i0310_mno_vdwc:
+       /* Set up one i atom of the vdwc section, then enter its inner loop. */
+
+       /* start from zeroed force accumulators */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       /* rewind the j-list pointer to the start of this neighbour list */
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+
+       /* take the current atom index and step solnr to the next one */
+       mov   ebx, [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+       /* ntia = 2*ntype*type[ii] */
+       mov   edx, [ebp + type]
+       mov   edx, [edx + ebx*4]
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = ii3 = 3*ii */
+       mov   [esp + ii3], ebx
+       mov   eax, [ebp + pos]          /* eax = base of pos[] */
+
+       /* shifted i coordinate, one component at a time, broadcast to four lanes */
+       movss xmm0, [esp + shX]
+       addss xmm0, [eax + ebx*4]
+       shufps xmm0, xmm0, 0
+       movaps [esp + ix], xmm0
+
+       movss xmm1, [esp + shY]
+       addss xmm1, [eax + ebx*4 + 4]
+       shufps xmm1, xmm1, 0
+       movaps [esp + iy], xmm1
+
+       movss xmm2, [esp + shZ]
+       addss xmm2, [eax + ebx*4 + 8]
+       shufps xmm2, xmm2, 0
+       movaps [esp + iz], xmm2
+
+       /* innerk = innerk0-4; the flags of the sub survive the store (mov leaves flags alone) */
+       mov   edx, [esp + innerk0]
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jl    .i0310_finish_vdwc_inner   /* fewer than four j atoms: skip the unrolled loop */
+       /* otherwise fall through into .i0310_unroll_vdwc_loop, which follows immediately */
+.i0310_unroll_vdwc_loop:       
+       /*
+        * Quad-unrolled inner loop: four j particles per pass, one per SSE
+        * lane. The (c6,c12) pairs are gathered from nbfp, the dispersion
+        * and repulsion table entries from VFtab; vnbtot and fix/fiy/fiz
+        * are accumulated on the stack and the j forces are updated in
+        * faction. mm0-mm3 park the four j indices while eax-edx are
+        * reused for other indices.
+        */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       /* explicit operand size: memory destination with an immediate */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi            /* eax-edx = ntia+2*type[jnr] */
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]      /* xmm6 = c6,c12 of j1 | c6,c12 of j2 */
+       movhps xmm7, [esi + edx*4]      /* xmm7 = c6,c12 of j3 | c6,c12 of j4 */
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000   /* even lanes: the four c6 */
+       shufps xmm6, xmm7, 0b11011101   /* odd lanes: the four c12 */
+       
+       movd  eax, mm0          
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]      /* xmm4 = x1 y1 x2 y2 */
+       movhps xmm5, [esi + edx*4]      /* xmm5 = x3 y3 x4 y4 */
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0            /* z1 z1 z2 z2 */
+       shufps xmm6, xmm1, 0            /* z3 z3 z4 z4 */
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = z1 z2 z3 z4 */
+       
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = x1 x2 x3 x4 */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = y1 y2 y3 y4 */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5; refined below as half*lu*(three-rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* r*tabscale */
+
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5
+       subps xmm4, xmm6        /* fractional part of r*tabscale */
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 3            /* index*8: eight floats per table point */
+       pslld mm7, 3
+
+       movd mm0, eax           /* park j3 values while eax-edx hold table indices */
+       movd mm1, ebx
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6
+       psrlq mm6, 32
+       movd ecx, mm7
+       psrlq mm7, 32
+       movd ebx, mm6
+       movd edx, mm7
+
+       /* dispersion */
+       movlps xmm5, [esi + eax*4 + 0]
+       movlps xmm7, [esi + ecx*4 + 0]
+       movhps xmm5, [esi + ebx*4 + 0]
+       movhps xmm7, [esi + edx*4 + 0] /* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+       
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */      
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm5, [esi + eax*4 + 16]
+       movlps xmm7, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + ebx*4 + 16]
+       movhps xmm7, [esi + edx*4 + 16] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 24]
+       movlps xmm3, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + ebx*4 + 24]
+       movhps xmm3, [esi + edx*4 + 24] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* table ready, in xmm4-xmm7 */ 
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal] /* fijD+fijR */
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -(fijD+fijR)*tabscale*rinv */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movd eax, mm0           /* eax-edx = j3 values again */
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3/xmm4 contain fjx, fjy (fjz is handled separately below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* lane 1 (j2) -> lane 0 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* lane 2 (j3) -> lane 0 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* lane 3 (j4) -> lane 0 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i0310_finish_vdwc_inner
+       jmp   .i0310_unroll_vdwc_loop
+.i0310_finish_vdwc_inner:
+       /* innerk is negative here; adding 4 back gives the 0-3 j particles still to do */
+       /* check if at least two particles remain */
+       add   dword ptr [esp + innerk],  4      /* explicit size: memory destination with an immediate */
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   .i0310_dopair_vdwc
+       jmp   .i0310_checksingle_vdwc
+.i0310_dopair_vdwc:    
+       /*
+        * Two leftover j particles: same arithmetic as the unrolled loop,
+        * but only lanes 0 and 1 carry real data. c6/c12 are stored with
+        * zeroed upper lanes, so the upper lanes are multiplied by zero
+        * before they reach vnbtot and the i-force accumulators.
+        */
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       /* explicit operand size: memory destination with an immediate */
+       add   dword ptr [esp + innerjjnr],  8     
+       xorps xmm7, xmm7
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]
+       movhps xmm6, [esi + edx*4]      /* xmm6 = c6,c12 of j1 | c6,c12 of j2 */
+       mov edi, [ebp + pos]    
+       
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       /* lanes 0,1 = the two c6 */
+       shufps xmm6, xmm6, 0b1101       /* lanes 0,1 = the two c12 */
+       movlhps xmm4, xmm7              /* zero the upper two lanes */
+       movlhps xmm6, xmm7
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       shufps xmm2, xmm0, 0            /* z1 z1 z2 z2 */
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* z1 z2 z1 z2 */
+       
+       shufps xmm0, xmm0, 0b10001000   /* x1 x2 x1 x2 */
+       shufps xmm1, xmm1, 0b11011101   /* y1 y2 y1 y2 */
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5; refined below as half*lu*(three-rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* r*tabscale */
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        /* fractional part of r*tabscale */
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 3            /* index*8: eight floats per table point */
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6
+       psrlq mm6, 32
+       movd edx, mm6
+
+       /* dispersion */
+       movlps xmm5, [esi + ecx*4 + 0]
+       movhps xmm5, [esi + edx*4 + 0]/* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+       
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */      
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm5, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 16] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+
+       movlps xmm7, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + edx*4 + 24] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready, in xmm4-xmm7 */ 
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal] /* fijD+fijR */
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -(fijD+fijR)*tabscale*rinv */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: lane 0 belongs to j1 (index in eax) */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       /* swap lanes 0 and 1 so the j2 force sits in lane 0 */
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i0310_checksingle_vdwc:
+       /* bit 0 of the leftover count (0-3) says whether one j particle remains */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jz     .i0310_updateouterdata_vdwc      /* none left: finish this i atom */
+       /* one left: fall through into .i0310_dosingle_vdwc, which follows immediately */
+.i0310_dosingle_vdwc:
+       /* Process one j particle. Its data is carried in lane 0 of the xmm */
+       /* registers; c6/c12 are built with zeros in lanes 1-3. */
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        /* eax = j index read from the list */
+       xorps  xmm6, xmm6      /* zero first: movlps below only fills lanes 0-1 */
+
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  /* ecx = type[j] */
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]   /* ecx = 2*type[j] + ntia */
+       movlps xmm6, [esi + ecx*4]      /* two consecutive floats from nbfp */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* lane0 = first float, lanes 1-3 = 0 */
+       shufps xmm6, xmm6, 0b11111101   /* lane0 = second float, lanes 1-3 = 0 */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]        /* eax = 3*j, offset into pos/faction */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rsqrtps estimate refined by one step: half*lu*(three - rsq*lu*lu) */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* scale r by tsc before truncating to an index */
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6      /* truncated index back to float (lanes 0-1) */
+       subps xmm4, xmm6        /* fractional part */
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 3            /* index*8: eight floats are read per table point */
+
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6           /* ebx = table offset (in floats) for lane 0 */
+       
+       /* dispersion */
+       movlps xmm4, [esi + ebx*4 + 0]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1    /* lane0 = second float of the first pair */
+       shufps xmm7, xmm7, 1    /* lane0 = second float of the second pair */
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm4, [esi + ebx*4 + 16]
+       movlps xmm6, [esi + ebx*4 + 24]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal]      /* fijR + fijD */
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -(fijD+fijR)*tsc*rinv */
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       /* lane 0 of tx-tz is subtracted from faction[3*j .. 3*j+2] */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i0310_updateouterdata_vdwc:
+       /* Reduce the four-lane fix/fiy/fiz accumulators to one scalar each, */
+       /* add them to faction[ii3..] and fshift[is3..], then count down nsvdwc. */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0      /* high half of each accumulator -> low half of temp */
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* partial sums now in lanes 0 and 1 of xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1    /* bring lane 1 down to lane 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* loop back to mno */
+       dec  dword ptr [esp + nsvdwc]
+       jz  .i0310_testvdw      /* counter reached zero: move on to the vdw test */
+       jmp .i0310_mno_vdwc
+.i0310_testvdw:
+       /* advance solnr by nscoul */
+       mov     ebx, [esp + nscoul]
+       add     [esp + solnr], ebx
+
+       /* a zero nsvdw means the vdw loop is skipped entirely */
+       mov     ecx, [esp + nsvdw]
+       cmp     ecx, 0
+       jz      .i0310_last_mno
+       /* nsvdw != 0: fall through into .i0310_mno_vdw, which follows */
+.i0310_mno_vdw:
+       /* Set up one i particle (index solnr, post-incremented): compute ntia, */
+       /* broadcast its shifted position, clear the force accumulators and */
+       /* rewind the j-list pointer/counter for the inner loops. */
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx        /* ntia = 2*ntype*type[ii] */
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx
+
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+
+       addss xmm0, [eax + ebx*4]       /* shX/shY/shZ + pos[ii3 .. ii3+2] */
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+       
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4        /* clear the i force accumulators */
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       shufps xmm0, xmm0, 0            /* broadcast lane 0 to all four lanes */
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx    /* restart from the saved list pointer */
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       /* mov leaves the flags alone, so jge still tests the sub above */
+       jge   .i0310_unroll_vdw_loop
+       jmp   .i0310_finish_vdw_inner
+.i0310_unroll_vdw_loop:
+       /* quad-unroll innerloop here */
+       /* Four j particles per pass, one per xmm lane. */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       /* explicit dword ptr: a memory/immediate add has no register to size it */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi            /* each index = 2*type[j] + ntia */
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]      /* one pair of nbfp floats per j */
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000   /* first float of each pair, j1-j4 */
+       shufps xmm6, xmm7, 0b11011101   /* second float of each pair, j1-j4 */
+       
+       movd  eax, mm0          /* restore jnr1-4 */
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]      /* x,y of j1 and (below) j2 */
+       movlps xmm5, [esi + ecx*4]      /* x,y of j3 and (below) j4 */
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0            /* z1 z1 z2 z2 */
+       shufps xmm6, xmm1, 0            /* z3 z3 z4 z4 */
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* z1 z2 z3 z4 */
+       
+       shufps xmm0, xmm5, 0b10001000   /* x1 x2 x3 x4 */
+       shufps xmm1, xmm5, 0b11011101   /* y1 y2 y3 y4 */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rsqrtps estimate refined by one step: half*lu*(three - rsq*lu*lu) */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5      /* truncated indices as floats, all four lanes */
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 3            /* index*8: eight floats are read per table point */
+       pslld mm7, 3
+
+       movd mm0, eax           /* park the j3 offsets while eax-edx hold table offsets */
+       movd mm1, ebx
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6
+       psrlq mm6, 32
+       movd ecx, mm7
+       psrlq mm7, 32
+       movd ebx, mm6
+       movd edx, mm7
+
+       /* dispersion */
+       movlps xmm5, [esi + eax*4 + 0]
+       movlps xmm7, [esi + ecx*4 + 0]
+       movhps xmm5, [esi + ebx*4 + 0]
+       movhps xmm7, [esi + edx*4 + 0] /* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+       
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */      
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm5, [esi + eax*4 + 16]
+       movlps xmm7, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + ebx*4 + 16]
+       movhps xmm7, [esi + edx*4 + 16] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 24]
+       movlps xmm3, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + ebx*4 + 24]
+       movhps xmm3, [esi + edx*4 + 24] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* table ready, in xmm4-xmm7 */ 
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal]      /* fijR + fijD */
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -(fijD+fijR)*tsc*rinv */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movd eax, mm0           /* j3 offsets back in eax-edx */
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3/xmm4 contain fjx, fjy (fjz is handled separately below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* lane 1 (tz of j2) -> lane 0 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* lane 2 (tz of j3) -> lane 0 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* lane 3 (tz of j4) -> lane 0 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       /* explicit dword ptr, matching the file's other memory-only operations */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i0310_finish_vdw_inner
+       jmp   .i0310_unroll_vdw_loop
+.i0310_finish_vdw_inner:
+       /* check if at least two particles remain */
+       /* add back the 4 that was subtracted before the loop test; the operand */
+       /* size is spelled out because neither operand is a register */
+       add   dword ptr [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2            /* bit 1 set: run the pair path */
+       jnz   .i0310_dopair_vdw
+       jmp   .i0310_checksingle_vdw
+.i0310_dopair_vdw:     
+       /* Process two j particles in lanes 0 and 1; c6/c12 are zeroed in */
+       /* lanes 2-3. */
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       /* explicit dword ptr: a memory/immediate add has no register to size it */
+       add   dword ptr [esp + innerjjnr],  8     
+       xorps xmm7, xmm7
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi            /* each index = 2*type[j] + ntia */
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]
+       movhps xmm6, [esi + edx*4]
+       mov edi, [ebp + pos]    
+       
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       /* lanes 0-1 = first float of each pair */
+       shufps xmm6, xmm6, 0b1101       /* lanes 0-1 = second float of each pair */
+       movlhps xmm4, xmm7              /* zero lanes 2-3 */
+       movlhps xmm6, xmm7
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]        /* eax, ebx = 3*jnr */
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       shufps xmm2, xmm0, 0            /* z1 z1 z2 z2 */
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* z1 z2 z1 z2 */
+       
+       shufps xmm0, xmm0, 0b10001000   /* x1 x2 x1 x2 */
+       shufps xmm1, xmm1, 0b11011101   /* y1 y2 y1 y2 */
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rsqrtps estimate refined by one step: half*lu*(three - rsq*lu*lu) */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 3            /* index*8: eight floats are read per table point */
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6           /* table offset for lane 0 */
+       psrlq mm6, 32
+       movd edx, mm6           /* table offset for lane 1 */
+
+       /* dispersion */
+       movlps xmm5, [esi + ecx*4 + 0]
+       movhps xmm5, [esi + edx*4 + 0]/* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+       
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */      
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm5, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 16] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+
+       movlps xmm7, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + edx*4 + 24] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready, in xmm4-xmm7 */ 
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal]      /* fijR + fijD */
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -(fijD+fijR)*tsc*rinv */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's */
+       /* first j: lane 0 of tx-tz */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       /* swap lanes 0 and 1 so the second j's force is in lane 0 */
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i0310_checksingle_vdw:
+       /* bit 0 of innerk selects the single-particle path */
+       mov     edx, [esp + innerk]
+       and     edx, 1
+       jz      .i0310_updateouterdata_vdw      /* bit clear: skip it */
+       /* bit set: fall through into .i0310_dosingle_vdw, which follows */
+.i0310_dosingle_vdw:
+       /* Process one j particle. Its data is carried in lane 0 of the xmm */
+       /* registers; c6/c12 are built with zeros in lanes 1-3. */
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        /* eax = j index read from the list */
+       xorps  xmm6, xmm6      /* zero first: movlps below only fills lanes 0-1 */
+
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  /* ecx = type[j] */
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]   /* ecx = 2*type[j] + ntia */
+       movlps xmm6, [esi + ecx*4]      /* two consecutive floats from nbfp */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* lane0 = first float, lanes 1-3 = 0 */
+       shufps xmm6, xmm6, 0b11111101   /* lane0 = second float, lanes 1-3 = 0 */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]        /* eax = 3*j, offset into pos/faction */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rsqrtps estimate refined by one step: half*lu*(three - rsq*lu*lu) */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* scale r by tsc before truncating to an index */
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6      /* truncated index back to float (lanes 0-1) */
+       subps xmm4, xmm6        /* fractional part */
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 3            /* index*8: eight floats are read per table point */
+
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6           /* ebx = table offset (in floats) for lane 0 */
+       
+       /* dispersion */
+       movlps xmm4, [esi + ebx*4 + 0]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1    /* lane0 = second float of the first pair */
+       shufps xmm7, xmm7, 1    /* lane0 = second float of the second pair */
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm4, [esi + ebx*4 + 16]
+       movlps xmm6, [esi + ebx*4 + 24]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal]      /* fijR + fijD */
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -(fijD+fijR)*tsc*rinv */
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       /* lane 0 of tx-tz is subtracted from faction[3*j .. 3*j+2] */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5         
+.i0310_updateouterdata_vdw:
+       /* Reduce the four-lane fix/fiy/fiz accumulators to one scalar each, */
+       /* add them to faction[ii3..] and fshift[is3..], then count down nsvdw. */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0      /* high half of each accumulator -> low half of temp */
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* partial sums now in lanes 0 and 1 of xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1    /* bring lane 1 down to lane 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+       
+       /* loop back to mno */
+       dec  dword ptr [esp + nsvdw]
+       jz  .i0310_last_mno     /* counter reached zero: finish this outer pass */
+       jmp .i0310_mno_vdw      
+.i0310_last_mno:       
+       /* Add the horizontal sum of vnbtot into Vnb[*gid], advance gid, and */
+       /* either loop to the next outer iteration or finish. */
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       /* explicit dword ptr: a memory/immediate add has no register to size it */
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i0310_end        /* nri-1 == 0: no outer iterations left */
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i0310_outer
+.i0310_end:
+       /* Epilogue: leave MMX state, release the stack frame, restore registers. */
+       emms                    /* MMX registers were used as scratch above */
+       mov eax, [esp + salign] /* padding stored in the salign slot */
+       add esp, eax
+       add esp, 332            /* presumably the local area reserved by the prologue (not visible here) - confirm */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+/*
+ * inl1000_sse
+ *
+ * For every i particle (nri of them) this routine walks the particle's
+ * j list and, per i-j pair, computes
+ *     vcoul = facel * charge[i] * charge[j] * rinv
+ *     fscal = vcoul * rinv * rinv
+ * where rinv is an rsqrtps estimate of 1/sqrt(dx*dx+dy*dy+dz*dz) refined
+ * by one Newton-Raphson step.  vcoul is summed into Vc[gid], dr*fscal is
+ * added to faction[] for i (and to fshift[]) and subtracted from
+ * faction[] for j.  j particles are handled four at a time, then an
+ * optional pair, then an optional single particle.
+ *
+ * Arguments are read from the stack relative to ebp (offsets below).
+ * facel is used directly as a 32-bit float and nri as an integer count;
+ * all other arguments are dereferenced as pointers.
+ *
+ * - The argument slots shift, iinr, jindex, gid and nri are modified in
+ *   place while iterating.
+ * - nri must be at least 1: the outer loop body runs before the count
+ *   is tested.
+ * - eax, ebx, ecx, edx, esi and edi are saved and restored.
+ * - xmm0-xmm7 are overwritten; emms is executed on entry and on exit.
+ * - sse_half and sse_three are defined outside this block (presumably
+ *   0.5 and 3.0 replicated in all four lanes - confirm at definition).
+ *
+ * All read-modify-write instructions on memory with an immediate operand
+ * carry an explicit "dword ptr", since the operand size cannot be
+ * deduced from a register there.
+ */
+.globl inl1000_sse
+       .type inl1000_sse,@function
+inl1000_sse:
+       /* argument offsets relative to ebp */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+       /* stack offsets for local variables */ 
+       /* bottom of stack is cache-aligned for sse use */
+.equ           ix,              0
+.equ           iy,             16
+.equ           iz,             32
+.equ           iq,             48
+       /* NOTE(review): "dx" is also an x86 register name - confirm the */
+       /* assembler resolves [esp + dx] to this constant */
+.equ           dx,             64
+.equ           dy,             80
+.equ           dz,             96
+.equ           vctot,         112
+.equ           fix,           128
+.equ           fiy,           144
+.equ           fiz,           160
+.equ           half,          176
+.equ           three,         192
+.equ           is3,           208
+.equ           ii3,           212
+.equ           innerjjnr,     216
+.equ           innerk,        220
+.equ           salign,        224
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 228            /* local stack space */
+       /* round esp down to a 16-byte boundary so movaps can be used on */
+       /* the locals; the adjustment is kept in salign for the epilogue */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       /* copy the constants to the aligned local area */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+
+       /* assume we have at least one i particle - start directly */   
+i1000_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       /* xmm0-xmm2 = the three shift vector components */
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       /* iq = facel*charge[ii], replicated in all four lanes */
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       mulss xmm3, [ebp + facel]
+       shufps xmm3, xmm3, 0
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       /* i coordinates = shift vector + pos[ii3..ii3+2] */
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+       
+       /* replicate each i coordinate in all four lanes */
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear vctot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms, minus 4 */
+       /* mov leaves the flags alone, so jge still tests the result of */
+       /* "sub edx, 4": take the unrolled loop if at least 4 atoms remain */
+       jge   i1000_unroll_loop
+       jmp   i1000_finish_inner
+i1000_unroll_loop:     
+       /* quad-unrolled innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       /* gather the four j charges into xmm3 = q1 q2 q3 q4 */
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm5, [esp + iq]
+       shufps xmm3, xmm6, 0            /* q1 q1 q2 q2 */
+       shufps xmm4, xmm7, 0            /* q3 q3 q4 q4 */
+       shufps xmm3, xmm4, 0b10001000   /* q1 q2 q3 q4 */
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       mulps xmm3, xmm5                 /* xmm3 = iq*jq */
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]      /* x1 y1 - - */
+       movlps xmm5, [esi + ecx*4]      /* x3 y3 - - */
+       movss xmm2, [esi + eax*4 + 8]   /* z1 -  - - */
+       movss xmm6, [esi + ecx*4 + 8]   /* z3 -  - - */
+
+       movhps xmm4, [esi + ebx*4]      /* x1 y1 x2 y2 */
+       movhps xmm5, [esi + edx*4]      /* x3 y3 x4 y4 */
+
+       movss xmm0, [esi + ebx*4 + 8]   /* z2 - - - */
+       movss xmm1, [esi + edx*4 + 8]   /* z4 - - - */
+
+       shufps xmm2, xmm0, 0            /* z1 z1 z2 z2 */
+       shufps xmm6, xmm1, 0            /* z3 z3 z4 z4 */
+       
+       movaps xmm0, xmm4               /* x1 y1 x2 y2 */       
+       movaps xmm1, xmm4               /* x1 y1 x2 y2 */
+
+       shufps xmm2, xmm6, 0b10001000   /* z1 z2 z3 z4 */
+       
+       shufps xmm0, xmm5, 0b10001000   /* x1 x2 x3 x4 */
+       shufps xmm1, xmm5, 0b11011101   /* y1 y2 y3 y4 */               
+
+       mov    edi, [ebp + faction]
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rinv = half * lu * (three - rsq*lu*lu), lu = rsqrtps estimate */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+
+       movaps xmm5, [esp + vctot]
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm4, xmm3       /* xmm4=fscal */
+       addps  xmm5, xmm3
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000   /* fjx for j1-j4 */
+       shufps xmm4, xmm6, 0b11011101   /* fjy for j1-j4 */
+
+       /* xmm3/xmm4 contain fjx, fjy (fjz is handled separately below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       /* each shufps moves the next lane of tz into lane 0 for subss */
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* lane 1 -> lane 0 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* lane 2 -> lane 0 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* lane 3 -> lane 0 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4
+       jl    i1000_finish_inner
+       jmp   i1000_unroll_loop
+i1000_finish_inner:
+       /* check if at least two particles remain */
+       /* undo the bias of 4 so innerk holds the number of atoms left */
+       add   dword ptr [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   i1000_dopair
+       jmp   i1000_checksingle
+i1000_dopair:  
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   dword ptr [esp + innerjjnr],  8
+
+       movss xmm3, [esi + eax*4]               
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0 
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       mulps  xmm3, [esp + iq]
+       /* zero the two upper lanes of iq*jq so that the unused lanes */
+       /* contribute a zero charge product */
+       xorps  xmm7,xmm7
+       movlhps xmm3, xmm7
+       
+       shufps xmm2, xmm0, 0            /* z1 z1 z2 z2 */
+       
+       movaps xmm0, xmm1               /* x1 y1 x2 y2 */
+
+       shufps xmm2, xmm2, 0b10001000   /* z1 z2 z1 z2 */
+       
+       shufps xmm0, xmm0, 0b10001000   /* x1 x2 x1 x2 */
+       shufps xmm1, xmm1, 0b11011101   /* y1 y2 y1 y2 */
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+
+       movaps xmm5, [esp + vctot]
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm4, xmm3       /* xmm4=fscal */
+       addps  xmm5, xmm3
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's */ 
+       /* first j particle: lane 0 of tx-tz */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+
+       /* swap lanes 0 and 1 to bring the second particle's force to lane 0 */
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+       
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+i1000_checksingle:                             
+       /* bit 0 of the remaining count: one last j particle to do? */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz    i1000_dosingle
+       jmp    i1000_updateouterdata
+i1000_dosingle:                        
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        
+       movss xmm3, [esi + eax*4]       /* xmm3(0) has the charge */    
+       
+       lea   eax, [eax + eax*2]
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]   
+ 
+       /* movss cleared lanes 1-3 of xmm3, so only lane 0 is nonzero */
+       mulps  xmm3, [esp + iq]
+       
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       mov    edi, [ebp + faction]
+       movaps xmm5, [esp + vctot]
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm4, xmm3       /* xmm4=fscal */
+       addps  xmm5, xmm3
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */ 
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+i1000_updateouterdata:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       /* horizontal add: fold the upper half onto the lower half ... */
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       /* ... then add lane 1 to lane 0 */
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec  ecx
+       jecxz i1000_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp i1000_outer
+i1000_end:
+       emms
+       /* release the alignment padding and the local area, then */
+       /* restore the saved registers in reverse push order */
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp, 228
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+.globl inl1010_sse
+       .type inl1010_sse,@function
+inl1010_sse:   
+.equ           nri,             8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+.equ           nsatoms,        60              
+       /* stack offsets for local variables */ 
+       /* bottom of stack is cache-aligned for sse use */
+.equ           ix,               0
+.equ           iy,              16
+.equ           iz,              32
+.equ           iq,              48
+.equ           dx,              64
+.equ           dy,              80
+.equ           dz,              96
+.equ           vctot,          112
+.equ           fix,            128
+.equ           fiy,            144
+.equ           fiz,            160
+.equ           half,           176
+.equ           three,          192
+.equ           is3,            208
+.equ           ii3,            212
+.equ           shX,            216
+.equ           shY,            220
+.equ           shZ,            224
+.equ           ntia,           228     
+.equ           innerjjnr0,     232
+.equ           innerk0,        236
+.equ           innerjjnr,      240
+.equ           innerk,         244             
+.equ           salign,         248                                             
+.equ           nscoul,         252
+.equ           solnr,          256             
+       push ebp
+       mov ebp,esp             
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 260            /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+       add   [ebp + nsatoms],  8
+
+       /* assume we have at least one i particle - start directly */   
+i1010_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+       movss [esp + shX], xmm0
+       movss [esp + shY], xmm1
+       movss [esp + shZ], xmm2
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   eax, [ebp + nsatoms]
+       mov   ecx, [eax]
+       add   [ebp + nsatoms],  12
+       mov   [esp + nscoul], ecx       
+
+       /* clear vctot */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       mov   [esp + solnr], ebx
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+
+       mov   ecx, [esp + nscoul]
+       cmp   ecx,  0
+       jnz   i1010_mno_coul
+       jmp   i1010_last_mno
+i1010_mno_coul:
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       mulss xmm3, [ebp + facel]
+       shufps xmm3, xmm3, 0
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+       
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear i forces */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   i1010_unroll_coul_loop
+       jmp   i1010_finish_coul_inner
+
+i1010_unroll_coul_loop:        
+       /* quad-unrolled innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm5, [esp + iq]
+       shufps xmm3, xmm6, 0
+       shufps xmm4, xmm7, 0
+       shufps xmm3, xmm4, 0b10001000         
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       mulps xmm3, xmm5
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000
+       
+       shufps xmm0, xmm5, 0b10001000
+       shufps xmm1, xmm5, 0b11011101           
+
+       mov    edi, [ebp + faction]
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+
+       movaps xmm5, [esp + vctot]
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm4, xmm3       /* xmm4=fscal */
+       addps  xmm5, xmm3
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm5
+
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3-xmm5 contains fjx, fjy, fjz */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  4
+       jl    i1010_finish_coul_inner
+       jmp   i1010_unroll_coul_loop
+i1010_finish_coul_inner:
+       /* check if at least two particles remain */
+       add   [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   i1010_dopair_coul
+       jmp   i1010_checksingle_coul
+i1010_dopair_coul:     
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   [esp + innerjjnr],  8
+
+       movss xmm3, [esi + eax*4]               
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       mulps  xmm3, [esp + iq]
+       xorps  xmm7,xmm7
+       movlhps xmm3, xmm7
+       
+       shufps xmm2, xmm0, 0
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000
+       
+       shufps xmm0, xmm0, 0b10001000
+       shufps xmm1, xmm1, 0b11011101
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+
+       movaps xmm5, [esp + vctot]
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm4, xmm3       /* xmm4=fscal */
+       addps  xmm5, xmm3
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's */ 
+       movss xmm3, [edi + eax*4]
+       movss xmm4, [edi + eax*4 + 4]
+       movss xmm5, [edi + eax*4 + 8]
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       subps  xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+       
+       movss xmm3, [edi + ebx*4]
+       movss xmm4, [edi + ebx*4 + 4]
+       movss xmm5, [edi + ebx*4 + 8]
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       subps  xmm5, xmm2
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5
+
+i1010_checksingle_coul:                                
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz    i1010_dosingle_coul
+       jmp    i1010_updateouterdata_coul
+i1010_dosingle_coul:                   
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        
+       movss xmm3, [esi + eax*4]       /* xmm3(0) has the charge */    
+       
+       lea   eax, [eax + eax*2]
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]   
+ 
+       mulps  xmm3, [esp + iq]
+       
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       mov    edi, [ebp + faction]
+       movaps xmm5, [esp + vctot]
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm4, xmm3       /* xmm4=fscal */
+       addps  xmm5, xmm3
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */ 
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+        subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+i1010_updateouterdata_coul:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* loop back to mno */
+       dec dword ptr [esp + nscoul]
+       jz  i1010_last_mno
+       jmp i1010_mno_coul
+       
+i1010_last_mno:        
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz i1010_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp i1010_outer
+i1010_end:
+       emms
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp, 260
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+
+/*
+ * inl1020_sse
+ *
+ * Coulomb-only inner loop for a three-site i molecule (the sites are called
+ * O, H1 and H2 below) interacting with single j particles, using packed SSE.
+ *
+ * Syntax: GNU as, Intel operand order. 32-bit x86, arguments on the stack
+ * and addressed through ebp (offsets given by the first .equ group):
+ *   nri       number of outer iterations; must be >= 1, the loop body is
+ *             entered before the counter is tested
+ *   iinr      pointer to i indices, one dword read per outer iteration
+ *   jindex    pointer to j-list offsets; [n] and [n+1] delimit the j list
+ *   jjnr      pointer to the j index list
+ *   shift     pointer to shift indices, one dword read per outer iteration
+ *   shiftvec  base of the shift vectors (three floats per index)
+ *   fshift    base of the shift forces (three floats per index)
+ *   gid       pointer to group indices, one dword read per outer iteration
+ *   pos       base of the coordinates (three floats per particle)
+ *   faction   base of the forces (three floats per particle)
+ *   charge    base of the charges (one float per particle)
+ *   facel     float passed by value; scales the i charges
+ *   Vc        base of the per-group potential accumulators
+ *
+ * The argument slots nri, iinr, jindex, shift and gid are used as loop
+ * cursors and are modified in place.
+ *
+ * eax, ebx, ecx, edx, esi and edi are saved on entry and restored on exit.
+ * xmm0-xmm7 are clobbered. emms is executed on entry and exit.
+ */
+.globl inl1020_sse
+       .type inl1020_sse,@function
+inl1020_sse:   
+       /* stack offsets of the arguments, relative to ebp */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+       /* stack offsets for local variables */ 
+       /* bottom of stack is aligned to 16 bytes for sse use */
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           iqO,            144 
+.equ           iqH,            160 
+.equ           dxO,            176
+.equ           dyO,            192
+.equ           dzO,            208     
+.equ           dxH1,           224
+.equ           dyH1,           240
+.equ           dzH1,           256     
+.equ           dxH2,           272
+.equ           dyH2,           288
+.equ           dzH2,           304     
+.equ           qqO,            320
+.equ           qqH,            336
+.equ           vctot,          352
+.equ           fixO,           368
+.equ           fiyO,           384
+.equ           fizO,           400
+.equ           fixH1,          416
+.equ           fiyH1,          432
+.equ           fizH1,          448
+.equ           fixH2,          464
+.equ           fiyH2,          480
+.equ           fizH2,          496
+.equ           fjx,            512
+.equ           fjy,            528
+.equ           fjz,            544
+.equ           half,           560
+.equ           three,          576
+.equ           is3,            592
+.equ           ii3,            596
+.equ           innerjjnr,      600
+.equ           innerk,         604
+.equ           salign,         608                                                             
+       push ebp
+       mov ebp,esp     
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub esp, 612            /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax            /* round esp down to a 16-byte boundary for movaps */
+       mov [esp + salign], eax /* remember the adjustment for the epilogue */
+
+       emms
+
+       /* copy the constants used by the rsqrt refinement to aligned locals */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       /* the charges are taken from the first i molecule only and reused */
+       /* for every outer iteration: iqO=facel*charge[ii], iqH=facel*charge[ii+1] */
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       movss xmm4, [edx + ebx*4 + 4]   
+       movss xmm5, [ebp + facel]
+       mulss  xmm3, xmm5
+       mulss  xmm4, xmm5
+
+       shufps xmm3, xmm3, 0            /* broadcast to all four lanes */
+       shufps xmm4, xmm4, 0
+       movaps [esp + iqO], xmm3
+       movaps [esp + iqH], xmm4
+       
+i1020_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       /* xmm0-xmm2 = shift vector x,y,z */
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx
+
+       /* shifted O position, broadcast and stored */
+       addss xmm3, [eax + ebx*4]
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]           
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       /* shifted H1 (xmm0-xmm2) and H2 (xmm3-xmm5) positions */
+       movss xmm3, xmm0
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]          
+       addss xmm3, [eax + ebx*4 + 24]
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]          
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+       
+       /* clear vctot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms, minus 4 */
+       /* mov leaves the flags alone, so jge still tests the sub above */
+       jge   i1020_unroll_loop
+       jmp   i1020_odd_inner
+i1020_unroll_loop:
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       shufps xmm3, xmm6, 0          /* xmm3 = q1 q1 q2 q2 */
+       shufps xmm4, xmm7, 0          /* xmm4 = q3 q3 q4 q4 */
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */ 
+       movaps xmm4, xmm3            /* and in xmm4 */
+       mulps  xmm3, [esp + iqO]
+       mulps  xmm4, [esp + iqH]
+
+       movaps  [esp + qqO], xmm3
+       movaps  [esp + qqH], xmm4       
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+       movlps xmm4, [esi + eax*4]      /* x1 y1 */
+       movlps xmm5, [esi + ecx*4]      /* x3 y3 */
+       movss xmm2, [esi + eax*4 + 8]   /* z1 */
+       movss xmm6, [esi + ecx*4 + 8]   /* z3 */
+
+       movhps xmm4, [esi + ebx*4]      /* xmm4 = x1 y1 x2 y2 */
+       movhps xmm5, [esi + edx*4]      /* xmm5 = x3 y3 x4 y4 */
+
+       movss xmm0, [esi + ebx*4 + 8]   /* z2 */
+       movss xmm1, [esi + edx*4 + 8]   /* z4 */
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = z1 z2 z3 z4 */
+       
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = x1 x2 x3 x4 */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = y1 y2 y3 y4 */
+
+       /* move ixO-izO to xmm4-xmm6 */
+       movaps xmm4, [esp + ixO]
+       movaps xmm5, [esp + iyO]
+       movaps xmm6, [esp + izO]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxO], xmm4
+       movaps [esp + dyO], xmm5
+       movaps [esp + dzO], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       movaps xmm7, xmm4
+       /* rsqO in xmm7 */
+
+       /* move ixH1-izH1 to xmm4-xmm6 */
+       movaps xmm4, [esp + ixH1]
+       movaps xmm5, [esp + iyH1]
+       movaps xmm6, [esp + izH1]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxH1], xmm4
+       movaps [esp + dyH1], xmm5
+       movaps [esp + dzH1], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm6, xmm5
+       addps xmm6, xmm4
+       /* rsqH1 in xmm6 */
+
+       /* move ixH2-izH2 to xmm3-xmm5 */ 
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+
+       /* calc dr */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+
+       /* store dr */
+       movaps [esp + dxH2], xmm3
+       movaps [esp + dyH2], xmm4
+       movaps [esp + dzH2], xmm5
+       /* square it */
+       mulps xmm3,xmm3
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       addps xmm5, xmm4
+       addps xmm5, xmm3
+       /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+       /* each rinv below is the rsqrtps estimate lu refined once: */
+       /* rinv = half * lu * (three - rsq*lu*lu) */
+
+       /* start with rsqO - seed in xmm2 */    
+       rsqrtps xmm2, xmm7
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm7      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm7, xmm4      /* rinvO in xmm7 */
+       /* rsqH1 - seed in xmm2 */
+       rsqrtps xmm2, xmm6
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm6      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm6, xmm4      /* rinvH1 in xmm6 */
+       /* rsqH2 - seed in xmm2 */
+       rsqrtps xmm2, xmm5
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm5      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm5, xmm4      /* rinvH2 in xmm5 */
+
+       /* do O interactions */
+       movaps  xmm4, xmm7      
+       mulps   xmm4, xmm4      /* xmm7=rinv, xmm4=rinvsq */
+       mulps  xmm7, [esp + qqO]        /* xmm7=vcoul */
+       
+       mulps  xmm4, xmm7       /* total fsO in xmm4 */
+
+       addps  xmm7, [esp + vctot]
+       
+       movaps [esp + vctot], xmm7
+
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update O forces */
+       movaps xmm3, [esp + fixO]
+       movaps xmm4, [esp + fiyO]
+       movaps xmm7, [esp + fizO]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixO], xmm3
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm7
+       /* update j forces with water O */
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* H1 interactions */
+       movaps  xmm4, xmm6      
+       mulps   xmm4, xmm4      /* xmm6=rinv, xmm4=rinvsq */
+       mulps  xmm6, [esp + qqH]        /* xmm6=vcoul */
+       mulps  xmm4, xmm6               /* total fsH1 in xmm4 */
+       
+       addps  xmm6, [esp + vctot]
+
+       movaps xmm0, [esp + dxH1]
+       movaps xmm1, [esp + dyH1]
+       movaps xmm2, [esp + dzH1]
+       movaps [esp + vctot], xmm6
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update H1 forces */
+       movaps xmm3, [esp + fixH1]
+       movaps xmm4, [esp + fiyH1]
+       movaps xmm7, [esp + fizH1]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH1], xmm3
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm7
+       /* update j forces with water H1 */
+       addps  xmm0, [esp + fjx]
+       addps  xmm1, [esp + fjy]
+       addps  xmm2, [esp + fjz]
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* H2 interactions */
+       movaps  xmm4, xmm5      
+       mulps   xmm4, xmm4      /* xmm5=rinv, xmm4=rinvsq */
+       mulps  xmm5, [esp + qqH]        /* xmm5=vcoul */
+       mulps  xmm4, xmm5               /* total fsH2 in xmm4 */
+       
+       addps  xmm5, [esp + vctot]
+
+       movaps xmm0, [esp + dxH2]
+       movaps xmm1, [esp + dyH2]
+       movaps xmm2, [esp + dzH2]
+       movaps [esp + vctot], xmm5
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update H2 forces */
+       movaps xmm3, [esp + fixH2]
+       movaps xmm4, [esp + fiyH2]
+       movaps xmm7, [esp + fizH2]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH2], xmm3
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm7
+
+       mov edi, [ebp +faction]
+       /* update j forces: xmm0-xmm2 = summed O+H1+H2 contribution */
+       addps xmm0, [esp + fjx]
+       addps xmm1, [esp + fjy]
+       addps xmm2, [esp + fjz]
+
+       movlps xmm4, [edi + eax*4]
+       movlps xmm7, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]      /* xmm4 = fx1 fy1 fx2 fy2 */
+       movhps xmm7, [edi + edx*4]      /* xmm7 = fx3 fy3 fx4 fy4 */
+       
+       movaps xmm3, xmm4
+       shufps xmm3, xmm7, 0b10001000
+       shufps xmm4, xmm7, 0b11011101                         
+       /* xmm3 has fjx, xmm4 has fjy */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       /* unpack them back for storing */
+       movaps xmm7, xmm3
+       unpcklps xmm7, xmm4             /* xmm7 = fx1 fy1 fx2 fy2 */
+       unpckhps xmm3, xmm4             /* xmm3 = fx3 fy3 fx4 fy4 */
+       movlps [edi + eax*4], xmm7
+       movlps [edi + ecx*4], xmm3
+       movhps [edi + ebx*4], xmm7
+       movhps [edi + edx*4], xmm3
+       /* finally z forces */
+       movss  xmm0, [edi + eax*4 + 8]
+       movss  xmm1, [edi + ebx*4 + 8]
+       movss  xmm3, [edi + ecx*4 + 8]
+       movss  xmm4, [edi + edx*4 + 8]
+       subss  xmm0, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* lane 1 down to lane 0 */
+       subss  xmm1, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* lane 2 down to lane 0 */
+       subss  xmm3, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* lane 3 down to lane 0 */
+       subss  xmm4, xmm2
+       movss  [edi + eax*4 + 8], xmm0
+       movss  [edi + ebx*4 + 8], xmm1
+       movss  [edi + ecx*4 + 8], xmm3
+       movss  [edi + edx*4 + 8], xmm4
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4
+       jl    i1020_odd_inner
+       jmp   i1020_unroll_loop
+i1020_odd_inner:       
+       /* undo the bias of 4: innerk = number of j atoms left (0-3) */
+       add   dword ptr [esp + innerk],  4
+       jnz   i1020_odd_loop
+       jmp   i1020_updateouterdata
+i1020_odd_loop:
+       /* one j atom per pass; the three i sites are packed into one */
+       /* vector as lane 0 = O, lane 1 = unused, lane 2 = H1, lane 3 = H2 */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       add   dword ptr [esp + innerjjnr],  4     
+
+       xorps xmm4, xmm4
+       movss xmm4, [esp + iqO]
+       mov esi, [ebp + charge] 
+       movhps xmm4, [esp + iqH]        /* xmm4 = iqO 0 iqH iqH */
+       movss xmm3, [esi + eax*4]       /* charge in xmm3 */
+       shufps xmm3, xmm3, 0
+       mulps xmm3, xmm4
+       movaps [esp + qqO], xmm3        /* use oxygen qq for storage */
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]  
+       
+       /* move j coords to xmm0-xmm2 */
+       movss xmm0, [esi + eax*4]
+       movss xmm1, [esi + eax*4 + 4]
+       movss xmm2, [esi + eax*4 + 8]
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       
+       /* build xmm3-xmm5 = (O, 0, H1, H2) for x, y and z */
+       movss xmm3, [esp + ixO]
+       movss xmm4, [esp + iyO]
+       movss xmm5, [esp + izO]
+               
+       movlps xmm6, [esp + ixH1]
+       movlps xmm7, [esp + ixH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm3, xmm6
+       movlps xmm6, [esp + iyH1]
+       movlps xmm7, [esp + iyH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm4, xmm6
+       movlps xmm6, [esp + izH1]
+       movlps xmm7, [esp + izH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm5, xmm6
+
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       
+       movaps [esp + dxO], xmm3
+       movaps [esp + dyO], xmm4
+       movaps [esp + dzO], xmm5
+
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       /* rsq in xmm4 */
+
+       /* NOTE(review): lane 1 is a dummy holding (0 - rj), with qq = 0. */
+       /* If a j atom sits exactly at the origin its rsq is 0, rsqrtps */
+       /* gives inf and the lane turns into NaN, which is then added */
+       /* into vctot. Confirm whether callers can ever hit this. */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm3, [esp + qqO]
+
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm4, xmm3       /* xmm4=total fscal */
+       addps  xmm3, [esp + vctot]
+
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+
+       movaps [esp + vctot], xmm3
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       movss  xmm3, [esp + fixO]       
+       movss  xmm4, [esp + fiyO]       
+       movss  xmm5, [esp + fizO]       
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esp + fixO], xmm3       
+       movss  [esp + fiyO], xmm4       
+       movss  [esp + fizO], xmm5       /* updated the O force now do the H's */
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       shufps xmm3, xmm3, 0b11100110   /* lane 2 (H1) down to lane 0 */
+       shufps xmm4, xmm4, 0b11100110
+       shufps xmm5, xmm5, 0b11100110
+       addss  xmm3, [esp + fixH1]
+       addss  xmm4, [esp + fiyH1]
+       addss  xmm5, [esp + fizH1]
+       movss  [esp + fixH1], xmm3      
+       movss  [esp + fiyH1], xmm4      
+       movss  [esp + fizH1], xmm5      /* updated the H1 force */
+
+       mov edi, [ebp + faction]
+       shufps xmm3, xmm3, 0b11100111   /* lane 3 (H2) down to lane 0 */
+       shufps xmm4, xmm4, 0b11100111
+       shufps xmm5, xmm5, 0b11100111
+       addss  xmm3, [esp + fixH2]
+       addss  xmm4, [esp + fiyH2]
+       addss  xmm5, [esp + fizH2]
+       movss  [esp + fixH2], xmm3      
+       movss  [esp + fiyH2], xmm4      
+       movss  [esp + fizH2], xmm5      /* updated the H2 force */
+
+       /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+       xorps  xmm5, xmm5
+       movaps xmm3, xmm0
+       movlps xmm6, [edi + eax*4]
+       movss  xmm7, [edi + eax*4 + 8]
+       unpcklps xmm3, xmm1
+       movlhps  xmm3, xmm5     /* xmm3 = txO tyO 0 0 (dummy lane dropped) */
+       unpckhps xmm0, xmm1     /* xmm0 = txH1 tyH1 txH2 tyH2 */
+       addps    xmm0, xmm3
+       movhlps  xmm3, xmm0     
+       addps    xmm0, xmm3     /* x,y sum in xmm0 */
+
+       movhlps  xmm1, xmm2
+       addss    xmm2, xmm1
+       shufps   xmm1, xmm1, 1 
+       addss    xmm2, xmm1    /* z sum in xmm2 */
+       subps    xmm6, xmm0
+       subss    xmm7, xmm2
+       
+       movlps [edi + eax*4],     xmm6
+       movss  [edi + eax*4 + 8], xmm7
+
+       dec   dword ptr [esp + innerk]
+       jz    i1020_updateouterdata
+       jmp   i1020_odd_loop
+i1020_updateouterdata:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO]
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       /* xmm6 lanes 0,1 = fx,fy and xmm7 lane 0 = fz */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000      
+
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* increment fshift force */ 
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       /* get group index for this i particle */
+       mov   edx, [ebp + gid]  
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4   /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+        
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7       
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz i1020_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp i1020_outer
+i1020_end:
+       emms
+       mov eax, [esp + salign]
+       add esp, eax            /* undo the alignment adjustment */
+       add esp, 612            /* release the locals */
+       pop edi
+       pop esi
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+
+
+       
+.globl inl1030_sse
+       .type inl1030_sse,@function
+inl1030_sse:   
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+       /* stack offsets for local variables */ 
+       /* bottom of stack is cache-aligned for sse use */      
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           jxO,            144
+.equ           jyO,            160
+.equ           jzO,            176
+.equ           jxH1,           192
+.equ           jyH1,           208
+.equ           jzH1,           224
+.equ           jxH2,           240
+.equ           jyH2,           256
+.equ           jzH2,           272
+.equ           dxOO,           288
+.equ           dyOO,           304
+.equ           dzOO,           320     
+.equ           dxOH1,          336
+.equ           dyOH1,          352
+.equ           dzOH1,          368     
+.equ           dxOH2,          384
+.equ           dyOH2,          400
+.equ           dzOH2,          416     
+.equ           dxH1O,          432
+.equ           dyH1O,          448
+.equ           dzH1O,          464     
+.equ           dxH1H1,         480
+.equ           dyH1H1,         496
+.equ           dzH1H1,         512     
+.equ           dxH1H2,         528
+.equ           dyH1H2,         544
+.equ           dzH1H2,         560     
+.equ           dxH2O,          576
+.equ           dyH2O,          592
+.equ           dzH2O,          608     
+.equ           dxH2H1,         624
+.equ           dyH2H1,         640
+.equ           dzH2H1,         656     
+.equ           dxH2H2,         672
+.equ           dyH2H2,         688
+.equ           dzH2H2,         704
+.equ           qqOO,           720
+.equ           qqOH,           736
+.equ           qqHH,           752
+.equ           vctot,          768             
+.equ           fixO,           784
+.equ           fiyO,           800
+.equ           fizO,           816
+.equ           fixH1,          832
+.equ           fiyH1,          848
+.equ           fizH1,          864
+.equ           fixH2,          880
+.equ           fiyH2,          896
+.equ           fizH2,          912
+.equ           fjxO,           928
+.equ           fjyO,           944
+.equ           fjzO,           960
+.equ           fjxH1,          976
+.equ           fjyH1,          992
+.equ           fjzH1,         1008
+.equ           fjxH2,         1024
+.equ           fjyH2,         1040
+.equ           fjzH2,         1056
+.equ           half,          1072
+.equ           three,         1088
+.equ           rsqOO,         1104
+.equ           rsqOH1,        1120
+.equ           rsqOH2,        1136
+.equ           rsqH1O,        1152
+.equ           rsqH1H1,       1168
+.equ           rsqH1H2,       1184
+.equ           rsqH2O,        1200
+.equ           rsqH2H1,       1216
+.equ           rsqH2H2,       1232
+.equ           rinvOO,        1248
+.equ           rinvOH1,       1264
+.equ           rinvOH2,       1280
+.equ           rinvH1O,       1296
+.equ           rinvH1H1,      1312
+.equ           rinvH1H2,      1328
+.equ           rinvH2O,       1344
+.equ           rinvH2H1,      1360
+.equ           rinvH2H2,      1376
+.equ           is3,           1392
+.equ           ii3,           1396
+.equ           innerjjnr,     1400
+.equ           innerk,        1404
+.equ           salign,        1408                                                     
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 1412           /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+       
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       movss xmm4, xmm3        
+       movss xmm5, [edx + ebx*4 + 4]   
+       movss xmm6, [ebp + facel]
+       mulss  xmm3, xmm3
+       mulss  xmm4, xmm5
+       mulss  xmm5, xmm5
+       mulss  xmm3, xmm6
+       mulss  xmm4, xmm6
+       mulss  xmm5, xmm6
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + qqOO], xmm3
+       movaps [esp + qqOH], xmm4
+       movaps [esp + qqHH], xmm5
+
+i1030_outer:
+       /* Outer-loop body, run once per i water.
+        * Consumes one entry each from shift[], iinr[] and jindex[] (the
+        * argument slots at [ebp + ...] are advanced in place), stores the
+        * shifted i coordinates broadcast to all four lanes, clears vctot and
+        * the i force accumulators, and sets up innerjjnr/innerk.
+        * Memory/immediate adds carry an explicit "dword ptr": no register
+        * operand is present to give the assembler the operand size.
+        */
+       mov   eax, [ebp + shift]        /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift], 4 /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]     /* eax = base of shiftvec[] */
+
+       /* xmm0-xmm2 = shift vector x,y,z (low element only) */
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4 /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]          /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+
+       /* i oxygen: shift + pos[ii3 .. ii3+2], broadcast to all four lanes */
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       addss xmm3, [eax + ebx*4]
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       /* i hydrogens: H1 = shift + pos[ii3+3 .. ii3+5] in xmm0-xmm2,
+        *              H2 = shift + pos[ii3+6 .. ii3+8] in xmm3-xmm5 */
+       movss xmm3, xmm0
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]
+       addss xmm3, [eax + ebx*4 + 24]
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+
+       /* clear vctot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4 /* advance pointer */
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* innerloop atom count minus 4 */
+       /* mov leaves flags untouched: the branch tests the result of sub edx, 4 */
+       jge   i1030_unroll_loop
+       jmp   i1030_single_check
+i1030_unroll_loop:
+       /* Quad-unrolled inner loop: handles four j waters (a,b,c,d) per pass.
+        * Reads the next four indices from [esp + innerjjnr], computes all nine
+        * atom-atom coulomb interactions against the current i water, adds to
+        * vctot and the i force accumulators on the stack, and adds the j
+        * forces to faction[].  Memory/immediate add/sub carry an explicit
+        * "dword ptr" because no register operand supplies the operand size.
+        */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+
+       mov   eax, [edx]
+       mov   ebx, [edx + 4]
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]            /* eax,ebx,ecx,edx = jnr1..jnr4 */
+
+       add   dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]
+
+       /* move j coordinates to local temp variables */
+       movlps xmm2, [esi + eax*4]
+       movlps xmm3, [esi + eax*4 + 12]
+       movlps xmm4, [esi + eax*4 + 24]
+
+       movlps xmm5, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 12]
+       movlps xmm7, [esi + ebx*4 + 24]
+
+       movhps xmm2, [esi + ecx*4]
+       movhps xmm3, [esi + ecx*4 + 12]
+       movhps xmm4, [esi + ecx*4 + 24]
+
+       movhps xmm5, [esi + edx*4]
+       movhps xmm6, [esi + edx*4 + 12]
+       movhps xmm7, [esi + edx*4 + 24]
+
+       /* current state: */
+       /* xmm2= jxOa  jyOa  jxOc  jyOc */
+       /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+       /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+       /* xmm5= jxOb  jyOb  jxOd  jyOd */
+       /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+       /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+
+       /* transpose so that each register holds one coordinate for a,b,c,d */
+       movaps xmm0, xmm2
+       movaps xmm1, xmm3
+       unpcklps xmm0, xmm5     /* xmm0= jxOa  jxOb  jyOa  jyOb */
+       unpcklps xmm1, xmm6     /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+       unpckhps xmm2, xmm5     /* xmm2= jxOc  jxOd  jyOc  jyOd */
+       unpckhps xmm3, xmm6     /* xmm3= jxH1c jxH1d jyH1c jyH1d  */
+       movaps xmm5, xmm4
+       movaps   xmm6, xmm0
+       unpcklps xmm4, xmm7     /* xmm4= jxH2a jxH2b jyH2a jyH2b */
+       unpckhps xmm5, xmm7     /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+       movaps   xmm7, xmm1
+       movlhps  xmm0, xmm2     /* xmm0= jxOa  jxOb  jxOc  jxOd  */
+       movaps [esp + jxO], xmm0
+       movhlps  xmm2, xmm6     /* xmm2= jyOa  jyOb  jyOc  jyOd */
+       movaps [esp + jyO], xmm2
+       movlhps  xmm1, xmm3     /* xmm1= jxH1a jxH1b jxH1c jxH1d */
+       movaps [esp + jxH1], xmm1
+       movhlps  xmm3, xmm7     /* xmm3= jyH1a jyH1b jyH1c jyH1d */
+       movaps   xmm6, xmm4
+       movaps [esp + jyH1], xmm3
+       movlhps  xmm4, xmm5     /* xmm4= jxH2a jxH2b jxH2c jxH2d */
+       movaps [esp + jxH2], xmm4
+       movhlps  xmm5, xmm6     /* xmm5= jyH2a jyH2b jyH2c jyH2d */
+       movaps [esp + jyH2], xmm5
+
+       /* z coordinates: a and c go to element 0 via movss, b and d arrive */
+       /* in element 3 via movhps (which loads the y,z pair into the high half) */
+       movss  xmm0, [esi + eax*4 + 8]
+       movss  xmm1, [esi + eax*4 + 20]
+       movss  xmm2, [esi + eax*4 + 32]
+
+       movss  xmm3, [esi + ecx*4 + 8]
+       movss  xmm4, [esi + ecx*4 + 20]
+       movss  xmm5, [esi + ecx*4 + 32]
+
+       movhps xmm0, [esi + ebx*4 + 4]
+       movhps xmm1, [esi + ebx*4 + 16]
+       movhps xmm2, [esi + ebx*4 + 28]
+
+       movhps xmm3, [esi + edx*4 + 4]
+       movhps xmm4, [esi + edx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 28]
+
+       /* pick elements 0 and 3 of each source: result = jz a b c d */
+       shufps xmm0, xmm3, 0b11001100
+       shufps xmm1, xmm4, 0b11001100
+       shufps xmm2, xmm5, 0b11001100
+       movaps [esp + jzO],  xmm0
+       movaps [esp + jzH1],  xmm1
+       movaps [esp + jzH2],  xmm2
+
+       /* distance vectors (i - j) and squared distances: O-O and O-H1 */
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixO]
+       movaps xmm4, [esp + iyO]
+       movaps xmm5, [esp + izO]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxOH1], xmm3
+       movaps [esp + dyOH1], xmm4
+       movaps [esp + dzOH1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOO], xmm0
+       movaps [esp + rsqOH1], xmm3
+
+       /* O-H2 and H1-O */
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       subps  xmm3, [esp + jxO]
+       subps  xmm4, [esp + jyO]
+       subps  xmm5, [esp + jzO]
+       movaps [esp + dxOH2], xmm0
+       movaps [esp + dyOH2], xmm1
+       movaps [esp + dzOH2], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1O], xmm3
+       movaps [esp + dyH1O], xmm4
+       movaps [esp + dzH1O], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOH2], xmm0
+       movaps [esp + rsqH1O], xmm3
+
+       /* H1-H1 and H1-H2 */
+       movaps xmm0, [esp + ixH1]
+       movaps xmm1, [esp + iyH1]
+       movaps xmm2, [esp + izH1]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH1]
+       subps  xmm1, [esp + jyH1]
+       subps  xmm2, [esp + jzH1]
+       subps  xmm3, [esp + jxH2]
+       subps  xmm4, [esp + jyH2]
+       subps  xmm5, [esp + jzH2]
+       movaps [esp + dxH1H1], xmm0
+       movaps [esp + dyH1H1], xmm1
+       movaps [esp + dzH1H1], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1H2], xmm3
+       movaps [esp + dyH1H2], xmm4
+       movaps [esp + dzH1H2], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqH1H1], xmm0
+       movaps [esp + rsqH1H2], xmm3
+
+       /* H2-O and H2-H1 */
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxH2O], xmm0
+       movaps [esp + dyH2O], xmm1
+       movaps [esp + dzH2O], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH2H1], xmm3
+       movaps [esp + dyH2H1], xmm4
+       movaps [esp + dzH2H1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       /* the H2-H1 sum is built in xmm4 (not xmm3): xmm4 feeds rsqrtps below */
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       movaps [esp + rsqH2O], xmm0
+       movaps [esp + rsqH2H1], xmm4
+
+       /* H2-H2 */
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       movaps [esp + dxH2H2], xmm0
+       movaps [esp + dyH2H2], xmm1
+       movaps [esp + dzH2H2], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2
+       movaps [esp + rsqH2H2], xmm0
+
+       /* start doing invsqrt: rsqH2H2 is in xmm0, rsqH2H1 is in xmm4 */
+       /* each pair below refines the rsqrtps estimate lu as */
+       /* rinv = half * lu * (three - rsq*lu*lu) */
+       /* (a Newton-Raphson step, assuming half=0.5 and three=3.0) */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvH2H2 */
+       mulps   xmm7, [esp + half] /* rinvH2H1 */
+       movaps  [esp + rinvH2H2], xmm3
+       movaps  [esp + rinvH2H1], xmm7
+
+       rsqrtps xmm1, [esp + rsqOO]
+       rsqrtps xmm5, [esp + rsqOH1]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOO]
+       mulps   xmm5, [esp + rsqOH1]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvOO */
+       mulps   xmm7, [esp + half] /* rinvOH1 */
+       movaps  [esp + rinvOO], xmm3
+       movaps  [esp + rinvOH1], xmm7
+
+       rsqrtps xmm1, [esp + rsqOH2]
+       rsqrtps xmm5, [esp + rsqH1O]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOH2]
+       mulps   xmm5, [esp + rsqH1O]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvOH2 */
+       mulps   xmm7, [esp + half] /* rinvH1O */
+       movaps  [esp + rinvOH2], xmm3
+       movaps  [esp + rinvH1O], xmm7
+
+       rsqrtps xmm1, [esp + rsqH1H1]
+       rsqrtps xmm5, [esp + rsqH1H2]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqH1H1]
+       mulps   xmm5, [esp + rsqH1H2]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvH1H1 */
+       mulps   xmm7, [esp + half] /* rinvH1H2 */
+       movaps  [esp + rinvH1H1], xmm3
+       movaps  [esp + rinvH1H2], xmm7
+
+       rsqrtps xmm1, [esp + rsqH2O]
+       movaps  xmm2, xmm1
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, [esp + rsqH2O]
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2
+       mulps   xmm3, [esp + half] /* rinvH2O */
+       movaps  [esp + rinvH2O], xmm3
+
+       /* For every pair below: vcoul = rinv*qq, fs = rinv*rinv*vcoul. */
+       /* The i force gains fs*d, the j force loses fs*d. */
+       /* xmm7 carries the running vctot through all nine interactions. */
+
+       /* start with OO interaction */
+       movaps xmm0, [esp + rinvOO]
+       movaps xmm7, xmm0
+       mulps  xmm0, xmm0
+       mulps  xmm7, [esp + qqOO]
+       mulps  xmm0, xmm7       /* fsOO */
+       addps  xmm7, [esp + vctot] /* xmm7 = vctot + vcoul(OO) */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOO]
+       mulps xmm1, [esp + dyOO]
+       mulps xmm2, [esp + dzOO]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H1 interaction */
+       movaps xmm0, [esp + rinvOH1]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqOH]
+       mulps xmm0, xmm1        /* fsOH1  */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH1]
+       mulps xmm1, [esp + dyOH1]
+       mulps xmm2, [esp + dzOH1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H2 interaction */
+       movaps xmm0, [esp + rinvOH2]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqOH]
+       mulps xmm0, xmm1        /* fsOH2 */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH2]
+       mulps xmm1, [esp + dyOH2]
+       mulps xmm2, [esp + dzOH2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* H1-O interaction */
+       movaps xmm0, [esp + rinvH1O]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqOH]
+       mulps xmm0, xmm1        /* fsH1O */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH1O]
+       mulps xmm1, [esp + dyH1O]
+       mulps xmm2, [esp + dzH1O]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H1 interaction */
+       movaps xmm0, [esp + rinvH1H1]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqHH]
+       mulps xmm0, xmm1        /* fsH1H1 */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH1H1]
+       mulps xmm1, [esp + dyH1H1]
+       mulps xmm2, [esp + dzH1H1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H2 interaction */
+       movaps xmm0, [esp + rinvH1H2]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqHH]
+       mulps xmm0, xmm1        /* fsH1H2 */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH1H2]
+       mulps xmm1, [esp + dyH1H2]
+       mulps xmm2, [esp + dzH1H2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H2-O interaction */
+       movaps xmm0, [esp + rinvH2O]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqOH]
+       mulps xmm0, xmm1        /* fsH2O */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH2O]
+       mulps xmm1, [esp + dyH2O]
+       mulps xmm2, [esp + dzH2O]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H1 interaction */
+       movaps xmm0, [esp + rinvH2H1]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqHH]
+       mulps xmm0, xmm1        /* fsH2H1 */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH2H1]
+       mulps xmm1, [esp + dyH2H1]
+       mulps xmm2, [esp + dzH2H1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H2 interaction */
+       movaps xmm0, [esp + rinvH2H2]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqHH]
+       mulps xmm0, xmm1        /* fsH2H2 */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps [esp + vctot], xmm7 /* last pair: write vctot back */
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH2H2]
+       mulps xmm1, [esp + dyH2H2]
+       mulps xmm2, [esp + dzH2H2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       mov edi, [ebp +faction]
+
+       /* Did all interactions - now update j forces */
+       /* 4 j waters with three atoms each - first do a & b j particles */
+       /* each water's 9 force floats are updated as 4 + 4 (movups) + 1 (movss) */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */
+       unpcklps xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjxOb  fjyOb */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOb  fjyOb */
+       unpcklps xmm1, xmm2        /* xmm1= fjzOa  fjxH1a fjzOb  fjxH1b */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpcklps xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1b fjzH1b */
+       unpcklps xmm5, xmm6        /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjzOa  fjxH1a */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOb  fjyOb  fjzOb  fjxH1b */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+       movups   xmm1, [edi + eax*4]
+       movups   xmm2, [edi + eax*4 + 16]
+       movups   xmm5, [edi + ebx*4]
+       movups   xmm6, [edi + ebx*4 + 16]
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + eax*4 + 32]
+       movss    xmm3, [edi + ebx*4 + 32]
+
+       movaps   xmm4, [esp + fjzH2] /* element 0 = fjzH2a */
+       movaps   xmm7, xmm4
+       shufps   xmm7, xmm7, 1       /* element 0 = fjzH2b */
+
+       movups   [edi + eax*4],     xmm1
+       movups   [edi + eax*4 + 16],xmm2
+       movups   [edi + ebx*4],     xmm5
+       movups   [edi + ebx*4 + 16],xmm6
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + eax*4 + 32], xmm0
+       movss    [edi + ebx*4 + 32], xmm3
+
+       /* then do the second pair (c & d) */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */
+       unpckhps xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjxOd  fjyOd */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOd  fjyOd */
+       unpckhps xmm1, xmm2        /* xmm1= fjzOc  fjxH1c fjzOd  fjxH1d */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpckhps xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1d fjzH1d */
+       unpckhps xmm5, xmm6        /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjzOc  fjxH1c */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOd  fjyOd  fjzOd  fjxH1d */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c  */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+       movups   xmm1, [edi + ecx*4]
+       movups   xmm2, [edi + ecx*4 + 16]
+       movups   xmm5, [edi + edx*4]
+       movups   xmm6, [edi + edx*4 + 16]
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + ecx*4 + 32]
+       movss    xmm3, [edi + edx*4 + 32]
+
+       movaps   xmm4, [esp + fjzH2]
+       movaps   xmm7, xmm4
+       shufps   xmm4, xmm4, 0b10    /* element 0 = fjzH2c */
+       shufps   xmm7, xmm7, 0b11    /* element 0 = fjzH2d */
+       movups   [edi + ecx*4],     xmm1
+       movups   [edi + ecx*4 + 16],xmm2
+       movups   [edi + edx*4],     xmm5
+       movups   [edi + edx*4 + 16],xmm6
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + ecx*4 + 32], xmm0
+       movss    [edi + edx*4 + 32], xmm3
+
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk], 4
+       jl    i1030_single_check
+       jmp   i1030_unroll_loop
+i1030_single_check:
+       /* Undo the -4 bias on innerk left by the unrolled-loop bookkeeping. */
+       /* A non-zero result means j waters remain for the single loop. */
+       /* "dword ptr" is required: a memory/immediate add has no register */
+       /* operand from which the assembler could take the operand size. */
+       add   dword ptr [esp + innerk], 4
+       jnz   i1030_single_loop
+       jmp   i1030_updateouterdata
+i1030_single_loop:
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       add   [esp + innerjjnr],  4     
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]  
+
+       /* fetch j coordinates */
+       xorps xmm3, xmm3
+       xorps xmm4, xmm4
+       xorps xmm5, xmm5
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + eax*4 + 4]
+       movss xmm5, [esi + eax*4 + 8]
+
+       movlps xmm6, [esi + eax*4 + 12]
+       movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+       /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+       movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+       shufps xmm6, xmm6, 0b11011000    /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+       movlhps xmm3, xmm6              /* xmm3= jxO   0  jxH1 jxH2 */
+       movaps  xmm0, [esp + ixO]     
+       movaps  xmm1, [esp + iyO]
+       movaps  xmm2, [esp + izO]       
+       shufps  xmm4, xmm6, 0b11100100 /* xmm4= jyO   0   jyH1 jyH2 */
+       shufps xmm5, xmm7, 0b11000100  /* xmm5= jzO   0   jzH1 jzH2 */
+       /* store all j coordinates in jO */ 
+       movaps [esp + jxO], xmm3
+       movaps [esp + jyO], xmm4
+       movaps [esp + jzO], xmm5
+       subps  xmm0, xmm3
+       subps  xmm1, xmm4
+       subps  xmm2, xmm5
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2        /* have rsq in xmm0 */
+       
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       movaps  xmm2, xmm1      
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, xmm0
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2                                                      
+       mulps   xmm3, [esp + half] /* rinv iO - j water */
+
+       xorps   xmm1, xmm1
+       movaps  xmm0, xmm3
+       xorps   xmm4, xmm4
+       mulps   xmm0, xmm0      /* xmm0=rinvsq */
+       /* fetch charges to xmm4 (temporary) */
+       movss   xmm4, [esp + qqOO]
+
+       movhps  xmm4, [esp + qqOH]
+
+       mulps   xmm3, xmm4      /* xmm3=vcoul */
+       mulps   xmm0, xmm3      /* total fscal */
+       addps   xmm3, [esp + vctot]
+       movaps  [esp + vctot], xmm3     
+
+       movaps  xmm1, xmm0
+       movaps  xmm2, xmm0
+       mulps   xmm0, [esp + dxOO]
+       mulps   xmm1, [esp + dyOO]
+       mulps   xmm2, [esp + dzOO]
+       /* initial update for j forces */
+       xorps   xmm3, xmm3
+       xorps   xmm4, xmm4
+       xorps   xmm5, xmm5
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixO]
+       addps   xmm1, [esp + fiyO]
+       addps   xmm2, [esp + fizO]
+       movaps  [esp + fixO], xmm0
+       movaps  [esp + fiyO], xmm1
+       movaps  [esp + fizO], xmm2
+
+       
+       /* done with i O. Now do i H1 & H2 simultaneously; first get i particle coords: */
+       movaps  xmm0, [esp + ixH1]
+       movaps  xmm1, [esp + iyH1]
+       movaps  xmm2, [esp + izH1]      
+       movaps  xmm3, [esp + ixH2] 
+       movaps  xmm4, [esp + iyH2] 
+       movaps  xmm5, [esp + izH2] 
+       subps   xmm0, [esp + jxO]
+       subps   xmm1, [esp + jyO]
+       subps   xmm2, [esp + jzO]
+       subps   xmm3, [esp + jxO]
+       subps   xmm4, [esp + jyO]
+       subps   xmm5, [esp + jzO]
+       movaps [esp + dxH1O], xmm0
+       movaps [esp + dyH1O], xmm1
+       movaps [esp + dzH1O], xmm2
+       movaps [esp + dxH2O], xmm3
+       movaps [esp + dyH2O], xmm4
+       movaps [esp + dzH2O], xmm5
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       mulps xmm3, xmm3
+       mulps xmm4, xmm4
+       mulps xmm5, xmm5
+       addps xmm0, xmm1
+       addps xmm4, xmm3
+       addps xmm0, xmm2        /* have rsqH1 in xmm0 */
+       addps xmm4, xmm5        /* have rsqH2 in xmm4 */
+
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinv H1 - j water */
+       mulps   xmm7, [esp + half] /* rinv H2 - j water */ 
+
+       /* assemble charges in xmm6 */
+       xorps   xmm6, xmm6
+       /* do coulomb interaction */
+       movaps  xmm0, xmm3
+       movss   xmm6, [esp + qqOH]
+       movaps  xmm4, xmm7
+       movhps  xmm6, [esp + qqHH]
+       mulps   xmm0, xmm0      /* rinvsq */
+       mulps   xmm4, xmm4      /* rinvsq */
+       mulps   xmm3, xmm6      /* vcoul */
+       mulps   xmm7, xmm6      /* vcoul */
+       movaps  xmm2, xmm3
+       addps   xmm2, xmm7      /* total vcoul */
+       mulps   xmm0, xmm3      /* fscal */
+       
+       addps   xmm2, [esp + vctot]
+       mulps   xmm7, xmm4      /* fscal */
+       movaps  [esp + vctot], xmm2
+       movaps  xmm1, xmm0
+       movaps  xmm2, xmm0
+       mulps   xmm0, [esp + dxH1O]
+       mulps   xmm1, [esp + dyH1O]
+       mulps   xmm2, [esp + dzH1O]
+       /* update forces H1 - j water */
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH1]
+       addps   xmm1, [esp + fiyH1]
+       addps   xmm2, [esp + fizH1]
+       movaps  [esp + fixH1], xmm0
+       movaps  [esp + fiyH1], xmm1
+       movaps  [esp + fizH1], xmm2
+       /* do forces H2 - j water */
+       movaps xmm0, xmm7
+       movaps xmm1, xmm7
+       movaps xmm2, xmm7
+       mulps   xmm0, [esp + dxH2O]
+       mulps   xmm1, [esp + dyH2O]
+       mulps   xmm2, [esp + dzH2O]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       mov     esi, [ebp + faction]
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH2]
+       addps   xmm1, [esp + fiyH2]
+       addps   xmm2, [esp + fizH2]
+       movaps  [esp + fixH2], xmm0
+       movaps  [esp + fiyH2], xmm1
+       movaps  [esp + fizH2], xmm2
+
+       /* update j water forces from local variables */
+       movlps  xmm0, [esi + eax*4]
+       movlps  xmm1, [esi + eax*4 + 12]
+       movhps  xmm1, [esi + eax*4 + 24]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       movaps  xmm6, xmm5
+       movaps  xmm7, xmm5
+       shufps  xmm6, xmm6, 0b10
+       shufps  xmm7, xmm7, 0b11
+       addss   xmm5, [esi + eax*4 + 8]
+       addss   xmm6, [esi + eax*4 + 20]
+       addss   xmm7, [esi + eax*4 + 32]
+       movss   [esi + eax*4 + 8], xmm5
+       movss   [esi + eax*4 + 20], xmm6
+       movss   [esi + eax*4 + 32], xmm7
+       movaps   xmm5, xmm3
+       unpcklps xmm3, xmm4
+       unpckhps xmm5, xmm4
+       addps    xmm0, xmm3
+       addps    xmm1, xmm5
+       movlps  [esi + eax*4], xmm0 
+       movlps  [esi + eax*4 + 12], xmm1 
+       movhps  [esi + eax*4 + 24], xmm1 
+       
+       dec   dword ptr [esp + innerk]
+       jz    i1030_updateouterdata
+       jmp   i1030_single_loop
+i1030_updateouterdata:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO] 
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000      
+
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* increment fshift force */ 
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7       
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz i1030_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp i1030_outer
+i1030_end:
+       emms
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp, 1412
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+
+
+
+
+/*
+ * inl1100_sse - SSE inner loop: Coulomb plus Lennard-Jones interaction between
+ * each i particle and the j particles of its neighbour list.
+ *
+ * Syntax: GNU as, Intel operand order, 32-bit x86. All arguments are read from
+ * the caller's stack through ebp (offsets are the .equ constants below).
+ *
+ * Per i/j pair the code evaluates (four pairs at a time in the main loop):
+ *     rinv  = 1/sqrt(rsq)      (rsqrtps seed + one Newton-Raphson step)
+ *     vcoul = (facel*charge[i]) * charge[j] * rinv
+ *     vnb   = c12*rinv^12 - c6*rinv^6
+ *     fscal = (twelve*c12*rinv^12 - six*c6*rinv^6 + vcoul) * rinv^2
+ *     f     = fscal * (xi - xj);   faction[i] += f;   faction[j] -= f
+ * where c6/c12 are element 0/1 of nbfp[2*(ntype*type[i] + type[j])].
+ * The summed i force is also added to fshift[3*shift[n]], and the summed
+ * vcoul / vnb are added to Vc[gid[n]] / Vnb[gid[n]].
+ *
+ * Side effects visible to the caller: the argument slots nri, iinr, jindex,
+ * shift and gid on the stack are modified in place (the counter is
+ * decremented, the pointers are advanced by one element per outer iteration).
+ * nri must be at least 1: the loop body runs before the counter is tested.
+ *
+ * Registers: eax, ebx, ecx, edx, esi, edi are saved and restored.
+ * xmm0-xmm7 and mm0-mm3 are overwritten; emms is executed on entry and exit.
+ */
+.globl inl1100_sse
+       .type inl1100_sse,@function
+inl1100_sse:
+       /* stack offsets (relative to ebp) of the arguments */
+.equ           nri,            8       /* number of i particles (outer iterations) */
+.equ           iinr,           12      /* pointer: i particle index per outer iteration */
+.equ           jindex,         16      /* pointer: [n],[n+1] delimit the j list in jjnr */
+.equ           jjnr,           20      /* base of the j particle index list */
+.equ           shift,          24      /* pointer: shift index per outer iteration */
+.equ           shiftvec,       28      /* base of shift vectors, 3 floats each */
+.equ           fshift,         32      /* base of shift forces, 3 floats each */
+.equ           gid,            36      /* pointer: energy group index per outer iteration */
+.equ           pos,            40      /* base of coordinates, 3 floats per particle */
+.equ           faction,        44      /* base of forces, 3 floats per particle */
+.equ           charge,         48      /* base of charges, 1 float per particle */
+.equ           facel,          52      /* float passed by value; scales the i charge */
+.equ           Vc,             56      /* base of Coulomb energies, indexed by group */
+.equ           type,           60      /* base of particle types, 1 int per particle */
+.equ           ntype,          64      /* int passed by value */
+.equ           nbfp,           68      /* base of the (c6, c12) parameter table */
+.equ           Vnb,            72      /* base of Lennard-Jones energies, indexed by group */
+       /* stack offsets for local variables */
+       /* bottom of stack is 16-byte aligned for sse use (movaps) */
+.equ           ix,               0     /* i coordinates, broadcast to 4 lanes */
+.equ           iy,              16
+.equ           iz,              32
+.equ           iq,              48     /* facel*charge[i], broadcast to 4 lanes */
+       /* NOTE(review): dx is also an x86 register name; confirm that the */
+       /* assembler in use resolves [esp + dx] to this constant. */
+.equ           dx,              64     /* xi - xj for the current j group */
+.equ           dy,              80
+.equ           dz,              96
+.equ           c6,             112
+.equ           c12,            128
+.equ           six,            144     /* copy of sse_six */
+.equ           twelve,         160     /* copy of sse_twelve */
+.equ           vctot,          176     /* per-lane Coulomb energy accumulator */
+.equ           vnbtot,         192     /* per-lane Lennard-Jones energy accumulator */
+.equ           fix,            208     /* per-lane i force accumulators */
+.equ           fiy,            224
+.equ           fiz,            240
+.equ           half,           256     /* copy of sse_half */
+.equ           three,          272     /* copy of sse_three */
+.equ           is3,            288     /* 3*shift index (dword) */
+.equ           ii3,            292     /* 3*i particle index (dword) */
+.equ           ntia,           296     /* 2*ntype*type[i] (dword) */
+.equ           innerjjnr,      300     /* pointer to the next unread jjnr entry (dword) */
+.equ           innerk,         304     /* remaining j particle counter (dword) */
+.equ           salign,         308     /* bytes subtracted from esp for alignment (dword) */
+       push ebp
+       mov  ebp, esp
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub  esp, 312           /* local stack space */
+       /* round esp down to a multiple of 16 and remember the adjustment */
+       mov  eax, esp
+       and  eax, 0xf
+       sub  esp, eax
+       mov  [esp + salign], eax
+
+       emms
+
+       /* copy the constants to aligned stack slots (sources are read unaligned) */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movups xmm2, [sse_six]
+       movups xmm3, [sse_twelve]
+       movaps [esp + half],   xmm0
+       movaps [esp + three],  xmm1
+       movaps [esp + six],    xmm2
+       movaps [esp + twelve], xmm3
+
+       /* assume we have at least one i particle - start directly */
+i1100_outer:
+       mov   eax, [ebp + shift]        /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx = shift[n] */
+       add   dword ptr [ebp + shift], 4        /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*is */
+       mov   [esp + is3], ebx          /* store is3 */
+
+       mov   eax, [ebp + shiftvec]     /* eax = base of shiftvec[] */
+
+       /* xmm0-xmm2 (lane 0) = shift vector for this i particle */
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]         /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4 /* advance pointer */
+       mov   ebx, [ecx]                /* ebx = ii */
+
+       /* iq = facel*charge[ii] in all four lanes */
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]
+       mulss xmm3, [ebp + facel]
+       shufps xmm3, xmm3, 0
+
+       /* ntia = 2*ntype*type[ii]: row offset (in floats) into nbfp */
+       mov   edx, [ebp + type]
+       mov   edx, [edx + ebx*4]
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii = ii3 */
+       mov   eax, [ebp + pos]          /* eax = base of pos[] */
+
+       /* i position = pos[ii] + shift vector */
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+
+       /* broadcast lane 0 to all four lanes */
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+
+       /* clear vctot, vnbtot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                /* jindex[n] */
+       mov   edx, [eax + 4]            /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4
+       sub   edx, ecx                  /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax    /* pointer to jjnr[nj0] */
+       /* innerk is kept biased by -4: it is >= 0 while a full group of four remains */
+       /* (mov does not alter flags, so jge still tests the result of the sub) */
+       sub   edx, 4
+       mov   [esp + innerk], edx
+       jge   i1100_unroll_loop
+       jmp   i1100_finish_inner
+i1100_unroll_loop:
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]    /* pointer to jjnr[k] */
+       mov   eax, [edx]
+       mov   ebx, [edx + 4]
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]           /* eax-edx = jnr1-4 */
+       add   dword ptr [esp + innerjjnr], 16   /* advance pointer (unrolled 4) */
+
+       mov   esi, [ebp + charge]       /* base of charge[] */
+
+       movss xmm3, [esi + eax*4]       /* q1 */
+       movss xmm4, [esi + ecx*4]       /* q3 */
+       movss xmm6, [esi + ebx*4]       /* q2 */
+       movss xmm7, [esi + edx*4]       /* q4 */
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0            /* xmm3 = q1 q1 q2 q2 */
+       shufps xmm4, xmm7, 0            /* xmm4 = q3 q3 q4 q4 */
+       shufps xmm3, xmm4, 0b10001000   /* xmm3 = q1 q2 q3 q4 */
+       movd  mm0, eax                  /* use mmx registers as temp storage for jnr1-4 */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       /* eax-edx = 2*type[jnr] + ntia: float index of each j particle's nbfp pair */
+       mov   esi, [ebp + type]
+       mov   eax, [esi + eax*4]
+       mov   ebx, [esi + ebx*4]
+       mov   ecx, [esi + ecx*4]
+       mov   edx, [esi + edx*4]
+       mov   esi, [ebp + nbfp]
+       shl   eax, 1
+       shl   ebx, 1
+       shl   ecx, 1
+       shl   edx, 1
+       mov   edi, [esp + ntia]
+       add   eax, edi
+       add   ebx, edi
+       add   ecx, edi
+       add   edx, edi
+
+       /* xmm6 = pair1 | pair2, xmm7 = pair3 | pair4 (two floats each) */
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000   /* even elements: c6 of j1-j4 */
+       shufps xmm6, xmm7, 0b11011101   /* odd elements: c12 of j1-j4 */
+
+       movd  eax, mm0                  /* restore jnr1-4 */
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       mov   esi, [ebp + pos]          /* base of pos[] */
+
+       lea   eax, [eax + eax*2]        /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       mulps xmm3, xmm2                /* xmm3 = iq*qj */
+       lea   ecx, [ecx + ecx*2]        /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]
+
+       /* move four coordinates to xmm0-xmm2 */
+
+       movlps xmm4, [esi + eax*4]      /* x1 y1 */
+       movlps xmm5, [esi + ecx*4]      /* x3 y3 */
+       movss xmm2, [esi + eax*4 + 8]   /* z1 */
+       movss xmm6, [esi + ecx*4 + 8]   /* z3 */
+
+       movhps xmm4, [esi + ebx*4]      /* xmm4 = x1 y1 x2 y2 */
+       movhps xmm5, [esi + edx*4]      /* xmm5 = x3 y3 x4 y4 */
+
+       movss xmm0, [esi + ebx*4 + 8]   /* z2 */
+       movss xmm1, [esi + edx*4 + 8]   /* z4 */
+
+       shufps xmm2, xmm0, 0            /* xmm2 = z1 z1 z2 z2 */
+       shufps xmm6, xmm1, 0            /* xmm6 = z3 z3 z4 z4 */
+
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = z1 z2 z3 z4 */
+
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = x1 x2 x3 x4 */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = y1 y2 y3 y4 */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4, xmm4
+       mulps xmm5, xmm5
+       mulps xmm6, xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed (lu) in xmm5; refine: rinv = half*lu*(three - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0 - rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm1, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm1, [esp + c6]         /* vnb6 */
+       mulps  xmm2, [esp + c12]        /* vnb12 */
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       addps  xmm2, xmm3
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       addps  xmm3, [esp + vctot]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm3
+       movaps [esp + vnbtot], xmm5
+
+       mov    edi, [ebp + faction]     /* edi was used as scratch for ntia above */
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101
+
+       /* now xmm3/xmm4 contain fjx/fjy of j1-j4 (z is handled separately below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces: rotate tz so that lane 0 holds the value for j1..j4 in turn */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* lane 0 = tz of j2 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* lane 0 = tz of j3 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* lane 0 = tz of j4 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk], 4
+       jl    i1100_finish_inner
+       jmp   i1100_unroll_loop
+i1100_finish_inner:
+       /* undo the -4 bias: innerk = number of j particles left (0-3) */
+       /* check if at least two particles remain */
+       add   dword ptr [esp + innerk], 4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   i1100_dopair
+       jmp   i1100_checksingle
+i1100_dopair:
+       /* two j particles in lanes 0-1; charge, c6 and c12 are zero in lanes 2-3 */
+       mov   esi, [ebp + charge]
+
+       mov   ecx, [esp + innerjjnr]
+
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]
+       add   dword ptr [esp + innerjjnr], 8
+
+       xorps xmm3, xmm3
+       movss xmm3, [esi + eax*4]
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0b00001100   /* xmm3 = q1 0 q2 q2 */
+       shufps xmm3, xmm3, 0b01011000   /* xmm3(0,1) has the charges: q1 q2 0 0 */
+
+       mov   esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov   ecx, [esi + ecx*4]
+       mov   edx, [esi + edx*4]
+       mov   esi, [ebp + nbfp]
+       shl   ecx, 1
+       shl   edx, 1
+       mov   edi, [esp + ntia]
+       add   ecx, edi
+       add   edx, edi
+       movlps xmm6, [esi + ecx*4]
+       movhps xmm6, [esi + edx*4]      /* xmm6 = pair1 | pair2 */
+       mov   edi, [ebp + pos]
+       xorps  xmm7, xmm7
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       /* lanes 0,1 = c6 of j1, j2 */
+       shufps xmm6, xmm6, 0b1101       /* lanes 0,1 = c12 of j1, j2 */
+       movlhps xmm4, xmm7              /* zero lanes 2-3 */
+       movlhps xmm6, xmm7
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]
+       movhps xmm1, [edi + ebx*4]      /* xmm1 = x1 y1 x2 y2 */
+       movss xmm0, [edi + ebx*4 + 8]
+
+       mulps  xmm3, [esp + iq]
+
+       movlhps xmm3, xmm7              /* keep iq*qj in lanes 0-1, zero lanes 2-3 */
+
+       shufps xmm2, xmm0, 0            /* xmm2 = z1 z1 z2 z2 */
+
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* xmm2 = z1 z2 z1 z2 */
+
+       shufps xmm0, xmm0, 0b10001000   /* xmm0 = x1 x2 x1 x2 */
+       shufps xmm1, xmm1, 0b11011101   /* xmm1 = y1 y2 y1 y2 */
+
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4, xmm4
+       mulps xmm5, xmm5
+       mulps xmm6, xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed (lu) in xmm5; refine: rinv = half*lu*(three - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0 - rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm1, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       addps  xmm2, xmm3
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       addps  xmm3, [esp + vctot]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm3
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: first j particle uses lane 0 */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+
+       /* swap lanes 0 and 1 so the second j particle's force is in lane 0 */
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5
+
+i1100_checksingle:
+       /* an odd remainder means one last j particle */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz   i1100_dosingle
+       jmp   i1100_updateouterdata
+i1100_dosingle:
+       /* one j particle in lane 0; charge, c6 and c12 are zero in lanes 1-3 */
+       mov   esi, [ebp + charge]
+       mov   edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       xorps xmm3, xmm3
+       mov   eax, [ecx]
+       movss xmm3, [esi + eax*4]       /* xmm3(0) has the charge */
+
+       mov   esi, [ebp + type]
+       mov   ecx, eax
+       mov   ecx, [esi + ecx*4]
+       mov   esi, [ebp + nbfp]
+       shl   ecx, 1
+       add   ecx, [esp + ntia]
+       xorps  xmm6, xmm6
+       movlps xmm6, [esi + ecx*4]      /* xmm6 = c6 c12 0 0 */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* xmm4 = c6 0 0 0 */
+       shufps xmm6, xmm6, 0b11111101   /* xmm6 = c12 0 0 0 */
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       lea   eax, [eax + eax*2]
+
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]
+       movss xmm1, [edi + eax*4 + 4]
+       movss xmm2, [edi + eax*4 + 8]
+
+       mulps  xmm3, [esp + iq]
+
+       xorps   xmm7, xmm7
+
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4, xmm4
+       mulps xmm5, xmm5
+       mulps xmm6, xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed (lu) in xmm5; refine: rinv = half*lu*(three - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0 - rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm1, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       addps  xmm2, xmm3
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       addps  xmm3, [esp + vctot]
+
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm3
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+i1100_updateouterdata:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       /* horizontal sum: add the high half onto the low half ... */
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       /* ... then add lane 1 onto lane 0 */
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]          /* pointer to the group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid], 4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec   ecx
+       jecxz i1100_end
+       /* not last, iterate once more! */
+       mov   [ebp + nri], ecx
+       jmp   i1100_outer
+i1100_end:
+       emms
+       /* release the alignment padding and the locals, then restore registers */
+       mov  eax, [esp + salign]
+       add  esp, eax
+       add  esp, 312
+       pop  edi
+       pop  esi
+       pop  edx
+       pop  ecx
+       pop  ebx
+       pop  eax
+       leave
+       ret
+
+
+
+
+/*
+ * inl2100_sse - nonbonded inner loop computing a Coulomb term with
+ * krf/crf corrections plus Lennard-Jones, using SSE on four j particles
+ * per iteration (remainders handled two and then one at a time).
+ *
+ * Arguments are read from the stack relative to ebp (offsets below).
+ * eax, ebx, ecx, edx, esi and edi are saved on entry and restored on exit.
+ * The pointer arguments shift, iinr, jindex and gid and the counter nri
+ * are advanced/decremented in place in the argument area.
+ *
+ * For each of the nri i particles:
+ *   - the shift vector is added to the i coordinates, and the i charge
+ *     is multiplied by facel;
+ *   - for every j particle in jjnr[jindex[n]] .. jjnr[jindex[n+1]-1]:
+ *        vcoul = qq*(rinv + krf*rsq - crf)
+ *        vnb   = c12*rinv^12 - c6*rinv^6
+ *        fscal = (qq*(rinv - 2*krf*rsq) + 12*c12*rinv^12 - 6*c6*rinv^6)*rinv^2
+ *     and fscal*dr is subtracted from faction[j];
+ *   - the summed i force is added to faction[i] and to fshift[is], and
+ *     the summed energies are added to Vc[gid] and Vnb[gid].
+ *
+ * The outer loop body runs once before nri is tested, so nri must be >= 1.
+ */
+.globl inl2100_sse
+       .type inl2100_sse,@function
+inl2100_sse:
+       /* stack offsets (from ebp) for the arguments */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+       /* The krf/crf arguments get their own names: the local copies  */
+       /* below are also called krf/crf, and a second .equ would       */
+       /* silently replace the argument offset with the local offset.  */
+.equ           argkrf,         60
+.equ           argcrf,         64
+.equ           type,           68
+.equ           ntype,          72
+.equ           nbfp,           76
+.equ           Vnb,            80
+       /* stack offsets (from esp) for local variables */
+       /* bottom of stack is 16-byte aligned for sse use */
+.equ           ix,               0
+.equ           iy,              16
+.equ           iz,              32
+.equ           iq,              48
+       /* NOTE(review): dx is also a register name - confirm the      */
+       /* assembler resolves it as this constant inside [esp + dx].   */
+.equ           dx,              64
+.equ           dy,              80
+.equ           dz,              96
+.equ           c6,             112
+.equ           c12,            128
+.equ           six,            144
+.equ           twelve,         160
+.equ           vctot,          176
+.equ           vnbtot,         192
+.equ           fix,            208
+.equ           fiy,            224
+.equ           fiz,            240
+.equ           half,           256
+.equ           three,          272
+.equ           two,            288
+.equ           krf,            304
+.equ           crf,            320
+.equ           is3,            336
+.equ           ii3,            340
+.equ           ntia,           344
+.equ           innerjjnr,      348
+.equ           innerk,         352
+.equ           salign,         356
+       push ebp
+       mov ebp,esp
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub esp,  360           /* local stack space */
+       /* align esp down to 16 bytes; remember the adjustment in salign */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movups xmm2, [sse_six]
+       movups xmm3, [sse_twelve]
+       movups xmm4, [sse_two]
+       movss xmm5, [ebp + argkrf]      /* krf argument */
+       movss xmm6, [ebp + argcrf]      /* crf argument */
+
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+       movaps [esp + six],  xmm2
+       movaps [esp + twelve], xmm3
+       movaps [esp + two], xmm4
+       shufps xmm5, xmm5, 0            /* broadcast krf to all four lanes */
+       shufps xmm6, xmm6, 0            /* broadcast crf to all four lanes */
+       movaps [esp + krf], xmm5
+       movaps [esp + crf], xmm6
+
+       /* assume we have at least one i particle - start directly */
+.i2100_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]
+       mulss xmm3, [ebp + facel]
+       shufps xmm3, xmm3, 0            /* iq = facel*charge[ii] in all lanes */
+
+       mov   edx, [ebp + type]
+       mov   edx, [edx + ebx*4]
+       imul  edx, [ebp + ntype]
+       shl   edx, 1
+       mov   [esp + ntia], edx         /* ntia = 2*ntype*type[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+
+       /* clear vctot, vnbtot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms, minus 4 */
+       jge   .i2100_unroll_loop
+       jmp   .i2100_finish_inner
+.i2100_unroll_loop:
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]
+       mov   ebx, [edx + 4]
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0
+       shufps xmm4, xmm7, 0
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       /* look up c6/c12 at nbfp[ntia + 2*type[jnr]] for the four j particles */
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1
+       shl ebx, 1
+       shl ecx, 1
+       shl edx, 1
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000  /* xmm4 = the four c6 values */
+       shufps xmm6, xmm7, 0b11011101  /* xmm6 = the four c12 values */
+
+       movd  eax, mm0          /* restore jnr1-4 */
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       mulps xmm3, xmm2                 /* xmm3 = qq = iq*jq */
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]
+
+       /* move four coordinates to xmm0-xmm2 */
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000
+
+       shufps xmm0, xmm5, 0b10001000
+       shufps xmm1, xmm5, 0b11011101
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       movaps xmm7, [esp + krf]
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       mulps  xmm7, xmm4       /* xmm7=krsq */
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm6, xmm0
+       addps  xmm6, xmm7       /* xmm6=rinv+krsq */
+       movaps xmm1, xmm4
+       subps  xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+       mulps  xmm1, xmm4
+       mulps  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm6, xmm3       /* xmm6=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm7, [esp + two]
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* 12*vnb12-6*vnb6 */
+       subps  xmm0, xmm7       /* rinv-2*krsq */
+       mulps  xmm3, xmm0
+       addps  xmm2, xmm3
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       addps  xmm6, [esp + vctot]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm6
+       movaps [esp + vnbtot], xmm5
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101
+
+       /* now xmm3 and xmm4 contain fjx and fjy (z is handled below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* bring lane 1 (j2) to pos0 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* bring lane 2 (j3) to pos0 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* bring lane 3 (j4) to pos0 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i2100_finish_inner
+       jmp   .i2100_unroll_loop
+.i2100_finish_inner:
+       /* check if at least two particles remain */
+       add   dword ptr [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   .i2100_dopair
+       jmp   .i2100_checksingle
+.i2100_dopair:
+       mov esi, [ebp + charge]
+
+       mov   ecx, [esp + innerjjnr]
+
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]
+       add   dword ptr [esp + innerjjnr],  8
+
+       xorps xmm3, xmm3
+       movss xmm3, [esi + eax*4]
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0b00001100
+       shufps xmm3, xmm3, 0b01011000 /* xmm3(0,1) has the charges */
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       shl edx, 1
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]
+       movhps xmm6, [esi + edx*4]
+       mov edi, [ebp + pos]
+       xorps  xmm7,xmm7
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000
+       shufps xmm6, xmm6, 0b1101
+       movlhps xmm4, xmm7      /* zero the two unused upper lanes of c6 */
+       movlhps xmm6, xmm7      /* zero the two unused upper lanes of c12 */
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]
+
+       mulps  xmm3, [esp + iq]
+
+       movlhps xmm3, xmm7      /* zero qq in the two unused upper lanes */
+
+       shufps xmm2, xmm0, 0
+
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000
+
+       shufps xmm0, xmm0, 0b10001000
+       shufps xmm1, xmm1, 0b11011101
+
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       movaps xmm7, [esp + krf]
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       mulps  xmm7, xmm4       /* xmm7=krsq */
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm6, xmm0
+       addps  xmm6, xmm7       /* xmm6=rinv+krsq */
+       movaps xmm1, xmm4
+       subps  xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+       mulps  xmm1, xmm4
+       mulps  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm6, xmm3       /* xmm6=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm7, [esp + two]
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* 12*vnb12-6*vnb6 */
+       subps  xmm0, xmm7       /* rinv-2*krsq */
+       mulps  xmm3, xmm0
+       addps  xmm2, xmm3
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       addps  xmm6, [esp + vctot]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm6
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's - first j particle (lane 0) */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+
+       /* swap lanes 0 and 1 so the second j particle is in pos0 */
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5
+
+.i2100_checksingle:
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz    .i2100_dosingle
+       jmp    .i2100_updateouterdata
+.i2100_dosingle:
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       xorps xmm3, xmm3
+       mov   eax, [ecx]
+       movss xmm3, [esi + eax*4]       /* xmm3(0) has the charge */
+
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]
+       xorps  xmm6, xmm6
+       movlps xmm6, [esi + ecx*4]
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* c6 in pos0, zero elsewhere */
+       shufps xmm6, xmm6, 0b11111101   /* c12 in pos0, zero elsewhere */
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       lea   eax, [eax + eax*2]
+
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]
+       movss xmm1, [edi + eax*4 + 4]
+       movss xmm2, [edi + eax*4 + 8]
+
+       mulps  xmm3, [esp + iq]
+
+       xorps   xmm7, xmm7
+
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       movaps xmm7, [esp + krf]
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       mulps  xmm7, xmm4       /* xmm7=krsq */
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm6, xmm0
+       addps  xmm6, xmm7       /* xmm6=rinv+krsq */
+       movaps xmm1, xmm4
+       subps  xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+       mulps  xmm1, xmm4
+       mulps  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm6, xmm3       /* xmm6=vcoul */
+       mulps  xmm7, [esp + two]
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* 12*vnb12-6*vnb6 */
+       subps  xmm0, xmm7       /* rinv-2*krsq */
+       mulps  xmm3, xmm0
+       addps  xmm2, xmm3
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       addps  xmm6, [esp + vctot]
+
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm6
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+.i2100_updateouterdata:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i2100_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i2100_outer
+.i2100_end:
+       emms
+       /* undo the alignment adjustment, then release the locals */
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp,  360
+       pop edi
+       pop esi
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+
+
+
+.globl inl2000_sse
+       .type inl2000_sse,@function
+inl2000_sse:   
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           krf,            60      
+.equ           crf,            64
+       /* stack offsets for local variables */ 
+       /* bottom of stack is cache-aligned for sse use */
+.equ           ix,               0
+.equ           iy,              16
+.equ           iz,              32
+.equ           iq,              48
+.equ           dx,              64
+.equ           dy,              80
+.equ           dz,              96     
+.equ           vctot,          112
+.equ           fix,            128
+.equ           fiy,            144
+.equ           fiz,            160
+.equ           half,           176
+.equ           three,          192
+.equ           two,            208
+.equ           krf,            224      
+.equ           crf,            240      
+.equ           is3,            256
+.equ           ii3,            260
+.equ           innerjjnr,      264
+.equ           innerk,         268
+.equ           salign,         272                                                             
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp,  276           /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movups xmm4, [sse_two]
+       movss xmm5, [ebp + krf]
+       movss xmm6, [ebp + crf]
+       
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+       movaps [esp + two], xmm4
+       shufps xmm5, xmm5, 0
+       movaps [esp + krf], xmm5
+       shufps xmm6, xmm6, 0
+       movaps [esp + crf], xmm6
+
+       /* assume we have at least one i particle - start directly */   
+.i2000_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       mulss xmm3, [ebp + facel]
+       shufps xmm3, xmm3, 0
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+       
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear vctot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i2000_unroll_loop
+       jmp   .i2000_finish_inner
+.i2000_unroll_loop:    
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0 
+       shufps xmm4, xmm7, 0 
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */ 
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       mulps xmm3, xmm2
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000
+       
+       shufps xmm0, xmm5, 0b10001000
+       shufps xmm1, xmm5, 0b11011101           
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+       
+       movaps xmm7, [esp + krf]
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       mulps  xmm7, xmm4       /* xmm7=krsq */
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm6, xmm0
+       addps  xmm6, xmm7       /* xmm6=rinv+krsq */
+
+       subps  xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+
+       mulps  xmm6, xmm3       /* xmm6=vcoul=qq*(rinv+krsq) */
+       mulps  xmm7, [esp + two]
+
+       subps  xmm0, xmm7
+       mulps  xmm3, xmm0       
+       mulps  xmm4, xmm3       /* xmm4=total fscal */
+       addps  xmm6, [esp + vctot]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm6
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3-xmm5 contains fjx, fjy, fjz */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  4
+       jl    .i2000_finish_inner
+       jmp   .i2000_unroll_loop
+.i2000_finish_inner:
+       /* The unrolled loop exits once innerk has gone negative. Adding 4
+        * back restores the count of j particles still to be processed
+        * (presumably 0-3); bit 1 of that count says whether a pair remains.
+        * The operand size is spelled out because it cannot be inferred
+        * from a memory destination with an immediate source.
+        */
+       add   dword ptr [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   .i2000_dopair
+       jmp   .i2000_checksingle
+.i2000_dopair:
+       /* Process two leftover j particles together. Their data occupies SSE
+        * lanes 0-1; the charge products in lanes 2-3 are zeroed below so
+        * those lanes add nothing to the potential or the forces.
+        */
+       mov esi, [ebp + charge]
+
+       mov   ecx, [esp + innerjjnr]
+
+       mov   eax, [ecx]                /* eax = jnr1 */
+       mov   ebx, [ecx + 4]            /* ebx = jnr2 */
+       /* explicit operand size: it cannot be inferred from a memory
+        * destination combined with an immediate source */
+       add   dword ptr [esp + innerjjnr],  8
+
+       xorps xmm3, xmm3
+       movss xmm3, [esi + eax*4]
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0b00001100
+       shufps xmm3, xmm3, 0b01011000 /* xmm3(0,1) has the charges */
+
+       mov edi, [ebp + pos]
+
+       lea   eax, [eax + eax*2]        /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]      /* low half:  x1 y1 */
+       movss xmm2, [edi + eax*4 + 8]   /* z1 */
+       movhps xmm1, [edi + ebx*4]      /* high half: x2 y2 */
+       movss xmm0, [edi + ebx*4 + 8]   /* z2 */
+
+       mulps  xmm3, [esp + iq]         /* qq = iq*jq */
+
+       xorps  xmm7,xmm7
+       movlhps xmm3, xmm7              /* force qq in lanes 2-3 to zero */
+
+       shufps xmm2, xmm0, 0            /* xmm2 = z1 z1 z2 z2 */
+
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* xmm2 = z1 z2 z1 z2 */
+
+       shufps xmm0, xmm0, 0b10001000   /* xmm0 = x1 x2 x1 x2 */
+       shufps xmm1, xmm1, 0b11011101   /* xmm1 = y1 y2 y1 y2 */
+
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rinv = 0.5*lu*(3.0 - rsq*lu*lu): one refinement step applied to
+        * the rsqrtps lookup value lu */
+       movaps xmm7, [esp + krf]
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       mulps  xmm7, xmm4       /* xmm7=krsq */
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm6, xmm0
+       addps  xmm6, xmm7       /* xmm6=rinv+krsq */
+
+       subps  xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+
+       mulps  xmm6, xmm3       /* xmm6=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm7, [esp + two]        /* xmm7=2*krsq */
+
+       subps  xmm0, xmm7       /* xmm0=rinv-2*krsq */
+       mulps  xmm3, xmm0       /* xmm3=qq*(rinv-2*krsq) */
+
+       mulps  xmm4, xmm3       /* xmm4=total fscal */
+       addps  xmm6, [esp + vctot]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm6
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: first particle uses lane 0 of tx-tz */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+
+       /* swap lanes 0 and 1 so the second particle's force is in lane 0 */
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5
+
+.i2000_checksingle:                            
+       /* bit 0 of the leftover count: is there one last j particle? */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz    .i2000_dosingle
+       jmp    .i2000_updateouterdata
+.i2000_dosingle:                       
+       /* Process a single j particle. Only lane 0 carries its data; the
+        * charge product is zero in lanes 1-3 because movss from memory
+        * clears the upper lanes of xmm3.
+        */
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       xorps xmm3, xmm3
+       mov   eax, [ecx]
+       movss xmm3, [esi + eax*4]       /* xmm3(0) has the charge */            
+               
+       lea   eax, [eax + eax*2]        /* replace jnr with j3 */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]   
+ 
+       mulps  xmm3, [esp + iq]
+       
+       /* NOTE(review): xmm7 is reloaded from krf below before it is read,
+        * so this clear looks redundant */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rinv = 0.5*lu*(3.0 - rsq*lu*lu), lu being the rsqrtps lookup value */
+       movaps xmm7, [esp + krf]
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       mulps  xmm7, xmm4       /* xmm7=krsq */
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm6, xmm0
+       addps  xmm6, xmm7       /* xmm6=rinv+krsq */
+
+       subps  xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+
+       mulps  xmm6, xmm3       /* xmm6=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm7, [esp + two]
+
+       subps  xmm0, xmm7       /* xmm0=rinv-2*krsq */
+       mulps  xmm3, xmm0
+       mulps  xmm4, xmm3       /* xmm4=total fscal */
+       addps  xmm6, [esp + vctot]
+       
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm6
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj: subtract lane 0 of tx-tz from the j particle's force */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i2000_updateouterdata:
+       /* Reduce the per-lane i-force and potential accumulators to scalars,
+        * add them to faction[ii3], fshift[is3] and Vc[gid], then either
+        * start the next i particle or leave the routine.
+        */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0      /* low half of xmm3 = lanes 2-3 of xmm0 */
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 hold partial sums */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1    /* bring lane 1 down to lane 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* get group index for this i particle */
+       mov   edx, [ebp + gid]
+       mov   edx, [edx]
+       /* advance the gid pointer; the operand size is explicit because it
+        * cannot be inferred from a memory destination with an immediate */
+       add   dword ptr [ebp + gid],  4
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* horizontal sum of the four lanes */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* finish if last: the nri argument slot is used as a countdown */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i2000_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i2000_outer
+.i2000_end:
+       /* Epilogue: leave MMX state, release the locals and the alignment
+        * padding, restore the saved registers and return.
+        */
+       emms
+       mov eax, [esp + salign]         /* padding stored in the salign slot */
+       add esp, eax
+       /* NOTE(review): 276 must equal the local frame size reserved by this
+        * routine's prologue, which is outside this view - confirm */
+       add esp,  276
+       /* presumably the reverse of the prologue's push order - confirm */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+
+
+/* inl1110_sse: entry point. This part defines the symbolic frame offsets,
+ * builds an aligned local frame and copies the SSE constants into it.
+ */
+.globl inl1110_sse
+       .type inl1110_sse,@function
+inl1110_sse:   
+       /* offsets of the caller's arguments, used as [ebp + name] */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72      
+.equ           nsatoms,        76              
+       /* stack offsets for local variables, used as [esp + name] */ 
+       /* esp is rounded down to a 16-byte boundary in the prologue, so the
+        * 16-byte slots below can be accessed with movaps */
+.equ           ix,               0
+.equ           iy,              16
+.equ           iz,              32
+.equ           iq,              48
+.equ           dx,              64
+.equ           dy,              80
+.equ           dz,              96     
+.equ           c6,             112
+.equ           c12,            128
+.equ           two,            144
+.equ           six,            160
+.equ           twelve,         176              
+.equ           vctot,          192
+.equ           vnbtot,         208
+.equ           fix,            224
+.equ           fiy,            240
+.equ           fiz,            256
+.equ           half,           272
+.equ           three,          288
+       /* 4-byte scalar slots from here on */
+.equ           is3,            304
+.equ           ii3,            308
+.equ           shX,            312
+.equ           shY,            316
+.equ           shZ,            320
+.equ           ntia,           324     
+.equ           innerjjnr0,     328
+.equ           innerk0,        332
+.equ           innerjjnr,      336
+.equ           innerk,         340
+.equ           salign,         344                                                     
+.equ           nsvdwc,         348
+.equ           nscoul,         352
+.equ           nsvdw,          356
+.equ           solnr,          360             
+       push ebp
+       mov ebp,esp     
+       push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 364            /* local stack space */
+       /* round esp down to a multiple of 16 and remember the padding so
+        * that it can be released again later */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       /* copy the constants into the aligned frame; movups is used for the
+        * loads, so no alignment of the sse_* symbols is relied upon */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movups xmm3, [sse_six]
+       movups xmm4, [sse_twelve]
+       movaps [esp + half],  xmm0
+       movaps [esp + two], xmm1
+       movaps [esp + three], xmm2
+       movaps [esp + six],  xmm3
+       movaps [esp + twelve], xmm4
+
+       /* assume we have at least one i particle - start directly */   
+i1110_outer:
+       /* Per-neighbourlist setup: fetch the shift vector, the first atom
+        * index, the three atom counts from nsatoms[] and the j-list range,
+        * and clear the potential accumulators. Every pointer argument is
+        * advanced in place in its stack slot; those updates name their
+        * operand size explicitly because it cannot be inferred from a
+        * memory destination with an immediate source.
+        */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movlps xmm0, [eax + ebx*4]      /* x and y of the shift vector */
+       movss xmm1, [eax + ebx*4 + 8]   /* z of the shift vector */
+       movlps [esp + shX], xmm0        /* fills shX and the adjacent shY */
+       movss [esp + shZ], xmm1
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       /* three consecutive ints n0,n1,n2 are read from nsatoms[]:
+        *   nsvdwc = n1, nscoul = n2 - n1, nsvdw = n0 - n2
+        */
+       mov   eax, [ebp + nsatoms]
+       add   dword ptr [ebp + nsatoms],  12
+       mov   ecx, [eax]
+       mov   edx, [eax + 4]
+       mov   eax, [eax + 8]
+       sub   ecx, eax
+       sub   eax, edx
+
+       mov   [esp + nsvdwc], edx
+       mov   [esp + nscoul], eax
+       mov   [esp + nsvdw], ecx
+
+       /* clear potential */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       mov   [esp + solnr],  ebx       /* solnr = index of the first atom */
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2                     /* byte offset of jjnr[nj0] */
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+
+       /* skip the vdw+coulomb part when nsvdwc is zero */
+       mov   ecx, [esp + nsvdwc]
+       test  ecx, ecx
+       jnz   i1110_mno_vdwc
+       jmp   i1110_testcoul
+i1110_mno_vdwc:
+       /* Set up one i atom for the inner loop: take the next atom index
+        * from solnr, broadcast its scaled charge and shifted position to
+        * all four lanes, clear its force accumulators and prime the j-list
+        * pointer and counter.
+        */
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+       /* iq = facel*charge[ii] in every lane */
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       mulss xmm3, [ebp + facel]
+       shufps xmm3, xmm3, 0
+
+       /* ntia = 2*ntype*type[ii] */
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       /* i position = shift vector + pos[ii3 .. ii3+2] */
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+       
+       /* broadcast lane 0 to all four lanes */
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear i forces */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+       
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+       /* innerk is kept biased by -4; the mov below leaves the flags from
+        * this sub intact, so jge means at least four j particles remain */
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   i1110_unroll_vdwc_loop
+       jmp   i1110_finish_vdwc_inner
+i1110_unroll_vdwc_loop:
+       /* quad-unroll innerloop here: four j particles per pass, one per
+        * SSE lane, in the order eax, ebx, ecx, edx */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]
+       mov   ebx, [edx + 4]
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       /* operand size is explicit: it cannot be inferred from a memory
+        * destination with an immediate source */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0
+       shufps xmm4, xmm7, 0
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       /* index into nbfp[]: 2*type[jnr] + ntia */
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1
+       shl ebx, 1
+       shl ecx, 1
+       shl edx, 1
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       /* each 8-byte load fetches one c6,c12 pair */
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000   /* xmm4 = the four c6 values */
+       shufps xmm6, xmm7, 0b11011101   /* xmm6 = the four c12 values */
+
+       movd  eax, mm0          /* restore jnr1-4 */
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       mulps xmm3, xmm2                 /* qq = iq*jq */
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]
+
+       /* move four coordinates to xmm0-xmm2 */
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000    /* xmm2 = jz1-4 */
+
+       shufps xmm0, xmm5, 0b10001000    /* xmm0 = jx1-4 */
+       shufps xmm1, xmm5, 0b11011101    /* xmm1 = jy1-4 */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rinv = 0.5*lu*(3.0 - rsq*lu*lu): one refinement step applied to
+        * the rsqrtps lookup value lu */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm1, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm1, [esp + c6]         /* xmm1=vnb6 */
+       mulps  xmm2, [esp + c12]        /* xmm2=vnb12 */
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* 12*vnb12-6*vnb6 */
+       addps  xmm2, xmm3       /* ... + vcoul */
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       addps  xmm3, [esp + vctot]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm3
+       movaps [esp + vnbtot], xmm5
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101
+
+       /* now xmm3/xmm4 contain fjx and fjy; fjz is handled further down */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces: each shufps brings the next lane of tz down to
+        * lane 0 for the following subss */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4
+       jl    i1110_finish_vdwc_inner
+       jmp   i1110_unroll_vdwc_loop
+i1110_finish_vdwc_inner:
+       /* innerk is negative on arrival; adding 4 back gives the number of
+        * j particles still to be processed. Bit 1 of that count says
+        * whether a pair remains. The operand size is spelled out because
+        * it cannot be inferred from a memory destination with an immediate.
+        */
+       add   dword ptr [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   i1110_dopair_vdwc
+       jmp   i1110_checksingle_vdwc
+i1110_dopair_vdwc:
+       /* Process two leftover j particles together. Their data occupies SSE
+        * lanes 0-1; qq, c6 and c12 are zeroed in lanes 2-3 so those lanes
+        * add nothing to the potentials or the forces.
+        */
+       mov esi, [ebp + charge]
+
+       mov   ecx, [esp + innerjjnr]
+
+       mov   eax, [ecx]                /* eax = jnr1 */
+       mov   ebx, [ecx + 4]            /* ebx = jnr2 */
+       /* explicit operand size: it cannot be inferred from a memory
+        * destination combined with an immediate source */
+       add   dword ptr [esp + innerjjnr],  8
+
+       movss xmm3, [esi + eax*4]
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       /* index into nbfp[]: 2*type[jnr] + ntia */
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       shl edx, 1
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]      /* c6,c12 for j1 */
+       movhps xmm6, [esi + edx*4]      /* c6,c12 for j2 */
+       mov edi, [ebp + pos]
+       xorps  xmm7,xmm7
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       /* c6 of j1,j2 in lanes 0-1 */
+       shufps xmm6, xmm6, 0b1101       /* c12 of j1,j2 in lanes 0-1 */
+       movlhps xmm4, xmm7              /* zero lanes 2-3 */
+       movlhps xmm6, xmm7
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       lea   eax, [eax + eax*2]        /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]      /* low half:  x1 y1 */
+       movss xmm2, [edi + eax*4 + 8]   /* z1 */
+       movhps xmm1, [edi + ebx*4]      /* high half: x2 y2 */
+       movss xmm0, [edi + ebx*4 + 8]   /* z2 */
+
+       mulps  xmm3, [esp + iq]         /* qq = iq*jq */
+
+       movlhps xmm3, xmm7              /* force qq in lanes 2-3 to zero */
+
+       shufps xmm2, xmm0, 0            /* xmm2 = z1 z1 z2 z2 */
+
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* xmm2 = z1 z2 z1 z2 */
+
+       shufps xmm0, xmm0, 0b10001000   /* xmm0 = x1 x2 x1 x2 */
+       shufps xmm1, xmm1, 0b11011101   /* xmm1 = y1 y2 y1 y2 */
+
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rinv = 0.5*lu*(3.0 - rsq*lu*lu): one refinement step applied to
+        * the rsqrtps lookup value lu */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm1, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm1, [esp + c6]         /* xmm1=vnb6 */
+       mulps  xmm2, [esp + c12]        /* xmm2=vnb12 */
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* 12*vnb12-6*vnb6 */
+       addps  xmm2, xmm3       /* ... + vcoul */
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       addps  xmm3, [esp + vctot]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm3
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: first particle uses lane 0 of tx-tz */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+
+       /* swap lanes 0 and 1 so the second particle's force is in lane 0 */
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5
+
+i1110_checksingle_vdwc:                                
+       /* bit 0 of the leftover count: is there one last j particle? */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz    i1110_dosingle_vdwc
+       jmp    i1110_updateouterdata_vdwc
+i1110_dosingle_vdwc:           /* last single j atom: Coulomb + LJ, only element 0 of each vector carries j data */
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        /* eax = j atom index read through the jjnr pointer */
+       movss xmm3, [esi + eax*4]       /* xmm3(0) has the charge */    
+       /* load the two nbfp floats at index 2*type[j] + ntia */
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]
+       xorps  xmm6, xmm6               /* clear so the upper half stays zero after movlps */
+       movlps xmm6, [esi + ecx*4]
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* xmm4 = (first float, 0, 0, 0) */
+       shufps xmm6, xmm6, 0b11111101   /* xmm6 = (second float, 0, 0, 0) */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]        /* eax = 3*j, used to index pos[] and faction[] */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]   
+ 
+       mulps  xmm3, [esp + iq]         /* xmm3 = qj * iq in element 0 */
+       
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed (lu) in xmm5; refined below as half*lu*(three - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three - rsq*lu*lu (presumably 3.0; confirm where `three` is initialised) */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm1, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm1, [esp + c6]         /* xmm1 = vnb6 */
+       mulps  xmm2, [esp + c12]        /* xmm2 = vnb12 */
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       addps  xmm2, xmm3       /* add vcoul to the scaled LJ terms */
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       addps  xmm3, [esp + vctot]
+       
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm3
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       /* scalar read-modify-write: faction[3*j .. 3*j+2] -= (tx, ty, tz) */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+i1110_updateouterdata_vdwc:    /* reduce the 4-wide i-force accumulators and add them to faction[] and fshift[] */
+       mov   ecx, [esp + ii3]          /* ecx = index of the i atom's x component */
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]          /* edx = index into fshift[] */
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0      /* low half of xmm3 = high half of xmm0 */
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* elements 0 and 1 of xmm0-xmm2 now hold the two partial sums */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1    /* move element 1 down to element 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+
+       /* loop back to mno while the nsvdwc counter has not reached zero */
+       dec dword ptr [esp + nsvdwc]
+       jz  i1110_testcoul
+       jmp i1110_mno_vdwc
+i1110_testcoul:                /* are there any nscoul iterations to run? */
+       mov  ecx, [esp + nscoul]
+       test ecx, ecx           /* ZF set when nscoul == 0 */
+       jz   i1110_testvdw      /* none: skip the coulomb-only section */
+       /* otherwise fall through into i1110_mno_coul */
+i1110_mno_coul:                /* set up one i atom for the coulomb-only inner loop */
+       mov   ebx,  [esp + solnr]       /* ebx = current atom index; solnr is post-incremented */
+       inc   dword ptr [esp + solnr]
+
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       mulss xmm3, [ebp + facel]       /* xmm3 = charge[ii] * facel */
+       shufps xmm3, xmm3, 0            /* broadcast to all four elements */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       addss xmm0, [eax + ebx*4]       /* shifted i coordinates: sh + pos[ii3..ii3+2] */
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+       
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear i forces */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       mov   ecx, [esp + innerjjnr0]   /* restart the j list from its saved start */
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* innerk = innerk0 - 4; mov keeps the flags from sub */
+       jge   i1110_unroll_coul_loop    /* at least four j atoms: run the unrolled loop */
+       jmp   i1110_finish_coul_inner
+
+i1110_unroll_coul_loop:        /* coulomb-only inner loop, four j atoms per pass */
+       /* quad-unrolled innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4); size made explicit */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm5, [esp + iq]
+       shufps xmm3, xmm6, 0 
+       shufps xmm4, xmm7, 0 
+       shufps xmm3, xmm4, 0b10001000   /* xmm3 = (q1, q2, q3, q4) */
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       mulps xmm3, xmm5                 /* xmm3 = qj * iq */
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]       /* xmm4 = (x1, y1, x2, y2) */
+       movhps xmm5, [esi + edx*4]       /* xmm5 = (x3, y3, x4, y4) */
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000    /* xmm2 = (z1, z2, z3, z4) */
+       
+       shufps xmm0, xmm5, 0b10001000    /* xmm0 = (x1, x2, x3, x4) */
+       shufps xmm1, xmm5, 0b11011101    /* xmm1 = (y1, y2, y3, y4) */
+
+       mov    edi, [ebp + faction]
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed (lu) in xmm5; refined below as half*lu*(three - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three - rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+
+       movaps xmm5, [esp + vctot]
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm4, xmm3       /* xmm4=fscal */
+       addps  xmm5, xmm3
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm5
+
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3/xmm4 contain fjx/fjy (fjz is handled separately below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* element 1 (tz2) -> element 0 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* element 2 (tz3) -> element 0 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* element 3 (tz4) -> element 0 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4   /* explicit size: a memory/immediate form is otherwise ambiguous */
+       jl    i1110_finish_coul_inner
+       jmp   i1110_unroll_coul_loop
+i1110_finish_coul_inner:       /* unrolled loop done: handle the 0-3 remaining j atoms */
+       /* check if at least two particles remain */
+       add   dword ptr [esp + innerk],  4   /* undo the loop's last -4; explicit size avoids an ambiguous operand */
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   i1110_dopair_coul
+       jmp   i1110_checksingle_coul
+i1110_dopair_coul:             /* two j atoms, coulomb only; upper two vector elements are masked by zero charge */
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   dword ptr [esp + innerjjnr],  8   /* consume two entries; explicit size avoids an ambiguous operand */
+
+       movss xmm3, [esi + eax*4]               
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0 
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]      /* xmm1 = (x1, y1, x2, y2) */
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       mulps  xmm3, [esp + iq]
+       xorps  xmm7,xmm7
+       movlhps xmm3, xmm7              /* zero elements 2 and 3 of qj*iq */
+       
+       shufps xmm2, xmm0, 0
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* xmm2 = (z1, z2, z1, z2) */
+       
+       shufps xmm0, xmm0, 0b10001000   /* xmm0 = (x1, x2, x1, x2) */
+       shufps xmm1, xmm1, 0b11011101   /* xmm1 = (y1, y2, y1, y2) */
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed (lu) in xmm5; refined below as half*lu*(three - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three - rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+
+       movaps xmm5, [esp + vctot]
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm4, xmm3       /* xmm4=fscal */
+       addps  xmm5, xmm3
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: first j atom uses element 0 */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       shufps  xmm0, xmm0, 0b11100001  /* swap elements 0 and 1: second atom's force to element 0 */
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+       
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+i1110_checksingle_coul:        /* is bit 0 of innerk set, i.e. one j atom left? */
+       mov   edx, [esp + innerk]
+       and   edx, 1            /* edx = innerk & 1, ZF set when even */
+       jz    i1110_updateouterdata_coul  /* nothing left: go write back i forces */
+       /* one atom left: fall through into i1110_dosingle_coul */
+i1110_dosingle_coul:           /* last single j atom, coulomb only; only element 0 carries j data */
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        /* eax = j atom index read through the jjnr pointer */
+       movss xmm3, [esi + eax*4]       /* xmm3(0) has the charge */    
+       
+       lea   eax, [eax + eax*2]        /* eax = 3*j, used to index pos[] and faction[] */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]   
+ 
+       mulps  xmm3, [esp + iq]         /* xmm3 = qj * iq in element 0 */
+       
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed (lu) in xmm5; refined below as half*lu*(three - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three - rsq*lu*lu (presumably 3.0; confirm where `three` is initialised) */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       mov    edi, [ebp + faction]
+       movaps xmm5, [esp + vctot]
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm4, xmm3       /* xmm4=fscal */
+       addps  xmm5, xmm3
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vctot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj: faction[3*j .. 3*j+2] -= (tx, ty, tz), scalar */ 
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+i1110_updateouterdata_coul:    /* reduce the 4-wide i-force accumulators and add them to faction[] and fshift[] */
+       mov   ecx, [esp + ii3]          /* ecx = index of the i atom's x component */
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]          /* edx = index into fshift[] */
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0      /* low half of xmm3 = high half of xmm0 */
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* elements 0 and 1 of xmm0-xmm2 now hold the two partial sums */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1    /* move element 1 down to element 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* loop back to mno while the nscoul counter has not reached zero */
+       dec dword ptr [esp + nscoul]
+       jz  i1110_testvdw
+       jmp i1110_mno_coul
+i1110_testvdw:                 /* are there any nsvdw iterations to run? */
+       mov  ecx, [esp + nsvdw]
+       test ecx, ecx           /* ZF set when nsvdw == 0 */
+       jz   i1110_last_mno     /* none: skip the LJ-only section */
+       /* otherwise fall through into i1110_mno_vdw */
+i1110_mno_vdw:                 /* set up one i atom for the LJ-only inner loop */
+       mov   ebx,  [esp + solnr]       /* ebx = current atom index; solnr is post-incremented */
+       inc   dword ptr [esp + solnr]
+
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx         /* ntia = 2 * ntype * type[ii] */
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx
+
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+
+       addss xmm0, [eax + ebx*4]       /* shifted i coordinates: sh + pos[ii3..ii3+2] */
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+       /* clear i forces */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       shufps xmm0, xmm0, 0            /* broadcast to all four elements */
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   ecx, [esp + innerjjnr0]   /* restart the j list from its saved start */
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* innerk = innerk0 - 4; mov keeps the flags from sub */
+       jge   i1110_unroll_vdw_loop     /* at least four j atoms: run the unrolled loop */
+       jmp   i1110_finish_vdw_inner
+i1110_unroll_vdw_loop:         /* LJ-only inner loop, four j atoms per pass */
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4); size made explicit */
+
+       /* NOTE(review): MMX registers are used below; an emms is needed before any x87 code - confirm it is issued outside this chunk */
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi            /* eax-edx = 2*type[j] + ntia, index into nbfp[] */
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000   /* xmm4 = first float of each pair (stored as c6) */
+       shufps xmm6, xmm7, 0b11011101   /* xmm6 = second float of each pair (stored as c12) */
+       
+       movd  eax, mm0          /* restore the four j indices */
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]       /* xmm4 = (x1, y1, x2, y2) */
+       movhps xmm5, [esi + edx*4]       /* xmm5 = (x3, y3, x4, y4) */
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000    /* xmm2 = (z1, z2, z3, z4) */
+       
+       shufps xmm0, xmm5, 0b10001000    /* xmm0 = (x1, x2, x3, x4) */
+       shufps xmm1, xmm5, 0b11011101    /* xmm1 = (y1, y2, y3, y4) */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed (lu) in xmm5; refined below as lu*(two - rsq*lu) */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3/xmm4 contain fjx/fjy (fjz is handled separately below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* element 1 (tz2) -> element 0 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* element 2 (tz3) -> element 0 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* element 3 (tz4) -> element 0 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4   /* explicit size: a memory/immediate form is otherwise ambiguous */
+       jl    i1110_finish_vdw_inner
+       jmp   i1110_unroll_vdw_loop
+i1110_finish_vdw_inner:        /* unrolled loop done: handle the 0-3 remaining j atoms */
+       /* check if at least two particles remain */
+       add   dword ptr [esp + innerk],  4   /* undo the loop's last -4; explicit size avoids an ambiguous operand */
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   i1110_dopair_vdw
+       jmp   i1110_checksingle_vdw
+i1110_dopair_vdw:              /* two j atoms, LJ only; upper two vector elements are masked by zeroed c6/c12 */
+
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   dword ptr [esp + innerjjnr],  8   /* consume two entries; explicit size avoids an ambiguous operand */
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi            /* ecx/edx = 2*type[j] + ntia, index into nbfp[] */
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]
+       movhps xmm6, [esi + edx*4]
+       mov edi, [ebp + pos]    
+       xorps  xmm7,xmm7
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       /* elements 0,1 = first float of each pair */
+       shufps xmm6, xmm6, 0b1101       /* elements 0,1 = second float of each pair */
+       movlhps xmm4, xmm7              /* zero elements 2 and 3 */
+       movlhps xmm6, xmm7
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]      /* xmm1 = (x1, y1, x2, y2) */
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       /* (a dead "movlhps xmm3, xmm7" was removed here: xmm3 is not read before it is reloaded from fix) */
+
+       shufps xmm2, xmm0, 0
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* xmm2 = (z1, z2, z1, z2) */
+       
+       shufps xmm0, xmm0, 0b10001000   /* xmm0 = (x1, x2, x1, x2) */
+       shufps xmm1, xmm1, 0b11011101   /* xmm1 = (y1, y2, y1, y2) */
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed (lu) in xmm5; refined below as lu*(two - rsq*lu) */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: first j atom uses element 0 */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       shufps  xmm0, xmm0, 0b11100001  /* swap elements 0 and 1: second atom's force to element 0 */
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+i1110_checksingle_vdw:         /* is bit 0 of innerk set, i.e. one j atom left? */
+       mov   edx, [esp + innerk]
+       and   edx, 1            /* edx = innerk & 1, ZF set when even */
+       jz    i1110_updateouterdata_vdw   /* nothing left: go write back i forces */
+       /* one atom left: fall through into i1110_dosingle_vdw */
+i1110_dosingle_vdw:            /* last single j atom, LJ only; only element 0 of each vector carries j data */
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        /* eax = j atom index read through the jjnr pointer */
+       /* load the two nbfp floats at index 2*type[j] + ntia */
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]
+       xorps  xmm6, xmm6               /* clear so the upper half stays zero after movlps */
+       movlps xmm6, [esi + ecx*4]
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* xmm4 = (first float, 0, 0, 0) */
+       shufps xmm6, xmm6, 0b11111101   /* xmm6 = (second float, 0, 0, 0) */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]        /* eax = 3*j, used to index pos[] and faction[] */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]   
+       
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed (lu) in xmm5; refined below as lu*(two - rsq*lu) */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]         /* xmm1 = vnb6 */
+       mulps  xmm2, [esp + c12]        /* xmm2 = vnb12 */
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       /* scalar read-modify-write: faction[3*j .. 3*j+2] -= (tx, ty, tz) */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+i1110_updateouterdata_vdw:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+       
+       /* loop back to mno */
+       dec dword ptr [esp + nsvdw]
+       jz  i1110_last_mno
+       jmp i1110_mno_vdw
+i1110_last_mno:        
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz i1110_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp i1110_outer
+i1110_end:
+       emms
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp, 364
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+
+/*
+ * inl1120_sse
+ *
+ * Inner loop computing Coulomb (1/r) and Lennard-Jones (c12/r^12 - c6/r^6)
+ * interactions between a three-site i molecule (sites called O, H1, H2 here,
+ * stored consecutively in pos[]) and a list of single j atoms.
+ * Lennard-Jones is evaluated for the O site only; H1/H2 are Coulomb-only.
+ *
+ * All arguments are read from the stack relative to ebp (offsets below).
+ * The argument slots nri, iinr, jindex, shift and gid are used as running
+ * counters/pointers and are overwritten while the routine executes.
+ *
+ * Per outer iteration the routine
+ *   - adds the accumulated i forces to faction[] and fshift[],
+ *   - subtracts the pair forces from faction[] of every j atom,
+ *   - adds the Coulomb energy to Vc[gid] and the LJ energy to Vnb[gid].
+ *
+ * Assumptions visible in the code:
+ *   - nri >= 1 (the first i particle is read unconditionally).
+ *   - The i charges (charge[ii], charge[ii+1]) and the LJ row offset ntia are
+ *     taken from the FIRST i particle only and reused for all others.
+ *
+ * Registers eax, ebx, ecx, edx, esi and edi are saved and restored.
+ * mm0-mm3 are used as scratch; emms is issued on entry and before return.
+ * The constants sse_half, sse_three, sse_six and sse_twelve are defined
+ * elsewhere in this file.
+ */
+.globl inl1120_sse
+       .type inl1120_sse,@function
+inl1120_sse:
+       /* stack offsets (from ebp) of the arguments */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68
+.equ           Vnb,            72
+       /* stack offsets for local variables */
+       /* bottom of stack is cache-aligned for sse use */
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           iqO,            144
+.equ           iqH,            160
+.equ           dxO,            176
+.equ           dyO,            192
+.equ           dzO,            208
+.equ           dxH1,           224
+.equ           dyH1,           240
+.equ           dzH1,           256
+.equ           dxH2,           272
+.equ           dyH2,           288
+.equ           dzH2,           304
+.equ           qqO,            320
+.equ           qqH,            336
+.equ           c6,             352
+.equ           c12,            368
+.equ           six,            384
+.equ           twelve,         400
+.equ           vctot,          416
+.equ           vnbtot,         432
+.equ           fixO,           448
+.equ           fiyO,           464
+.equ           fizO,           480
+.equ           fixH1,          496
+.equ           fiyH1,          512
+.equ           fizH1,          528
+.equ           fixH2,          544
+.equ           fiyH2,          560
+.equ           fizH2,          576
+.equ           fjx,            592
+.equ           fjy,            608
+.equ           fjz,            624
+.equ           half,           640
+.equ           three,          656
+.equ           is3,            672
+.equ           ii3,            676
+.equ           ntia,           680
+.equ           innerjjnr,      684
+.equ           innerk,         688
+.equ           salign,         692
+       push ebp
+       mov ebp,esp
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub esp, 696            /* local stack space */
+       /* round esp down to a 16-byte boundary so movaps on locals is legal; */
+       /* the adjustment is remembered in salign and undone in the epilogue  */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       /* copy the global constants into aligned local storage */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movups xmm2, [sse_six]
+       movups xmm3, [sse_twelve]
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+       movaps [esp + six],  xmm2
+       movaps [esp + twelve], xmm3
+
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       /* charge[ii]   -> O charge */
+       movss xmm4, [edx + ebx*4 + 4]   /* charge[ii+1] -> H charge */
+       movss xmm5, [ebp + facel]       /* facel is read as a float value from the argument slot */
+       mulss  xmm3, xmm5
+       mulss  xmm4, xmm5
+
+       shufps xmm3, xmm3, 0            /* broadcast to all four lanes */
+       shufps xmm4, xmm4, 0
+       movaps [esp + iqO], xmm3
+       movaps [esp + iqH], xmm4
+
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]
+       shl   ecx, 1
+       imul  ecx, [ebp + ntype]      /* ecx = ntia = 2*ntype*type[ii0] */
+       mov   [esp + ntia], ecx
+i1120_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+
+       /* O position = shift vector + pos[ii3 .. ii3+2], broadcast */
+       addss xmm3, [eax + ebx*4]
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       /* H1 (floats 3-5) in xmm0-xmm2, H2 (floats 6-8) in xmm3-xmm5 */
+       movss xmm3, xmm0
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]
+       addss xmm3, [eax + ebx*4 + 24]
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+
+       /* clear vctot, vnbtot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+       /* mov leaves the flags alone, so jge still tests the result of sub edx, 4 */
+       jge   i1120_unroll_loop
+       jmp   i1120_odd_inner
+i1120_unroll_loop:
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]
+       mov   ebx, [edx + 4]
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       shufps xmm3, xmm6, 0
+       shufps xmm4, xmm7, 0
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+       movaps xmm4, xmm3            /* and in xmm4 */
+       mulps  xmm3, [esp + iqO]
+       mulps  xmm4, [esp + iqH]
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       movaps  [esp + qqO], xmm3
+       movaps  [esp + qqH], xmm4
+
+       /* look up the (c6,c12) pair for each j atom: nbfp[ntia + 2*type[j]] */
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1
+       shl ebx, 1
+       shl ecx, 1
+       shl edx, 1
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000   /* even lanes: the four c6 values  */
+       shufps xmm6, xmm7, 0b11011101   /* odd lanes:  the four c12 values */
+
+       movd  eax, mm0          /* restore jnr1-4 */
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]
+
+       /* move four coordinates to xmm0-xmm2 */
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = jz for the four atoms */
+
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = jx for the four atoms */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = jy for the four atoms */
+
+       /* move ixO-izO to xmm4-xmm6 */
+       movaps xmm4, [esp + ixO]
+       movaps xmm5, [esp + iyO]
+       movaps xmm6, [esp + izO]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxO], xmm4
+       movaps [esp + dyO], xmm5
+       movaps [esp + dzO], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       movaps xmm7, xmm4
+       /* rsqO in xmm7 */
+
+       /* move ixH1-izH1 to xmm4-xmm6 */
+       movaps xmm4, [esp + ixH1]
+       movaps xmm5, [esp + iyH1]
+       movaps xmm6, [esp + izH1]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxH1], xmm4
+       movaps [esp + dyH1], xmm5
+       movaps [esp + dzH1], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm6, xmm5
+       addps xmm6, xmm4
+       /* rsqH1 in xmm6 */
+
+       /* move ixH2-izH2 to xmm3-xmm5 */
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+
+       /* calc dr */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+
+       /* store dr */
+       movaps [esp + dxH2], xmm3
+       movaps [esp + dyH2], xmm4
+       movaps [esp + dzH2], xmm5
+       /* square it */
+       mulps xmm3,xmm3
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       addps xmm5, xmm4
+       addps xmm5, xmm3
+       /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+       /* one Newton-Raphson step on the rsqrtps seed: */
+       /* rinv = 0.5*lu*(3 - rsq*lu*lu)                */
+
+       /* start with rsqO - seed in xmm2 */
+       rsqrtps xmm2, xmm7
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm7      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm7, xmm4      /* rinvO in xmm7 */
+       /* rsqH1 - seed in xmm2 */
+       rsqrtps xmm2, xmm6
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm6      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm6, xmm4      /* rinvH1 in xmm6 */
+       /* rsqH2 - seed in xmm2 */
+       rsqrtps xmm2, xmm5
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm5      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm5, xmm4      /* rinvH2 in xmm5 */
+
+       /* do O interactions */
+       movaps  xmm4, xmm7
+       mulps   xmm4, xmm4      /* xmm7=rinv, xmm4=rinvsq */
+       movaps xmm1, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm7, [esp + qqO]        /* xmm7=vcoul */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm3, xmm2
+       subps  xmm3, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm3, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       addps  xmm2, xmm7
+       mulps  xmm4, xmm2       /* total fsO in xmm4 */
+
+       addps  xmm7, [esp + vctot]
+
+       movaps [esp + vnbtot], xmm3
+       movaps [esp + vctot], xmm7
+
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update O forces */
+       movaps xmm3, [esp + fixO]
+       movaps xmm4, [esp + fiyO]
+       movaps xmm7, [esp + fizO]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixO], xmm3
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm7
+       /* update j forces with water O */
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* H1 interactions */
+       movaps  xmm4, xmm6
+       mulps   xmm4, xmm4      /* xmm6=rinv, xmm4=rinvsq */
+       mulps  xmm6, [esp + qqH]        /* xmm6=vcoul */
+       mulps  xmm4, xmm6               /* total fsH1 in xmm4 */
+
+       addps  xmm6, [esp + vctot]
+
+       movaps xmm0, [esp + dxH1]
+       movaps xmm1, [esp + dyH1]
+       movaps xmm2, [esp + dzH1]
+       movaps [esp + vctot], xmm6
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update H1 forces */
+       movaps xmm3, [esp + fixH1]
+       movaps xmm4, [esp + fiyH1]
+       movaps xmm7, [esp + fizH1]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH1], xmm3
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm7
+       /* update j forces with water H1 */
+       addps  xmm0, [esp + fjx]
+       addps  xmm1, [esp + fjy]
+       addps  xmm2, [esp + fjz]
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* H2 interactions */
+       movaps  xmm4, xmm5
+       mulps   xmm4, xmm4      /* xmm5=rinv, xmm4=rinvsq */
+       mulps  xmm5, [esp + qqH]        /* xmm5=vcoul */
+       mulps  xmm4, xmm5               /* total fsH2 in xmm4 */
+
+       addps  xmm5, [esp + vctot]
+
+       movaps xmm0, [esp + dxH2]
+       movaps xmm1, [esp + dyH2]
+       movaps xmm2, [esp + dzH2]
+       movaps [esp + vctot], xmm5
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update H2 forces */
+       movaps xmm3, [esp + fixH2]
+       movaps xmm4, [esp + fiyH2]
+       movaps xmm7, [esp + fizH2]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH2], xmm3
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm7
+
+       mov edi, [ebp +faction]
+       /* update j forces: xmm0-xmm2 = sum of the O, H1 and H2 contributions */
+       addps xmm0, [esp + fjx]
+       addps xmm1, [esp + fjy]
+       addps xmm2, [esp + fjz]
+
+       movlps xmm4, [edi + eax*4]
+       movlps xmm7, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm7, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm7, 0b10001000
+       shufps xmm4, xmm7, 0b11011101
+       /* xmm3 has fjx, xmm4 has fjy */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       /* unpack them back for storing */
+       movaps xmm7, xmm3
+       unpcklps xmm7, xmm4
+       unpckhps xmm3, xmm4
+       movlps [edi + eax*4], xmm7
+       movlps [edi + ecx*4], xmm3
+       movhps [edi + ebx*4], xmm7
+       movhps [edi + edx*4], xmm3
+       /* finally z forces; each shufps brings the next lane of xmm2 to pos 0 */
+       movss  xmm0, [edi + eax*4 + 8]
+       movss  xmm1, [edi + ebx*4 + 8]
+       movss  xmm3, [edi + ecx*4 + 8]
+       movss  xmm4, [edi + edx*4 + 8]
+       subss  xmm0, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm1, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm3, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm4, xmm2
+       movss  [edi + eax*4 + 8], xmm0
+       movss  [edi + ebx*4 + 8], xmm1
+       movss  [edi + ecx*4 + 8], xmm3
+       movss  [edi + edx*4 + 8], xmm4
+
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4
+       jl    i1120_odd_inner
+       jmp   i1120_unroll_loop
+i1120_odd_inner:
+       /* undo the bias of 4: innerk = number of leftover j atoms (0-3) */
+       add   dword ptr [esp + innerk],  4
+       jnz   i1120_odd_loop
+       jmp   i1120_updateouterdata
+i1120_odd_loop:
+       /* one j atom per pass; the lanes hold (O, unused, H1, H2) */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]
+       add   dword ptr [esp + innerjjnr],  4
+
+       xorps xmm4, xmm4
+       movss xmm4, [esp + iqO]
+       mov esi, [ebp + charge]
+       movhps xmm4, [esp + iqH]        /* xmm4 = (iqO, 0, iqH, iqH) */
+       movss xmm3, [esi + eax*4]       /* charge in xmm3 */
+       shufps xmm3, xmm3, 0
+       mulps xmm3, xmm4
+       movaps [esp + qqO], xmm3        /* use oxygen qq for storage */
+
+       xorps xmm6, xmm6
+       mov esi, [ebp + type]
+       mov ebx, [esi + eax*4]
+       mov esi, [ebp + nbfp]
+       shl ebx, 1
+       add ebx, [esp + ntia]
+       movlps xmm6, [esi + ebx*4]
+       movaps xmm7, xmm6
+       shufps xmm6, xmm6, 0b11111100   /* (c6, 0, 0, 0)  */
+       shufps xmm7, xmm7, 0b11111101   /* (c12, 0, 0, 0) */
+       movaps [esp + c6], xmm6
+       movaps [esp + c12], xmm7
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]
+
+       /* move j coords to xmm0-xmm2 */
+       movss xmm0, [esi + eax*4]
+       movss xmm1, [esi + eax*4 + 4]
+       movss xmm2, [esi + eax*4 + 8]
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       /* build (iO, 0, iH1, iH2) for x, y and z in xmm3-xmm5 */
+       movss xmm3, [esp + ixO]
+       movss xmm4, [esp + iyO]
+       movss xmm5, [esp + izO]
+
+       movlps xmm6, [esp + ixH1]
+       movlps xmm7, [esp + ixH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm3, xmm6
+       movlps xmm6, [esp + iyH1]
+       movlps xmm7, [esp + iyH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm4, xmm6
+       movlps xmm6, [esp + izH1]
+       movlps xmm7, [esp + izH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm5, xmm6
+
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+
+       movaps [esp + dxO], xmm3
+       movaps [esp + dyO], xmm4
+       movaps [esp + dzO], xmm5
+
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       /* the LJ terms use scalar (ss) ops: only lane 0, the O site, has LJ */
+       movaps xmm1, xmm4
+       mulss  xmm1, xmm4
+       movaps xmm3, [esp + qqO]
+       mulss  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulss  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm3, xmm0       /* xmm3=vcoul */
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subss  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulss  xmm1, [esp + six]
+       mulss  xmm2, [esp + twelve]
+       subss  xmm2, xmm1
+       addps  xmm2, xmm3
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       addps  xmm3, [esp + vctot]
+
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+
+       movaps [esp + vctot], xmm3
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       movss  xmm3, [esp + fixO]
+       movss  xmm4, [esp + fiyO]
+       movss  xmm5, [esp + fizO]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esp + fixO], xmm3
+       movss  [esp + fiyO], xmm4
+       movss  [esp + fizO], xmm5       /* updated the O force now do the H's */
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       shufps xmm3, xmm3, 0b11100110   /* shift right: lane 2 (H1) to pos 0 */
+       shufps xmm4, xmm4, 0b11100110
+       shufps xmm5, xmm5, 0b11100110
+       addss  xmm3, [esp + fixH1]
+       addss  xmm4, [esp + fiyH1]
+       addss  xmm5, [esp + fizH1]
+       movss  [esp + fixH1], xmm3
+       movss  [esp + fiyH1], xmm4
+       movss  [esp + fizH1], xmm5      /* updated the H1 force */
+
+       mov edi, [ebp + faction]
+       shufps xmm3, xmm3, 0b11100111   /* shift right: lane 3 (H2) to pos 0 */
+       shufps xmm4, xmm4, 0b11100111
+       shufps xmm5, xmm5, 0b11100111
+       addss  xmm3, [esp + fixH2]
+       addss  xmm4, [esp + fiyH2]
+       addss  xmm5, [esp + fizH2]
+       movss  [esp + fixH2], xmm3
+       movss  [esp + fiyH2], xmm4
+       movss  [esp + fizH2], xmm5      /* updated the H2 force */
+
+       /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+       xorps  xmm5, xmm5
+       movaps xmm3, xmm0
+       movlps xmm6, [edi + eax*4]
+       movss  xmm7, [edi + eax*4 + 8]
+       unpcklps xmm3, xmm1
+       movlhps  xmm3, xmm5     /* zero the unused lane-1 pair */
+       unpckhps xmm0, xmm1
+       addps    xmm0, xmm3
+       movhlps  xmm3, xmm0
+       addps    xmm0, xmm3     /* x,y sum in xmm0 */
+
+       movhlps  xmm1, xmm2
+       addss    xmm2, xmm1
+       shufps   xmm1, xmm1, 1
+       addss    xmm2, xmm1    /* z sum in xmm2 */
+       subps    xmm6, xmm0
+       subss    xmm7, xmm2
+
+       movlps [edi + eax*4],     xmm6
+       movss  [edi + eax*4 + 8], xmm7
+
+       dec dword ptr [esp + innerk]
+       jz    i1120_updateouterdata
+       jmp   i1120_odd_loop
+i1120_updateouterdata:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate  Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO]
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift: xmm6 = (fx, fy, ..), xmm7 = fz */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000
+
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000
+       addps   xmm6, xmm0
+
+       /* increment fshift force */
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       /* get group index for this i particle */
+       mov   edx, [ebp + gid]
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4   /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz i1120_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp i1120_outer
+i1120_end:
+       emms
+       /* undo the alignment adjustment and the local frame, then restore registers */
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp, 696
+       pop edi
+       pop esi
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+
+
+       
+.globl inl1130_sse
+       .type inl1130_sse,@function
+inl1130_sse:   
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68
+.equ           Vnb,            72
+       /* stack offsets for local variables */ 
+       /* bottom of stack is 16-byte aligned for sse (movaps) use */
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           jxO,            144
+.equ           jyO,            160
+.equ           jzO,            176
+.equ           jxH1,           192
+.equ           jyH1,           208
+.equ           jzH1,           224
+.equ           jxH2,           240
+.equ           jyH2,           256
+.equ           jzH2,           272
+.equ           dxOO,           288
+.equ           dyOO,           304
+.equ           dzOO,           320     
+.equ           dxOH1,          336
+.equ           dyOH1,          352
+.equ           dzOH1,          368     
+.equ           dxOH2,          384
+.equ           dyOH2,          400
+.equ           dzOH2,          416     
+.equ           dxH1O,          432
+.equ           dyH1O,          448
+.equ           dzH1O,          464     
+.equ           dxH1H1,         480
+.equ           dyH1H1,         496
+.equ           dzH1H1,         512     
+.equ           dxH1H2,         528
+.equ           dyH1H2,         544
+.equ           dzH1H2,         560     
+.equ           dxH2O,          576
+.equ           dyH2O,          592
+.equ           dzH2O,          608     
+.equ           dxH2H1,         624
+.equ           dyH2H1,         640
+.equ           dzH2H1,         656     
+.equ           dxH2H2,         672
+.equ           dyH2H2,         688
+.equ           dzH2H2,         704
+.equ           qqOO,           720
+.equ           qqOH,           736
+.equ           qqHH,           752
+.equ           c6,             768
+.equ           c12,            784
+.equ           six,            800
+.equ           twelve,         816              
+.equ           vctot,          832
+.equ           vnbtot,         848
+.equ           fixO,           864
+.equ           fiyO,           880
+.equ           fizO,           896
+.equ           fixH1,          912
+.equ           fiyH1,          928
+.equ           fizH1,          944
+.equ           fixH2,          960
+.equ           fiyH2,          976
+.equ           fizH2,          992
+.equ           fjxO,          1008
+.equ           fjyO,          1024
+.equ           fjzO,          1040
+.equ           fjxH1,         1056
+.equ           fjyH1,         1072
+.equ           fjzH1,         1088
+.equ           fjxH2,         1104
+.equ           fjyH2,         1120
+.equ           fjzH2,         1136
+.equ           half,          1152
+.equ           three,         1168
+.equ           rsqOO,         1184
+.equ           rsqOH1,        1200
+.equ           rsqOH2,        1216
+.equ           rsqH1O,        1232
+.equ           rsqH1H1,       1248
+.equ           rsqH1H2,       1264
+.equ           rsqH2O,        1280
+.equ           rsqH2H1,       1296
+.equ           rsqH2H2,       1312
+.equ           rinvOO,        1328
+.equ           rinvOH1,       1344
+.equ           rinvOH2,       1360
+.equ           rinvH1O,       1376
+.equ           rinvH1H1,      1392
+.equ           rinvH1H2,      1408
+.equ           rinvH2O,       1424
+.equ           rinvH2H1,      1440
+.equ           rinvH2H2,      1456
+.equ           is3,           1472
+.equ           ii3,           1476
+.equ           innerjjnr,     1480
+.equ           innerk,        1484
+.equ           salign,        1488                                                     
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 1492           /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movups xmm2, [sse_six]
+       movups xmm3, [sse_twelve]
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+       movaps [esp + six],  xmm2
+       movaps [esp + twelve], xmm3
+
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       movss xmm4, xmm3        
+       movss xmm5, [edx + ebx*4 + 4]   
+       movss xmm6, [ebp + facel]
+       mulss  xmm3, xmm3
+       mulss  xmm4, xmm5
+       mulss  xmm5, xmm5
+       mulss  xmm3, xmm6
+       mulss  xmm4, xmm6
+       mulss  xmm5, xmm6
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + qqOO], xmm3
+       movaps [esp + qqOH], xmm4
+       movaps [esp + qqHH], xmm5
+               
+       xorps xmm0, xmm0
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]
+       shl   ecx, 1
+       mov   edx, ecx
+       imul  ecx, [ebp + ntype]      /* ecx = ntia = 2*ntype*type[ii0] */
+       add   edx, ecx
+       mov   eax, [ebp + nbfp]
+       movlps xmm0, [eax + edx*4] 
+       movaps xmm1, xmm0
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0b01010101
+       movaps [esp + c6], xmm0
+       movaps [esp + c12], xmm1
+
+i1130_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx  
+       
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       addss xmm3, [eax + ebx*4]
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]           
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       movss xmm3, xmm0
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]          
+       addss xmm3, [eax + ebx*4 + 24]
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]          
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+
+       /* clear vctot, vnbtot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* innerk = number of innerloop atoms - 4 */
+       jge   i1130_unroll_loop
+       jmp   i1130_single_check
+i1130_unroll_loop:     
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4] 
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+       
+       /* move j coordinates to local temp variables */
+       movlps xmm2, [esi + eax*4]
+       movlps xmm3, [esi + eax*4 + 12]
+       movlps xmm4, [esi + eax*4 + 24]
+
+       movlps xmm5, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 12]
+       movlps xmm7, [esi + ebx*4 + 24]
+
+       movhps xmm2, [esi + ecx*4]
+       movhps xmm3, [esi + ecx*4 + 12]
+       movhps xmm4, [esi + ecx*4 + 24]
+
+       movhps xmm5, [esi + edx*4]
+       movhps xmm6, [esi + edx*4 + 12]
+       movhps xmm7, [esi + edx*4 + 24]
+
+       /* current state: */    
+       /* xmm2= jxOa  jyOa  jxOc  jyOc */
+       /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+       /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+       /* xmm5= jxOb  jyOb  jxOd  jyOd */
+       /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+       /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+       
+       movaps xmm0, xmm2
+       movaps xmm1, xmm3
+       unpcklps xmm0, xmm5     /* xmm0= jxOa  jxOb  jyOa  jyOb */
+       unpcklps xmm1, xmm6     /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+       unpckhps xmm2, xmm5     /* xmm2= jxOc  jxOd  jyOc  jyOd */
+       unpckhps xmm3, xmm6     /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+       movaps xmm5, xmm4
+       movaps   xmm6, xmm0
+       unpcklps xmm4, xmm7     /* xmm4= jxH2a jxH2b jyH2a jyH2b */             
+       unpckhps xmm5, xmm7     /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+       movaps   xmm7, xmm1
+       movlhps  xmm0, xmm2     /* xmm0= jxOa  jxOb  jxOc  jxOd */
+       movaps [esp + jxO], xmm0
+       movhlps  xmm2, xmm6     /* xmm2= jyOa  jyOb  jyOc  jyOd */
+       movaps [esp + jyO], xmm2
+       movlhps  xmm1, xmm3
+       movaps [esp + jxH1], xmm1
+       movhlps  xmm3, xmm7
+       movaps   xmm6, xmm4
+       movaps [esp + jyH1], xmm3
+       movlhps  xmm4, xmm5
+       movaps [esp + jxH2], xmm4
+       movhlps  xmm5, xmm6
+       movaps [esp + jyH2], xmm5
+
+       movss  xmm0, [esi + eax*4 + 8]
+       movss  xmm1, [esi + eax*4 + 20]
+       movss  xmm2, [esi + eax*4 + 32]
+
+       movss  xmm3, [esi + ecx*4 + 8]
+       movss  xmm4, [esi + ecx*4 + 20]
+       movss  xmm5, [esi + ecx*4 + 32]
+
+       movhps xmm0, [esi + ebx*4 + 4]
+       movhps xmm1, [esi + ebx*4 + 16]
+       movhps xmm2, [esi + ebx*4 + 28]
+       
+       movhps xmm3, [esi + edx*4 + 4]
+       movhps xmm4, [esi + edx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 28]
+       
+       shufps xmm0, xmm3, 0b11001100
+       shufps xmm1, xmm4, 0b11001100
+       shufps xmm2, xmm5, 0b11001100
+       movaps [esp + jzO],  xmm0
+       movaps [esp + jzH1],  xmm1
+       movaps [esp + jzH2],  xmm2
+
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixO]
+       movaps xmm4, [esp + iyO]
+       movaps xmm5, [esp + izO]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxOH1], xmm3
+       movaps [esp + dyOH1], xmm4
+       movaps [esp + dzOH1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOO], xmm0
+       movaps [esp + rsqOH1], xmm3
+
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       subps  xmm3, [esp + jxO]
+       subps  xmm4, [esp + jyO]
+       subps  xmm5, [esp + jzO]
+       movaps [esp + dxOH2], xmm0
+       movaps [esp + dyOH2], xmm1
+       movaps [esp + dzOH2], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1O], xmm3
+       movaps [esp + dyH1O], xmm4
+       movaps [esp + dzH1O], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOH2], xmm0
+       movaps [esp + rsqH1O], xmm3
+
+       movaps xmm0, [esp + ixH1]
+       movaps xmm1, [esp + iyH1]
+       movaps xmm2, [esp + izH1]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH1]
+       subps  xmm1, [esp + jyH1]
+       subps  xmm2, [esp + jzH1]
+       subps  xmm3, [esp + jxH2]
+       subps  xmm4, [esp + jyH2]
+       subps  xmm5, [esp + jzH2]
+       movaps [esp + dxH1H1], xmm0
+       movaps [esp + dyH1H1], xmm1
+       movaps [esp + dzH1H1], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1H2], xmm3
+       movaps [esp + dyH1H2], xmm4
+       movaps [esp + dzH1H2], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqH1H1], xmm0
+       movaps [esp + rsqH1H2], xmm3
+
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxH2O], xmm0
+       movaps [esp + dyH2O], xmm1
+       movaps [esp + dzH2O], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH2H1], xmm3
+       movaps [esp + dyH2H1], xmm4
+       movaps [esp + dzH2H1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       movaps [esp + rsqH2O], xmm0
+       movaps [esp + rsqH2H1], xmm4
+
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       movaps [esp + dxH2H2], xmm0
+       movaps [esp + dyH2H2], xmm1
+       movaps [esp + dzH2H2], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2
+       movaps [esp + rsqH2H2], xmm0
+               
+       /* start doing invsqrt - use the rsq values still in xmm0 (rsqH2H2) and xmm4 (rsqH2H1) */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvH2H2 */
+       mulps   xmm7, [esp + half] /* rinvH2H1 */
+       movaps  [esp + rinvH2H2], xmm3
+       movaps  [esp + rinvH2H1], xmm7
+       
+       rsqrtps xmm1, [esp + rsqOO]
+       rsqrtps xmm5, [esp + rsqOH1]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOO]
+       mulps   xmm5, [esp + rsqOH1]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvOO], xmm3
+       movaps  [esp + rinvOH1], xmm7
+       
+       rsqrtps xmm1, [esp + rsqOH2]
+       rsqrtps xmm5, [esp + rsqH1O]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOH2]
+       mulps   xmm5, [esp + rsqH1O]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvOH2], xmm3
+       movaps  [esp + rinvH1O], xmm7
+       
+       rsqrtps xmm1, [esp + rsqH1H1]
+       rsqrtps xmm5, [esp + rsqH1H2]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqH1H1]
+       mulps   xmm5, [esp + rsqH1H2]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvH1H1], xmm3
+       movaps  [esp + rinvH1H2], xmm7
+       
+       rsqrtps xmm1, [esp + rsqH2O]
+       movaps  xmm2, xmm1
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, [esp + rsqH2O]
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2
+       mulps   xmm3, [esp + half] 
+       movaps  [esp + rinvH2O], xmm3
+
+       /* start with OO interaction */
+       movaps xmm0, [esp + rinvOO]
+       movaps xmm7, xmm0
+       mulps  xmm0, xmm0
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       mulps  xmm7, [esp + qqOO]
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm1, [esp + c6] 
+       mulps  xmm2, [esp + c12]        
+       movaps xmm3, xmm2
+       subps  xmm3, xmm1       /* xmm3=vnb12-vnb6 */
+       addps  xmm3, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       movaps [esp + vnbtot], xmm3
+       subps  xmm2, xmm1
+       addps  xmm2, xmm7
+       addps  xmm7, [esp + vctot]
+       mulps  xmm0, xmm2       
+ 
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOO]
+       mulps xmm1, [esp + dyOO]
+       mulps xmm2, [esp + dzOO]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H1 interaction */
+       movaps xmm0, [esp + rinvOH1]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqOH]
+       mulps xmm0, xmm1        /* fsOH1  */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH1]
+       mulps xmm1, [esp + dyOH1]
+       mulps xmm2, [esp + dzOH1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H2 interaction */ 
+       movaps xmm0, [esp + rinvOH2]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqOH]
+       mulps xmm0, xmm1        /* fsOH2 */ 
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH2]
+       mulps xmm1, [esp + dyOH2]
+       mulps xmm2, [esp + dzOH2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* H1-O interaction */
+       movaps xmm0, [esp + rinvH1O]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqOH]
+       mulps xmm0, xmm1        /* fsH1O */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH1O]
+       mulps xmm1, [esp + dyH1O]
+       mulps xmm2, [esp + dzH1O]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H1 interaction */
+       movaps xmm0, [esp + rinvH1H1]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqHH]
+       mulps xmm0, xmm1        /* fsH1H1 */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH1H1]
+       mulps xmm1, [esp + dyH1H1]
+       mulps xmm2, [esp + dzH1H1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H2 interaction */
+       movaps xmm0, [esp + rinvH1H2]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqHH]
+       mulps xmm0, xmm1        /* fsH1H2 */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH1H2]
+       mulps xmm1, [esp + dyH1H2]
+       mulps xmm2, [esp + dzH1H2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H2-O interaction */
+       movaps xmm0, [esp + rinvH2O]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqOH]
+       mulps xmm0, xmm1        /* fsH2O */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH2O]
+       mulps xmm1, [esp + dyH2O]
+       mulps xmm2, [esp + dzH2O]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H1 interaction */
+       movaps xmm0, [esp + rinvH2H1]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqHH]
+       mulps xmm0, xmm1        /* fsH2H1 */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH2H1]
+       mulps xmm1, [esp + dyH2H1]
+       mulps xmm2, [esp + dzH2H1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H2 interaction */
+       movaps xmm0, [esp + rinvH2H2]
+       movaps xmm1, xmm0
+       mulps xmm0, xmm0
+       mulps xmm1, [esp + qqHH]
+       mulps xmm0, xmm1        /* fsH2H2 */
+       addps xmm7, xmm1        /* add to local vctot */
+       movaps xmm1, xmm0
+       movaps [esp + vctot], xmm7
+       movaps xmm2, xmm0
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH2H2]
+       mulps xmm1, [esp + dyH2H2]
+       mulps xmm2, [esp + dzH2H2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       mov edi, [ebp +faction]
+               
+       /* Did all interactions - now update j forces */
+       /* 4 j waters with three atoms each - first do a & b j particles */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpcklps xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjxOb  fjyOb */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOb  fjyOb */ 
+       unpcklps xmm1, xmm2        /* xmm1= fjzOa  fjxH1a fjzOb  fjxH1b */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpcklps xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1b fjzH1b */
+       unpcklps xmm5, xmm6        /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjzOa  fjxH1a */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOb  fjyOb  fjzOb  fjxH1b */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+       movups   xmm1, [edi + eax*4]
+       movups   xmm2, [edi + eax*4 + 16]
+       movups   xmm5, [edi + ebx*4]
+       movups   xmm6, [edi + ebx*4 + 16]
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + eax*4 + 32]
+       movss    xmm3, [edi + ebx*4 + 32]
+       
+       movaps   xmm4, [esp + fjzH2]
+       movaps   xmm7, xmm4
+       shufps   xmm7, xmm7, 1
+       
+       movups   [edi + eax*4],     xmm1
+       movups   [edi + eax*4 + 16],xmm2
+       movups   [edi + ebx*4],     xmm5
+       movups   [edi + ebx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + eax*4 + 32], xmm0
+       movss    [edi + ebx*4 + 32], xmm3       
+
+       /* then do the second pair (c & d) */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpckhps xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjxOd  fjyOd */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOd  fjyOd */ 
+       unpckhps xmm1, xmm2        /* xmm1= fjzOc  fjxH1c fjzOd  fjxH1d */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpckhps xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1d fjzH1d */     
+       unpckhps xmm5, xmm6        /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjzOc  fjxH1c */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOd  fjyOd  fjzOd  fjxH1d */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c  */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+       movups   xmm1, [edi + ecx*4]
+       movups   xmm2, [edi + ecx*4 + 16]
+       movups   xmm5, [edi + edx*4]
+       movups   xmm6, [edi + edx*4 + 16]
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + ecx*4 + 32]
+       movss    xmm3, [edi + edx*4 + 32]
+       
+       movaps   xmm4, [esp + fjzH2]
+       movaps   xmm7, xmm4
+       shufps   xmm4, xmm4, 0b10
+       shufps   xmm7, xmm7, 0b11
+       movups   [edi + ecx*4],     xmm1
+       movups   [edi + ecx*4 + 16],xmm2
+       movups   [edi + edx*4],     xmm5
+       movups   [edi + edx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + ecx*4 + 32], xmm0
+       movss    [edi + edx*4 + 32], xmm3       
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  4
+       jl    i1130_single_check
+       jmp   i1130_unroll_loop
+i1130_single_check:
+       add   [esp + innerk],  4
+       jnz   i1130_single_loop
+       jmp   i1130_updateouterdata
+i1130_single_loop:
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       add   [esp + innerjjnr],  4     
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]  
+
+       /* fetch j coordinates */
+       xorps xmm3, xmm3
+       xorps xmm4, xmm4
+       xorps xmm5, xmm5
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + eax*4 + 4]
+       movss xmm5, [esi + eax*4 + 8]
+
+       movlps xmm6, [esi + eax*4 + 12]
+       movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+       /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+       movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+       shufps xmm6, xmm6, 0b11011000    /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+       movlhps xmm3, xmm6              /* xmm3= jxO   0  jxH1 jxH2 */
+       movaps  xmm0, [esp + ixO]     
+       movaps  xmm1, [esp + iyO]
+       movaps  xmm2, [esp + izO]       
+       shufps  xmm4, xmm6, 0b11100100 /* xmm4= jyO   0   jyH1 jyH2 */
+       shufps xmm5, xmm7, 0b11000100  /* xmm5= jzO   0   jzH1 jzH2 */
+       /* store all j coordinates in jO */ 
+       movaps [esp + jxO], xmm3
+       movaps [esp + jyO], xmm4
+       movaps [esp + jzO], xmm5
+       subps  xmm0, xmm3
+       subps  xmm1, xmm4
+       subps  xmm2, xmm5
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2        /* have rsq in xmm0 */
+       
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       movaps  xmm2, xmm1      
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, xmm0
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2                                                      
+       mulps   xmm3, [esp + half] /* rinv iO - j water */
+
+       xorps   xmm1, xmm1
+       movaps  xmm0, xmm3
+       xorps   xmm4, xmm4
+       mulps   xmm0, xmm0      /* xmm0=rinvsq */
+       /* fetch charges to xmm4 (temporary) */
+       movss   xmm4, [esp + qqOO]
+       movss   xmm1, xmm0
+       movhps  xmm4, [esp + qqOH]
+       mulss   xmm1, xmm0
+       mulps   xmm3, xmm4      /* xmm3=vcoul */
+       mulss   xmm1, xmm0      /* xmm1(0)=rinvsix */
+       movaps  xmm2, xmm1      /* zero everything else in xmm2 */
+       mulss   xmm2, xmm2      /* xmm2=rinvtwelve */
+
+       mulss   xmm1, [esp + c6]
+       mulss   xmm2, [esp + c12]
+       movaps  xmm4, xmm2
+       subss   xmm4, xmm1      /* vnbtot=vnb12-vnb6 */
+       addps   xmm4, [esp + vnbtot]
+       mulss   xmm1, [esp + six]
+       mulss   xmm2, [esp + twelve]    
+       movaps  [esp + vnbtot], xmm4
+       subss   xmm2, xmm1      /* fsD+fsR */
+       addps   xmm2, xmm3      /* fsC+fsD+fsR */
+
+       addps   xmm3, [esp + vctot]
+       mulps   xmm0, xmm2      /* total fscal */
+       movaps  [esp + vctot], xmm3     
+
+       movaps  xmm1, xmm0
+       movaps  xmm2, xmm0
+       mulps   xmm0, [esp + dxOO]
+       mulps   xmm1, [esp + dyOO]
+       mulps   xmm2, [esp + dzOO]
+       /* initial update for j forces */
+       xorps   xmm3, xmm3
+       xorps   xmm4, xmm4
+       xorps   xmm5, xmm5
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixO]
+       addps   xmm1, [esp + fiyO]
+       addps   xmm2, [esp + fizO]
+       movaps  [esp + fixO], xmm0
+       movaps  [esp + fiyO], xmm1
+       movaps  [esp + fizO], xmm2
+
+       
+       /* done with i O Now do i H1 & H2 simultaneously first get i particle coords: */
+       movaps  xmm0, [esp + ixH1]
+       movaps  xmm1, [esp + iyH1]
+       movaps  xmm2, [esp + izH1]      
+       movaps  xmm3, [esp + ixH2] 
+       movaps  xmm4, [esp + iyH2] 
+       movaps  xmm5, [esp + izH2] 
+       subps   xmm0, [esp + jxO]
+       subps   xmm1, [esp + jyO]
+       subps   xmm2, [esp + jzO]
+       subps   xmm3, [esp + jxO]
+       subps   xmm4, [esp + jyO]
+       subps   xmm5, [esp + jzO]
+       movaps [esp + dxH1O], xmm0
+       movaps [esp + dyH1O], xmm1
+       movaps [esp + dzH1O], xmm2
+       movaps [esp + dxH2O], xmm3
+       movaps [esp + dyH2O], xmm4
+       movaps [esp + dzH2O], xmm5
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       mulps xmm3, xmm3
+       mulps xmm4, xmm4
+       mulps xmm5, xmm5
+       addps xmm0, xmm1
+       addps xmm4, xmm3
+       addps xmm0, xmm2        /* have rsqH1 in xmm0 */
+       addps xmm4, xmm5        /* have rsqH2 in xmm4 */
+
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1   /* do coulomb interaction */
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinv H1 - j water */
+       mulps   xmm7, [esp + half] /* rinv H2 - j water */ 
+
+       /* assemble charges in xmm6 */
+       xorps   xmm6, xmm6
+       /* do coulomb interaction */
+       movaps  xmm0, xmm3
+       movss   xmm6, [esp + qqOH]
+       movaps  xmm4, xmm7
+       movhps  xmm6, [esp + qqHH]
+       mulps   xmm0, xmm0      /* rinvsq */
+       mulps   xmm4, xmm4      /* rinvsq */
+       mulps   xmm3, xmm6      /* vcoul */
+       mulps   xmm7, xmm6      /* vcoul */
+       movaps  xmm2, xmm3
+       addps   xmm2, xmm7      /* total vcoul */
+       mulps   xmm0, xmm3      /* fscal */
+       
+       addps   xmm2, [esp + vctot]
+       mulps   xmm7, xmm4      /* fscal */
+       movaps  [esp + vctot], xmm2
+       movaps  xmm1, xmm0
+       movaps  xmm2, xmm0
+       mulps   xmm0, [esp + dxH1O]
+       mulps   xmm1, [esp + dyH1O]
+       mulps   xmm2, [esp + dzH1O]
+       /* update forces H1 - j water */
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH1]
+       addps   xmm1, [esp + fiyH1]
+       addps   xmm2, [esp + fizH1]
+       movaps  [esp + fixH1], xmm0
+       movaps  [esp + fiyH1], xmm1
+       movaps  [esp + fizH1], xmm2
+       /* do forces H2 - j water */
+       movaps xmm0, xmm7
+       movaps xmm1, xmm7
+       movaps xmm2, xmm7
+       mulps   xmm0, [esp + dxH2O]
+       mulps   xmm1, [esp + dyH2O]
+       mulps   xmm2, [esp + dzH2O]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       mov     esi, [ebp + faction]
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH2]
+       addps   xmm1, [esp + fiyH2]
+       addps   xmm2, [esp + fizH2]
+       movaps  [esp + fixH2], xmm0
+       movaps  [esp + fiyH2], xmm1
+       movaps  [esp + fizH2], xmm2
+
+       /* update j water forces from local variables */
+       movlps  xmm0, [esi + eax*4]
+       movlps  xmm1, [esi + eax*4 + 12]
+       movhps  xmm1, [esi + eax*4 + 24]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       movaps  xmm6, xmm5
+       movaps  xmm7, xmm5
+       shufps  xmm6, xmm6, 0b10
+       shufps  xmm7, xmm7, 0b11
+       addss   xmm5, [esi + eax*4 + 8]
+       addss   xmm6, [esi + eax*4 + 20]
+       addss   xmm7, [esi + eax*4 + 32]
+       movss   [esi + eax*4 + 8], xmm5
+       movss   [esi + eax*4 + 20], xmm6
+       movss   [esi + eax*4 + 32], xmm7
+       movaps   xmm5, xmm3
+       unpcklps xmm3, xmm4
+       unpckhps xmm5, xmm4
+       addps    xmm0, xmm3
+       addps    xmm1, xmm5
+       movlps  [esi + eax*4], xmm0 
+       movlps  [esi + eax*4 + 12], xmm1 
+       movhps  [esi + eax*4 + 24], xmm1 
+       
+       dec dword ptr [esp + innerk]
+       jz    i1130_updateouterdata
+       jmp   i1130_single_loop
+i1130_updateouterdata:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate  Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO] 
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000      
+
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* increment fshift force */ 
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz i1130_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp i1130_outer
+i1130_end:
+       emms
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp, 1492
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+.globl inl2120_sse
+       .type inl2120_sse,@function
+inl2120_sse:
+       /*
+        * inl2120_sse - entry point and one-time setup.
+        *
+        * Arguments are read from the caller's stack relative to ebp (first
+        * .equ table below). All working data lives in a 16-byte aligned
+        * local frame addressed relative to esp (second .equ table below).
+        *
+        * This section:
+        *   - saves eax, ebx, ecx, edx, esi, edi and builds the aligned frame
+        *   - copies sse_half/three/six/twelve/two into the frame
+        *   - broadcasts the scalar krf and crf arguments to all four lanes
+        *   - stores facel*charge[ii] and facel*charge[ii+1] for the first
+        *     i particle, broadcast, in iqO and iqH
+        *   - stores ntia = 2*ntype*type[ii] for the first i particle
+        * and then falls through to the outer loop that follows.
+        */
+
+       /* offsets of the stack arguments, relative to ebp */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+       /*
+        * The krf/crf arguments use distinct names here. The local-variable
+        * table below also defines krf and crf (as esp-relative slots), and a
+        * later .equ replaces the earlier value, so using the same names for
+        * the arguments would make [ebp + krf] resolve to the local offset.
+        */
+.equ           argkrf,         60
+.equ           argcrf,         64
+.equ           type,           68
+.equ           ntype,          72
+.equ           nbfp,           76
+.equ           Vnb,            80
+       /* stack offsets for local variables, relative to the aligned esp */
+       /* the frame base is aligned to 16 bytes so movaps can be used */
+.equ           ixO,             0
+.equ           iyO,            16
+.equ           izO,            32
+.equ           ixH1,           48
+.equ           iyH1,           64
+.equ           izH1,           80
+.equ           ixH2,           96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           iqO,            144
+.equ           iqH,            160
+.equ           dxO,            176
+.equ           dyO,            192
+.equ           dzO,            208
+.equ           dxH1,           224
+.equ           dyH1,           240
+.equ           dzH1,           256
+.equ           dxH2,           272
+.equ           dyH2,           288
+.equ           dzH2,           304
+.equ           qqO,            320
+.equ           qqH,            336
+.equ           c6,             352
+.equ           c12,            368
+.equ           six,            384
+.equ           twelve,         400
+.equ           vctot,          416
+.equ           vnbtot,         432
+.equ           fixO,           448
+.equ           fiyO,           464
+.equ           fizO,           480
+.equ           fixH1,          496
+.equ           fiyH1,          512
+.equ           fizH1,          528
+.equ           fixH2,          544
+.equ           fiyH2,          560
+.equ           fizH2,          576
+.equ           fjx,            592
+.equ           fjy,            608
+.equ           fjz,            624
+.equ           half,           640
+.equ           three,          656
+.equ           two,            672
+.equ           krf,            688     /* broadcast copy of the krf argument */
+.equ           crf,            704     /* broadcast copy of the crf argument */
+.equ           krsqO,          720
+.equ           krsqH1,         736
+.equ           krsqH2,         752
+       /* 4-byte scalar slots */
+.equ           is3,            768
+.equ           ii3,            772
+.equ           ntia,           776
+.equ           innerjjnr,      780
+.equ           innerk,         784
+.equ           salign,         788
+       push ebp
+       mov ebp,esp
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub esp, 792            /* local stack space */
+       /* round esp down to a 16-byte boundary and remember the adjustment */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       /* the global constants are loaded unaligned; the frame copies are aligned */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movups xmm2, [sse_six]
+       movups xmm3, [sse_twelve]
+       movups xmm4, [sse_two]
+       movss xmm5, [ebp + argkrf]      /* scalar krf argument */
+       movss xmm6, [ebp + argcrf]      /* scalar crf argument */
+
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+       movaps [esp + six],  xmm2
+       movaps [esp + twelve], xmm3
+       movaps [esp + two], xmm4
+       shufps xmm5, xmm5, 0            /* broadcast krf to all four lanes */
+       shufps xmm6, xmm6, 0            /* broadcast crf to all four lanes */
+       movaps [esp + krf], xmm5
+       movaps [esp + crf], xmm6
+
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       /* charge[ii]   -> scaled into iqO */
+       movss xmm4, [edx + ebx*4 + 4]   /* charge[ii+1] -> scaled into iqH */
+       movss xmm5, [ebp + facel]
+       mulss  xmm3, xmm5
+       mulss  xmm4, xmm5
+
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       movaps [esp + iqO], xmm3
+       movaps [esp + iqH], xmm4
+
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]
+       shl   ecx, 1
+       imul  ecx, [ebp + ntype]      /* ecx = ntia = 2*ntype*type[ii0] */
+       mov   [esp + ntia], ecx
+.i2120_outer:
+       /*
+        * Setup for one i particle (three sites: O, H1, H2).
+        * Reads the next entries of shift[], iinr[] and jindex[] and advances
+        * those argument pointers in place. Stores the shifted i coordinates,
+        * broadcast to all four lanes, in ixO..izH2, zeroes the energy and
+        * i-force accumulators, and sets innerjjnr/innerk for the inner loops.
+        * NOTE(review): iqO, iqH and ntia are not refreshed in this section;
+        * presumably every i particle shares the first one's charges and
+        * type - confirm against the caller.
+        */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       /* xmm0-xmm2 = x, y, z of the shift vector (lane 0 only) */
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       /* keep the shift vector in xmm0-xmm2; work on copies in xmm3-xmm5 */
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx
+
+       /* O site: pos[ii3..ii3+2] + shift, broadcast to all four lanes */
+       addss xmm3, [eax + ebx*4]
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]           
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       /* register-to-register movss replaces lane 0 only; that is enough */
+       /* here because only lane 0 is used before the broadcasts below */
+       movss xmm3, xmm0
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       /* H1 site at byte offsets 12..20, H2 site at 24..32 from pos[ii3] */
+       addss xmm0, [eax + ebx*4 + 12]
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]          
+       addss xmm3, [eax + ebx*4 + 24]
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]          
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+       
+       /* clear vctot and i forces */
+       /* (vnbtot is zeroed here as well) */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       /* innerk = count - 4; the flags from this sub are still valid at */
+       /* the jge below because the intervening mov does not modify flags */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+       /* at least four j atoms left: take the unrolled loop, else the odd loop */
+       jge   .i2120_unroll_loop
+       jmp   .i2120_odd_inner
+.i2120_unroll_loop:
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       shufps xmm3, xmm6, 0 
+       shufps xmm4, xmm7, 0 
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */ 
+       movaps xmm4, xmm3            /* and in xmm4 */
+       mulps  xmm3, [esp + iqO]
+       mulps  xmm4, [esp + iqH]
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       movaps  [esp + qqO], xmm3
+       movaps  [esp + qqH], xmm4
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm6, xmm7, 0b11011101
+       
+       movd  eax, mm0          
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000
+       
+       shufps xmm0, xmm5, 0b10001000
+       shufps xmm1, xmm5, 0b11011101           
+
+       /* move ixO-izO to xmm4-xmm6 */
+       movaps xmm4, [esp + ixO]
+       movaps xmm5, [esp + iyO]
+       movaps xmm6, [esp + izO]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxO], xmm4
+       movaps [esp + dyO], xmm5
+       movaps [esp + dzO], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       movaps xmm7, xmm4
+       /* rsqO in xmm7 */
+
+       /* move ixH1-izH1 to xmm4-xmm6 */
+       movaps xmm4, [esp + ixH1]
+       movaps xmm5, [esp + iyH1]
+       movaps xmm6, [esp + izH1]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxH1], xmm4
+       movaps [esp + dyH1], xmm5
+       movaps [esp + dzH1], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm6, xmm5
+       addps xmm6, xmm4
+       /* rsqH1 in xmm6 */
+
+       /* move ixH2-izH2 to xmm3-xmm5 */ 
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+
+       /* calc dr */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+
+       /* store dr */
+       movaps [esp + dxH2], xmm3
+       movaps [esp + dyH2], xmm4
+       movaps [esp + dzH2], xmm5
+       /* square it */
+       mulps xmm3,xmm3
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       addps xmm5, xmm4
+       addps xmm5, xmm3
+       /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+       movaps xmm0, xmm5
+       movaps xmm1, xmm6
+       movaps xmm2, xmm7
+
+       mulps  xmm0, [esp + krf]        
+       mulps  xmm1, [esp + krf]        
+       mulps  xmm2, [esp + krf]        
+
+       movaps [esp + krsqH2], xmm0
+       movaps [esp + krsqH1], xmm1
+       movaps [esp + krsqO], xmm2
+       
+       /* start with rsqO - seed in xmm2 */    
+       rsqrtps xmm2, xmm7
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm7      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm7, xmm4      /* rinvO in xmm7 */
+       /* rsqH1 - seed in xmm2 */
+       rsqrtps xmm2, xmm6
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm6      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm6, xmm4      /* rinvH1 in xmm6 */
+       /* rsqH2 - seed in xmm2 */
+       rsqrtps xmm2, xmm5
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm5      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm5, xmm4      /* rinvH2 in xmm5 */
+
+       /* do O interactions */
+       movaps  xmm4, xmm7      
+       mulps   xmm4, xmm4      /* xmm7=rinv, xmm4=rinvsq */
+       movaps xmm1, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm3, xmm2
+       subps  xmm3, xmm1       /* vnb=vnb12-vnb6 */            
+       addps  xmm3, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1       /* nb part of fs */ 
+
+       movaps xmm0, xmm7
+       movaps xmm1, [esp + krsqO]
+       addps  xmm0, xmm1
+       mulps  xmm1, [esp + two]
+       subps  xmm0, [esp + crf] /* xmm0=rinv+krsq-crf */
+       subps  xmm7, xmm1
+       mulps  xmm0, [esp + qqO]
+       mulps  xmm7, [esp + qqO]
+       addps  xmm2, xmm7
+
+       mulps  xmm4, xmm2       /* total fsO in xmm4 */
+
+       addps  xmm0, [esp + vctot]
+       movaps [esp + vnbtot], xmm3
+       movaps [esp + vctot], xmm0
+
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update O forces */
+       movaps xmm3, [esp + fixO]
+       movaps xmm4, [esp + fiyO]
+       movaps xmm7, [esp + fizO]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixO], xmm3
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm7
+       /* update j forces with water O */
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* H1 interactions */
+       movaps  xmm4, xmm6      
+       mulps   xmm4, xmm4      /* xmm6=rinv, xmm4=rinvsq */
+       movaps  xmm7, xmm6
+       movaps  xmm0, [esp + krsqH1]
+       addps   xmm6, xmm0      /* xmm6=rinv+krsq */
+       mulps   xmm0, [esp + two]
+       subps   xmm6, [esp + crf]
+       subps   xmm7, xmm0      /* xmm7=rinv-2*krsq */
+       mulps   xmm6, [esp + qqH] /* vcoul */
+       mulps   xmm7, [esp + qqH]
+       mulps  xmm4, xmm7               /* total fsH1 in xmm4 */
+       
+       addps  xmm6, [esp + vctot]
+
+       movaps xmm0, [esp + dxH1]
+       movaps xmm1, [esp + dyH1]
+       movaps xmm2, [esp + dzH1]
+       movaps [esp + vctot], xmm6
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update H1 forces */
+       movaps xmm3, [esp + fixH1]
+       movaps xmm4, [esp + fiyH1]
+       movaps xmm7, [esp + fizH1]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH1], xmm3
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm7
+       /* update j forces with water H1 */
+       addps  xmm0, [esp + fjx]
+       addps  xmm1, [esp + fjy]
+       addps  xmm2, [esp + fjz]
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* H2 interactions */
+       movaps  xmm4, xmm5      
+       mulps   xmm4, xmm4      /* xmm5=rinv, xmm4=rinvsq */
+       movaps  xmm7, xmm5
+       movaps  xmm0, [esp + krsqH2]
+       addps   xmm5, xmm0      /* xmm5=rinv+krsq */
+       mulps   xmm0, [esp + two]
+       subps   xmm5, [esp + crf]
+       subps   xmm7, xmm0      /* xmm7=rinv-2*krsq */
+       mulps   xmm5, [esp + qqH] /* vcoul */
+       mulps   xmm7, [esp + qqH]
+       mulps  xmm4, xmm7               /* total fsH2 in xmm4 */
+       
+       addps  xmm5, [esp + vctot]
+
+       movaps xmm0, [esp + dxH2]
+       movaps xmm1, [esp + dyH2]
+       movaps xmm2, [esp + dzH2]
+       movaps [esp + vctot], xmm5
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update H2 forces */
+       movaps xmm3, [esp + fixH2]
+       movaps xmm4, [esp + fiyH2]
+       movaps xmm7, [esp + fizH2]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH2], xmm3
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm7
+
+       mov edi, [ebp +faction]
+       /* update j forces */
+       addps xmm0, [esp + fjx]
+       addps xmm1, [esp + fjy]
+       addps xmm2, [esp + fjz]
+
+       movlps xmm4, [edi + eax*4]
+       movlps xmm7, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm7, [edi + edx*4]
+       
+       movaps xmm3, xmm4
+       shufps xmm3, xmm7, 0b10001000
+       shufps xmm4, xmm7, 0b11011101                         
+       /* xmm3 has fjx, xmm4 has fjy */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       /* unpack them back for storing */
+       movaps xmm7, xmm3
+       unpcklps xmm7, xmm4
+       unpckhps xmm3, xmm4     
+       movlps [edi + eax*4], xmm7
+       movlps [edi + ecx*4], xmm3
+       movhps [edi + ebx*4], xmm7
+       movhps [edi + edx*4], xmm3
+       /* finally z forces */
+       movss  xmm0, [edi + eax*4 + 8]
+       movss  xmm1, [edi + ebx*4 + 8]
+       movss  xmm3, [edi + ecx*4 + 8]
+       movss  xmm4, [edi + edx*4 + 8]
+       subss  xmm0, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm1, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm3, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm4, xmm2
+       movss  [edi + eax*4 + 8], xmm0
+       movss  [edi + ebx*4 + 8], xmm1
+       movss  [edi + ecx*4 + 8], xmm3
+       movss  [edi + edx*4 + 8], xmm4
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  4
+       jl    .i2120_odd_inner
+       jmp   .i2120_unroll_loop
+.i2120_odd_inner:      /* reached when the unrolled loop's "innerk -= 4" went negative */
+       add   dword ptr [esp + innerk],  4      /* undo the -4 bias; ZF is set when no j particles remain */
+       jnz   .i2120_odd_loop           /* leftovers: handle them one j particle at a time */
+       jmp   .i2120_updateouterdata    /* nothing left for this i water */
+.i2120_odd_loop:       /* one j particle per pass; lanes hold O, (unused), H1, H2 of the i water */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       add   dword ptr [esp + innerjjnr],  4   /* advance to the next jjnr entry */
+
+       xorps xmm4, xmm4
+       movss xmm4, [esp + iqO]
+       mov esi, [ebp + charge] 
+       movhps xmm4, [esp + iqH]        /* lanes 2-3 are loaded from iqH */
+       movss xmm3, [esi + eax*4]       /* charge in xmm3 */
+       shufps xmm3, xmm3, 0
+       mulps xmm3, xmm4
+       movaps [esp + qqO], xmm3        /* use oxygen qq for storage */
+
+       xorps xmm6, xmm6
+       mov esi, [ebp + type]
+       mov ebx, [esi + eax*4]
+       mov esi, [ebp + nbfp]
+       shl ebx, 1      
+       add ebx, [esp + ntia]
+       movlps xmm6, [esi + ebx*4]
+       movaps xmm7, xmm6
+       shufps xmm6, xmm6, 0b11111100   /* lane 0 keeps the first loaded value, lanes 1-3 become zero */
+       shufps xmm7, xmm7, 0b11111101   /* lane 0 gets the second loaded value, lanes 1-3 become zero */
+       movaps [esp + c6], xmm6
+       movaps [esp + c12], xmm7
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr */
+       
+       /* move j coords to xmm0-xmm2 */
+       movss xmm0, [esi + eax*4]
+       movss xmm1, [esi + eax*4 + 4]
+       movss xmm2, [esi + eax*4 + 8]
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       
+       movss xmm3, [esp + ixO]
+       movss xmm4, [esp + iyO]
+       movss xmm5, [esp + izO]
+               
+       movlps xmm6, [esp + ixH1]
+       movlps xmm7, [esp + ixH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm3, xmm6              /* xmm3 = ixO, 0, ixH1, ixH2 */
+       movlps xmm6, [esp + iyH1]
+       movlps xmm7, [esp + iyH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm4, xmm6
+       movlps xmm6, [esp + izH1]
+       movlps xmm7, [esp + izH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm5, xmm6
+
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       
+       movaps [esp + dxO], xmm3
+       movaps [esp + dyO], xmm4
+       movaps [esp + dzO], xmm5
+
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       /* rsq in xmm4 */
+
+       movaps xmm0, xmm4
+       mulps xmm0, [esp + krf]
+       movaps [esp + krsqO], xmm0
+       
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+       movaps xmm1, xmm4
+       mulss  xmm1, xmm4
+       mulss  xmm1, xmm4       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulss  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subss  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulss  xmm1, [esp + six]
+       mulss  xmm2, [esp + twelve]
+       subss  xmm2, xmm1
+
+       movaps xmm1, xmm0       /* xmm1=rinv */
+       movaps xmm3, [esp + krsqO]
+       addps  xmm0, xmm3       /* xmm0=rinv+krsq */
+       mulps  xmm3, [esp + two]
+       subps  xmm0, [esp + crf] /* xmm0=rinv+krsq-crf */
+       subps  xmm1, xmm3       /* xmm1=rinv-2*krsq */
+       mulps  xmm0, [esp + qqO]        /* xmm0=vcoul */
+       mulps  xmm1, [esp + qqO]        /* xmm1=coul part of fs */
+
+       addps xmm2, xmm1        /* total fs */
+       
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       addps  xmm0, [esp + vctot]
+       movaps [esp + vctot], xmm0
+       
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       movss  xmm3, [esp + fixO]       
+       movss  xmm4, [esp + fiyO]       
+       movss  xmm5, [esp + fizO]       
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esp + fixO], xmm3       
+       movss  [esp + fiyO], xmm4       
+       movss  [esp + fizO], xmm5       /* updated the O force now do the H's */
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       shufps xmm3, xmm3, 0b11100110   /* shift right: lane 2 (H1) moves to lane 0 */
+       shufps xmm4, xmm4, 0b11100110
+       shufps xmm5, xmm5, 0b11100110
+       addss  xmm3, [esp + fixH1]
+       addss  xmm4, [esp + fiyH1]
+       addss  xmm5, [esp + fizH1]
+       movss  [esp + fixH1], xmm3      
+       movss  [esp + fiyH1], xmm4      
+       movss  [esp + fizH1], xmm5      /* updated the H1 force */
+
+       mov edi, [ebp + faction]
+       shufps xmm3, xmm3, 0b11100111   /* shift right: lane 3 (H2) moves to lane 0 */
+       shufps xmm4, xmm4, 0b11100111
+       shufps xmm5, xmm5, 0b11100111
+       addss  xmm3, [esp + fixH2]
+       addss  xmm4, [esp + fiyH2]
+       addss  xmm5, [esp + fizH2]
+       movss  [esp + fixH2], xmm3      
+       movss  [esp + fiyH2], xmm4      
+       movss  [esp + fizH2], xmm5      /* updated the H2 force */
+
+       /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+       xorps  xmm5, xmm5
+       movaps xmm3, xmm0
+       movlps xmm6, [edi + eax*4]
+       movss  xmm7, [edi + eax*4 + 8]
+       unpcklps xmm3, xmm1
+       movlhps  xmm3, xmm5     
+       unpckhps xmm0, xmm1             
+       addps    xmm0, xmm3
+       movhlps  xmm3, xmm0     
+       addps    xmm0, xmm3     /* x,y sum in xmm0 */
+
+       movhlps  xmm1, xmm2
+       addss    xmm2, xmm1
+       shufps   xmm1, xmm1, 1 
+       addss    xmm2, xmm1    /* z sum in xmm2 */
+       subps    xmm6, xmm0
+       subss    xmm7, xmm2
+       
+       movlps [edi + eax*4],     xmm6
+       movss  [edi + eax*4 + 8], xmm7
+
+       dec dword ptr [esp + innerk]
+       jz    .i2120_updateouterdata
+       jmp   .i2120_odd_loop
+.i2120_updateouterdata:        /* reduce per-lane i forces and energies, write them back, loop or finish */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate  Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO]
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000      /* lanes 0,1 = x,y sums */
+
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* increment fshift force */ 
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       mov   edx, [ebp + gid]  
+       mov   edx, [edx]        /* edx = index used for Vc[] and Vnb[] below */
+       add   dword ptr [ebp + gid],  4 /* advance the gid pointer one entry */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+        
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i2120_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i2120_outer
+.i2120_end:    /* common exit: unwind the stack frame and return */
+       emms
+       mov eax, [esp + salign] /* padding stored in the salign slot (presumably by the prologue - not visible here) */
+       add esp, eax
+       add esp, 792            /* release the local stack area (size presumably matches the prologue) */
+       pop edi                 /* restore the six saved general-purpose registers */
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+       
+.globl inl2130_sse
+       .type inl2130_sse,@function
+inl2130_sse:   /* entry: arguments are addressed via ebp, locals via the 16-byte aligned esp */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           argkrf,         60      /* argument offset; named apart from the stack local krf below */
+.equ           argcrf,         64      /* argument offset; named apart from the stack local crf below */
+.equ           type,           68
+.equ           ntype,          72
+.equ           nbfp,           76      
+.equ           Vnb,            80
+       /* stack offsets for local variables */ 
+       /* bottom of stack is cache-aligned for sse use */
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           jxO,            144
+.equ           jyO,            160
+.equ           jzO,            176
+.equ           jxH1,           192
+.equ           jyH1,           208
+.equ           jzH1,           224
+.equ           jxH2,           240
+.equ           jyH2,           256
+.equ           jzH2,           272
+.equ           dxOO,           288
+.equ           dyOO,           304
+.equ           dzOO,           320     
+.equ           dxOH1,          336
+.equ           dyOH1,          352
+.equ           dzOH1,          368     
+.equ           dxOH2,          384
+.equ           dyOH2,          400
+.equ           dzOH2,          416     
+.equ           dxH1O,          432
+.equ           dyH1O,          448
+.equ           dzH1O,          464     
+.equ           dxH1H1,         480
+.equ           dyH1H1,         496
+.equ           dzH1H1,         512     
+.equ           dxH1H2,         528
+.equ           dyH1H2,         544
+.equ           dzH1H2,         560     
+.equ           dxH2O,          576
+.equ           dyH2O,          592
+.equ           dzH2O,          608     
+.equ           dxH2H1,         624
+.equ           dyH2H1,         640
+.equ           dzH2H1,         656     
+.equ           dxH2H2,         672
+.equ           dyH2H2,         688
+.equ           dzH2H2,         704
+.equ           qqOO,           720
+.equ           qqOH,           736
+.equ           qqHH,           752
+.equ           c6,             768
+.equ           c12,            784
+.equ           six,            800
+.equ           twelve,         816              
+.equ           vctot,          832
+.equ           vnbtot,         848
+.equ           fixO,           864
+.equ           fiyO,           880
+.equ           fizO,           896
+.equ           fixH1,          912
+.equ           fiyH1,          928
+.equ           fizH1,          944
+.equ           fixH2,          960
+.equ           fiyH2,          976
+.equ           fizH2,          992
+.equ           fjxO,          1008
+.equ           fjyO,          1024
+.equ           fjzO,          1040
+.equ           fjxH1,         1056
+.equ           fjyH1,         1072
+.equ           fjzH1,         1088
+.equ           fjxH2,         1104
+.equ           fjyH2,         1120
+.equ           fjzH2,         1136
+.equ           half,          1152
+.equ           three,         1168
+.equ           rsqOO,         1184
+.equ           rsqOH1,        1200
+.equ           rsqOH2,        1216
+.equ           rsqH1O,        1232
+.equ           rsqH1H1,       1248
+.equ           rsqH1H2,       1264
+.equ           rsqH2O,        1280
+.equ           rsqH2H1,       1296
+.equ           rsqH2H2,       1312
+.equ           rinvOO,        1328
+.equ           rinvOH1,       1344
+.equ           rinvOH2,       1360
+.equ           rinvH1O,       1376
+.equ           rinvH1H1,      1392
+.equ           rinvH1H2,      1408
+.equ           rinvH2O,       1424
+.equ           rinvH2H1,      1440
+.equ           rinvH2H2,      1456
+.equ           two,           1472
+.equ           krf,           1488     /* stack local: krf copied to all four lanes */
+.equ           crf,           1504     /* stack local: crf copied to all four lanes */
+.equ           is3,           1520
+.equ           ii3,           1524
+.equ           innerjjnr,     1528
+.equ           innerk,        1532
+.equ           salign,        1536     /* last local; 1536 + 4 = 1540 bytes reserved below */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 1540           /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax            /* round esp down to a 16-byte boundary for movaps */
+       mov [esp + salign], eax /* remember the padding so the epilogue can undo it */
+
+       emms
+
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movups xmm2, [sse_six]
+       movups xmm3, [sse_twelve]
+       movups xmm4, [sse_two]
+       movss xmm5, [ebp + argkrf]      /* read the argument, not the stack-local offset */
+       movss xmm6, [ebp + argcrf]      /* read the argument, not the stack-local offset */
+       
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+       movaps [esp + six],  xmm2
+       movaps [esp + twelve], xmm3
+       movaps [esp + two], xmm4
+       shufps xmm5, xmm5, 0
+       shufps xmm6, xmm6, 0
+       movaps [esp + krf], xmm5
+       movaps [esp + crf], xmm6
+       
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       /* charge[ii] */
+       movss xmm4, xmm3        
+       movss xmm5, [edx + ebx*4 + 4]   /* charge[ii+1] */
+       movss xmm6, [ebp + facel]
+       mulss  xmm3, xmm3
+       mulss  xmm4, xmm5
+       mulss  xmm5, xmm5
+       mulss  xmm3, xmm6
+       mulss  xmm4, xmm6
+       mulss  xmm5, xmm6
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + qqOO], xmm3       /* facel*q[ii]*q[ii] in all lanes */
+       movaps [esp + qqOH], xmm4       /* facel*q[ii]*q[ii+1] in all lanes */
+       movaps [esp + qqHH], xmm5       /* facel*q[ii+1]*q[ii+1] in all lanes */
+               
+       xorps xmm0, xmm0
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]
+       shl   ecx, 1
+       mov   edx, ecx
+       imul  ecx, [ebp + ntype]      /* ecx = ntia = 2*ntype*type[ii0] */
+       add   edx, ecx
+       mov   eax, [ebp + nbfp]
+       movlps xmm0, [eax + edx*4] 
+       movaps xmm1, xmm0
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0b01010101
+       movaps [esp + c6], xmm0
+       movaps [esp + c12], xmm1
+
+.i2130_outer:  /* per-i-water setup: shifted i coordinates, cleared accumulators, j-list bounds */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx  
+       
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       addss xmm3, [eax + ebx*4]
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]           
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       movss xmm3, xmm0        /* lane 0 = shift vector again, for the third atom */
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]          
+       addss xmm3, [eax + ebx*4 + 24]
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]          
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+
+       /* clear vctot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4      /* advance the jindex pointer */
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i2130_unroll_loop        /* flags still come from the sub above; mov leaves them intact */
+       jmp   .i2130_single_check
+.i2130_unroll_loop:    
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4] 
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+       
+       /* move j coordinates to local temp variables */
+       movlps xmm2, [esi + eax*4]
+       movlps xmm3, [esi + eax*4 + 12]
+       movlps xmm4, [esi + eax*4 + 24]
+
+       movlps xmm5, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 12]
+       movlps xmm7, [esi + ebx*4 + 24]
+
+       movhps xmm2, [esi + ecx*4]
+       movhps xmm3, [esi + ecx*4 + 12]
+       movhps xmm4, [esi + ecx*4 + 24]
+
+       movhps xmm5, [esi + edx*4]
+       movhps xmm6, [esi + edx*4 + 12]
+       movhps xmm7, [esi + edx*4 + 24]
+
+       /* current state: */    
+       /* xmm2= jxOa  jyOa  jxOc  jyOc */
+       /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+       /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+       /* xmm5= jxOb  jyOb  jxOd  jyOd */
+       /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+       /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+       
+       movaps xmm0, xmm2
+       movaps xmm1, xmm3
+       unpcklps xmm0, xmm5     /* xmm0= jxOa  jxOb  jyOa  jyOb */
+       unpcklps xmm1, xmm6     /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+       unpckhps xmm2, xmm5     /* xmm2= jxOc  jxOd  jyOc  jyOd */
+       unpckhps xmm3, xmm6     /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+       movaps xmm5, xmm4
+       movaps   xmm6, xmm0
+       unpcklps xmm4, xmm7     /* xmm4= jxH2a jxH2b jyH2a jyH2b */             
+       unpckhps xmm5, xmm7     /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+       movaps   xmm7, xmm1
+       movlhps  xmm0, xmm2     /* xmm0= jxOa  jxOb  jxOc  jxOd */
+       movaps [esp + jxO], xmm0
+       movhlps  xmm2, xmm6     /* xmm2= jyOa  jyOb  jyOc  jyOd */
+       movaps [esp + jyO], xmm2
+       movlhps  xmm1, xmm3
+       movaps [esp + jxH1], xmm1
+       movhlps  xmm3, xmm7
+       movaps   xmm6, xmm4
+       movaps [esp + jyH1], xmm3
+       movlhps  xmm4, xmm5
+       movaps [esp + jxH2], xmm4
+       movhlps  xmm5, xmm6
+       movaps [esp + jyH2], xmm5
+
+       movss  xmm0, [esi + eax*4 + 8]
+       movss  xmm1, [esi + eax*4 + 20]
+       movss  xmm2, [esi + eax*4 + 32]
+
+       movss  xmm3, [esi + ecx*4 + 8]
+       movss  xmm4, [esi + ecx*4 + 20]
+       movss  xmm5, [esi + ecx*4 + 32]
+
+       movhps xmm0, [esi + ebx*4 + 4]
+       movhps xmm1, [esi + ebx*4 + 16]
+       movhps xmm2, [esi + ebx*4 + 28]
+       
+       movhps xmm3, [esi + edx*4 + 4]
+       movhps xmm4, [esi + edx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 28]
+       
+       shufps xmm0, xmm3, 0b11001100
+       shufps xmm1, xmm4, 0b11001100
+       shufps xmm2, xmm5, 0b11001100
+       movaps [esp + jzO],  xmm0
+       movaps [esp + jzH1],  xmm1
+       movaps [esp + jzH2],  xmm2
+
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixO]
+       movaps xmm4, [esp + iyO]
+       movaps xmm5, [esp + izO]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxOH1], xmm3
+       movaps [esp + dyOH1], xmm4
+       movaps [esp + dzOH1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOO], xmm0
+       movaps [esp + rsqOH1], xmm3
+
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       subps  xmm3, [esp + jxO]
+       subps  xmm4, [esp + jyO]
+       subps  xmm5, [esp + jzO]
+       movaps [esp + dxOH2], xmm0
+       movaps [esp + dyOH2], xmm1
+       movaps [esp + dzOH2], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1O], xmm3
+       movaps [esp + dyH1O], xmm4
+       movaps [esp + dzH1O], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOH2], xmm0
+       movaps [esp + rsqH1O], xmm3
+
+       movaps xmm0, [esp + ixH1]
+       movaps xmm1, [esp + iyH1]
+       movaps xmm2, [esp + izH1]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH1]
+       subps  xmm1, [esp + jyH1]
+       subps  xmm2, [esp + jzH1]
+       subps  xmm3, [esp + jxH2]
+       subps  xmm4, [esp + jyH2]
+       subps  xmm5, [esp + jzH2]
+       movaps [esp + dxH1H1], xmm0
+       movaps [esp + dyH1H1], xmm1
+       movaps [esp + dzH1H1], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1H2], xmm3
+       movaps [esp + dyH1H2], xmm4
+       movaps [esp + dzH1H2], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqH1H1], xmm0
+       movaps [esp + rsqH1H2], xmm3
+
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxH2O], xmm0
+       movaps [esp + dyH2O], xmm1
+       movaps [esp + dzH2O], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH2H1], xmm3
+       movaps [esp + dyH2H1], xmm4
+       movaps [esp + dzH2H1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       movaps [esp + rsqH2O], xmm0
+       movaps [esp + rsqH2H1], xmm4
+
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       movaps [esp + dxH2H2], xmm0
+       movaps [esp + dyH2H2], xmm1
+       movaps [esp + dzH2H2], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2
+       movaps [esp + rsqH2H2], xmm0
+               
+       /* Inverse square roots for all nine atom pairs, two pairs at a time.
+        * Each pair starts from the rsqrtps estimate lu and refines it as
+        *     rinv = half * lu * (three - rsq*lu*lu)
+        * using the constants at [esp + three] and [esp + half] (presumably
+        * 3.0 and 0.5, i.e. one Newton-Raphson step - confirm where they are set).
+        * First pair: xmm0 = rsqH2H2 and xmm4 = rsqH2H1, still live in registers.
+        */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1      /* keep the raw estimates lu in xmm2/xmm6 */
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1      /* lu*lu */
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0      /* rsq*lu*lu */
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1      /* three - rsq*lu*lu */
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2      /* lu*(three - rsq*lu*lu) */
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvH2H2 */
+       mulps   xmm7, [esp + half] /* rinvH2H1 */
+       movaps  [esp + rinvH2H2], xmm3
+       movaps  [esp + rinvH2H1], xmm7
+       
+       /* same refinement for O-O and O-H1, rsq read back from the stack */
+       rsqrtps xmm1, [esp + rsqOO]
+       rsqrtps xmm5, [esp + rsqOH1]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOO]
+       mulps   xmm5, [esp + rsqOH1]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvOO */
+       mulps   xmm7, [esp + half] /* rinvOH1 */
+       movaps  [esp + rinvOO], xmm3
+       movaps  [esp + rinvOH1], xmm7
+       
+       /* O-H2 and H1-O */
+       rsqrtps xmm1, [esp + rsqOH2]
+       rsqrtps xmm5, [esp + rsqH1O]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOH2]
+       mulps   xmm5, [esp + rsqH1O]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvOH2 */
+       mulps   xmm7, [esp + half] /* rinvH1O */
+       movaps  [esp + rinvOH2], xmm3
+       movaps  [esp + rinvH1O], xmm7
+       
+       /* H1-H1 and H1-H2 */
+       rsqrtps xmm1, [esp + rsqH1H1]
+       rsqrtps xmm5, [esp + rsqH1H2]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqH1H1]
+       mulps   xmm5, [esp + rsqH1H2]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvH1H1 */
+       mulps   xmm7, [esp + half] /* rinvH1H2 */
+       movaps  [esp + rinvH1H1], xmm3
+       movaps  [esp + rinvH1H2], xmm7
+       
+       /* H2-O is the ninth pair and is processed on its own */
+       rsqrtps xmm1, [esp + rsqH2O]
+       movaps  xmm2, xmm1
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, [esp + rsqH2O]
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2
+       mulps   xmm3, [esp + half] /* rinvH2O */
+       movaps  [esp + rinvH2O], xmm3
+
+       /* O-O interaction: the only pair in this block that uses the c6/c12
+        * terms in addition to the reaction-field Coulomb term.
+        * xmm6 leaves this section holding vcoul(OO) plus the value loaded
+        * from [esp + vctot]; the following sections keep adding to it.
+        */
+       movaps xmm0, [esp + rinvOO]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]
+       mulps  xmm0, xmm0       /* xmm0=rinvsq */
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       mulps  xmm5, [esp + rsqOO] /* xmm5=krsq */
+       movaps xmm6, xmm5
+       addps  xmm6, xmm7       /* xmm6=rinv+krsq */
+       subps  xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+       
+       mulps  xmm6, [esp + qqOO] /* xmm6=vcoul=qq*(rinv+krsq-crf) */
+       mulps xmm5, [esp + two]  /* xmm5=krsq scaled by [esp + two] */
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqOO] /* xmm7 = coul part of fscal */
+       
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm1, [esp + c6]  /* xmm1=vnb6 */
+       mulps  xmm2, [esp + c12] /* xmm2=vnb12 */
+       movaps xmm3, xmm2
+       subps  xmm3, xmm1       /* xmm3=vnb12-vnb6 */
+       addps  xmm3, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       movaps [esp + vnbtot], xmm3
+       subps  xmm2, xmm1       /* xmm2=twelve*vnb12-six*vnb6 */
+       addps  xmm2, xmm7       /* add the coulomb part */
+       addps  xmm6, [esp + vctot] /* local vctot summation variable */
+       mulps  xmm0, xmm2       /* xmm0=total fscal (times rinvsq) */
+       
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       /* j O force starts at zero here: fj = 0 - fscal*d, fi += fscal*d */
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOO]
+       mulps xmm1, [esp + dyOO]
+       mulps xmm2, [esp + dzOO]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H1 interaction (i oxygen with j H1): coulomb only.
+        * xmm6 carries the running vctot sum across these sections.
+        */
+       movaps xmm0, [esp + rinvOH1]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is rewritten below before it is read - copy looks redundant */
+       mulps  xmm5, [esp + rsqOH1] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       mulps  xmm0, xmm0       /* xmm0=rinvsq */
+       subps  xmm4, [esp + crf] /* xmm4=rinv+krsq-crf */
+       mulps  xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsOH1  */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       
+       /* j H1 force starts at zero here: fj = 0 - fscal*d, fi += fscal*d */
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH1]
+       mulps xmm1, [esp + dyOH1]
+       mulps xmm2, [esp + dzOH1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H2 interaction (i oxygen with j H2): coulomb only.
+        * xmm6 carries the running vctot sum across these sections.
+        */
+       movaps xmm0, [esp + rinvOH2]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is rewritten below before it is read - copy looks redundant */
+       mulps  xmm5, [esp + rsqOH2] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       subps  xmm4, [esp + crf] /* xmm4=rinv+krsq-crf */
+       mulps  xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsOH2 */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       /* j H2 force starts at zero here: fj = 0 - fscal*d, fi += fscal*d */
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH2]
+       mulps xmm1, [esp + dyOH2]
+       mulps xmm2, [esp + dzOH2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* H1-O interaction (i H1 with j oxygen): coulomb only.
+        * xmm6 carries the running vctot sum across these sections.
+        */
+       movaps xmm0, [esp + rinvH1O]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is rewritten below before it is read - copy looks redundant */
+       mulps  xmm5, [esp + rsqH1O] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       subps  xmm4, [esp + crf] /* xmm4=rinv+krsq-crf */
+       mulps  xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH1O */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       /* continue accumulating on the j O force: fj -= fscal*d, fi += fscal*d */
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH1O]
+       mulps xmm1, [esp + dyH1O]
+       mulps xmm2, [esp + dzH1O]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H1 interaction (i H1 with j H1): coulomb only.
+        * xmm6 carries the running vctot sum across these sections.
+        */
+       movaps xmm0, [esp + rinvH1H1]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is rewritten below before it is read - copy looks redundant */
+       mulps  xmm5, [esp + rsqH1H1] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf] /* xmm4=rinv+krsq-crf */
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH1H1 */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       /* continue accumulating on the j H1 force: fj -= fscal*d, fi += fscal*d */
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH1H1]
+       mulps xmm1, [esp + dyH1H1]
+       mulps xmm2, [esp + dzH1H1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H2 interaction (i H1 with j H2): coulomb only.
+        * xmm6 carries the running vctot sum across these sections.
+        */
+       movaps xmm0, [esp + rinvH1H2]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is rewritten below before it is read - copy looks redundant */
+       mulps  xmm5, [esp + rsqH1H2] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       subps  xmm4, [esp + crf] /* xmm4=rinv+krsq-crf */
+       mulps  xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH1H2 */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       
+       /* continue accumulating on the j H2 force: fj -= fscal*d, fi += fscal*d */
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH1H2]
+       mulps xmm1, [esp + dyH1H2]
+       mulps xmm2, [esp + dzH1H2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H2-O interaction (i H2 with j oxygen): coulomb only.
+        * xmm6 carries the running vctot sum across these sections.
+        */
+       movaps xmm0, [esp + rinvH2O]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is rewritten below before it is read - copy looks redundant */
+       mulps  xmm5, [esp + rsqH2O] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf] /* xmm4=rinv+krsq-crf */
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH2O */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       /* continue accumulating on the j O force: fj -= fscal*d, fi += fscal*d */
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH2O]
+       mulps xmm1, [esp + dyH2O]
+       mulps xmm2, [esp + dzH2O]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H1 interaction (i H2 with j H1): coulomb only.
+        * xmm6 carries the running vctot sum across these sections.
+        */
+       movaps xmm0, [esp + rinvH2H1]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is rewritten below before it is read - copy looks redundant */
+       mulps  xmm5, [esp + rsqH2H1] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf] /* xmm4=rinv+krsq-crf */
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH2H1 */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       /* continue accumulating on the j H1 force: fj -= fscal*d, fi += fscal*d */
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH2H1]
+       mulps xmm1, [esp + dyH2H1]
+       mulps xmm2, [esp + dzH2H1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H2 interaction (i H2 with j H2): coulomb only.
+        * This is the last pair, so the running coulomb energy in xmm6 is
+        * written back to [esp + vctot] here.
+        */
+       movaps xmm0, [esp + rinvH2H2]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]
+       movaps xmm1, xmm0
+       mulps  xmm5, [esp + rsqH2H2] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf] /* xmm4=rinv+krsq-crf */
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH2H2 */
+
+       /* one copy of fscal per coordinate (the original issued these two
+        * register copies twice in a row; once is enough) */
+       movaps xmm1, xmm0
+       movaps [esp + vctot], xmm6
+       movaps xmm2, xmm0
+
+       /* continue accumulating on the j H2 force: fj -= fscal*d, fi += fscal*d */
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH2H2]
+       mulps xmm1, [esp + dyH2H2]
+       mulps xmm2, [esp + dzH2H2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       mov edi, [ebp +faction]
+               
+       /* Did all interactions - now update j forces */
+       /* 4 j waters with three atoms each - first do a & b j particles */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpcklps xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjxOb  fjyOb */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOb  fjyOb */ 
+       unpcklps xmm1, xmm2        /* xmm1= fjzOa  fjxH1a fjzOb  fjxH1b */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpcklps xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1b fjzH1b */
+       unpcklps xmm5, xmm6        /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjzOa  fjxH1a */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOb  fjyOb  fjzOb  fjxH1b */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+       movups   xmm1, [edi + eax*4]
+       movups   xmm2, [edi + eax*4 + 16]
+       movups   xmm5, [edi + ebx*4]
+       movups   xmm6, [edi + ebx*4 + 16]
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + eax*4 + 32]
+       movss    xmm3, [edi + ebx*4 + 32]
+       
+       movaps   xmm4, [esp + fjzH2]
+       movaps   xmm7, xmm4
+       shufps   xmm7, xmm7, 1
+       
+       movups   [edi + eax*4],     xmm1
+       movups   [edi + eax*4 + 16],xmm2
+       movups   [edi + ebx*4],     xmm5
+       movups   [edi + ebx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + eax*4 + 32], xmm0
+       movss    [edi + ebx*4 + 32], xmm3       
+
+       /* then do the second pair (c & d) */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpckhps xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjxOd  fjyOd */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOd  fjyOd */ 
+       unpckhps xmm1, xmm2        /* xmm1= fjzOc  fjxH1c fjzOd  fjxH1d */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpckhps xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1d fjzH1d */     
+       unpckhps xmm5, xmm6        /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjzOc  fjxH1c */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOd  fjyOd  fjzOd  fjxH1d */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c  */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+       movups   xmm1, [edi + ecx*4]
+       movups   xmm2, [edi + ecx*4 + 16]
+       movups   xmm5, [edi + edx*4]
+       movups   xmm6, [edi + edx*4 + 16]
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + ecx*4 + 32]
+       movss    xmm3, [edi + edx*4 + 32]
+       
+       movaps   xmm4, [esp + fjzH2]
+       movaps   xmm7, xmm4
+       shufps   xmm4, xmm4, 0b10
+       shufps   xmm7, xmm7, 0b11
+       movups   [edi + ecx*4],     xmm1
+       movups   [edi + ecx*4 + 16],xmm2
+       movups   [edi + edx*4],     xmm5
+       movups   [edi + edx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + ecx*4 + 32], xmm0
+       movss    [edi + edx*4 + 32], xmm3       
+       
+       /* Should we do one more unrolled iteration?
+        * innerk -= 4: while the result is not negative, run the 4-way
+        * unrolled loop again. Otherwise put the 4 back and, if anything is
+        * left, finish one j water at a time in the single loop.
+        * The operand size is spelled out ("dword ptr") because a memory
+        * destination with an immediate source does not imply one; this
+        * matches the "dec dword ptr [esp + innerk]" used by the single loop.
+        */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i2130_single_check
+       jmp   .i2130_unroll_loop
+.i2130_single_check:
+       add   dword ptr [esp + innerk],  4
+       jnz   .i2130_single_loop
+       jmp   .i2130_updateouterdata
+.i2130_single_loop:
+       /* Fetch the next j index and advance the index pointer by one
+        * 4-byte entry. The size is given explicitly ("dword ptr") since a
+        * memory destination with an immediate source does not imply one.
+        */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]                 /* eax = jjnr[k] */
+       add   dword ptr [esp + innerjjnr],  4
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]         /* eax = 3*jnr, scaled by 4 when used as an index */
+
+       /* fetch j coordinates */
+       xorps xmm3, xmm3
+       xorps xmm4, xmm4
+       xorps xmm5, xmm5
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + eax*4 + 4]
+       movss xmm5, [esi + eax*4 + 8]
+
+       movlps xmm6, [esi + eax*4 + 12]
+       movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+       /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+       movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+       shufps xmm6, xmm6, 0b11011000    /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+       movlhps xmm3, xmm6              /* xmm3= jxO   0  jxH1 jxH2 */
+       movaps  xmm0, [esp + ixO]     
+       movaps  xmm1, [esp + iyO]
+       movaps  xmm2, [esp + izO]       
+       shufps  xmm4, xmm6, 0b11100100 /* xmm4= jyO   0   jyH1 jyH2 */
+       shufps xmm5, xmm7, 0b11000100  /* xmm5= jzO   0   jzH1 jzH2 */
+       /* store all j coordinates in jO */ 
+       movaps [esp + jxO], xmm3
+       movaps [esp + jyO], xmm4
+       movaps [esp + jzO], xmm5
+       subps  xmm0, xmm3
+       subps  xmm1, xmm4
+       subps  xmm2, xmm5
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2        /* have rsq in xmm0 */
+
+       movaps xmm6, xmm0
+       
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       mulps   xmm6, [esp + krf] /* xmm6=krsq */
+       movaps  xmm2, xmm1
+       movaps  xmm7, xmm6
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, xmm0
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2                                                      
+       mulps   xmm3, [esp + half] /* rinv iO - j water */
+
+       addps   xmm6, xmm3      /* xmm6=rinv+krsq */
+       mulps   xmm7, [esp + two]
+       subps  xmm6, [esp + crf]        /* xmm6=rinv+krsq-crf */
+       
+       xorps   xmm1, xmm1
+       movaps  xmm0, xmm3
+       subps   xmm3, xmm7      /* xmm3=rinv-2*krsq */
+       xorps   xmm4, xmm4
+       mulps   xmm0, xmm0      /* xmm0=rinvsq */
+       /* fetch charges to xmm4 (temporary) */
+       movss   xmm4, [esp + qqOO]
+       movss   xmm1, xmm0
+       movhps  xmm4, [esp + qqOH]
+       mulss   xmm1, xmm0
+
+       mulps xmm6, xmm4        /* vcoul */ 
+       mulps xmm3, xmm4        /* coul part of fs */ 
+       
+       mulss   xmm1, xmm0      /* xmm1(0)=rinvsix */
+       movaps  xmm2, xmm1      /* zero everything else in xmm2 */
+       mulss   xmm2, xmm2      /* xmm2=rinvtwelve */
+
+       mulss   xmm1, [esp + c6]
+       mulss   xmm2, [esp + c12]
+       movaps  xmm4, xmm2
+       subss   xmm4, xmm1      /* vnbtot=vnb12-vnb6 */
+       addps   xmm4, [esp + vnbtot]
+       mulss   xmm1, [esp + six]
+       mulss   xmm2, [esp + twelve]    
+       movaps  [esp + vnbtot], xmm4
+       subss   xmm2, xmm1      /* fsD+fsR */
+       addps   xmm2, xmm3      /* fsC+fsD+fsR */
+
+       addps   xmm6, [esp + vctot]
+       mulps   xmm0, xmm2      /* total fscal */
+       movaps  [esp + vctot], xmm6     
+
+       movaps  xmm1, xmm0
+       movaps  xmm2, xmm0
+       mulps   xmm0, [esp + dxOO]
+       mulps   xmm1, [esp + dyOO]
+       mulps   xmm2, [esp + dzOO]
+       
+       /* initial update for j forces */
+       xorps   xmm3, xmm3
+       xorps   xmm4, xmm4
+       xorps   xmm5, xmm5
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixO]
+       addps   xmm1, [esp + fiyO]
+       addps   xmm2, [esp + fizO]
+       movaps  [esp + fixO], xmm0
+       movaps  [esp + fiyO], xmm1
+       movaps  [esp + fizO], xmm2
+
+       
+       /* done with i O Now do i H1 & H2 simultaneously first get i particle coords: */
+       movaps  xmm0, [esp + ixH1]
+       movaps  xmm1, [esp + iyH1]
+       movaps  xmm2, [esp + izH1]      
+       movaps  xmm3, [esp + ixH2] 
+       movaps  xmm4, [esp + iyH2] 
+       movaps  xmm5, [esp + izH2] 
+       subps   xmm0, [esp + jxO]
+       subps   xmm1, [esp + jyO]
+       subps   xmm2, [esp + jzO]
+       subps   xmm3, [esp + jxO]
+       subps   xmm4, [esp + jyO]
+       subps   xmm5, [esp + jzO]
+       movaps [esp + dxH1O], xmm0
+       movaps [esp + dyH1O], xmm1
+       movaps [esp + dzH1O], xmm2
+       movaps [esp + dxH2O], xmm3
+       movaps [esp + dyH2O], xmm4
+       movaps [esp + dzH2O], xmm5
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       mulps xmm3, xmm3
+       mulps xmm4, xmm4
+       mulps xmm5, xmm5
+       addps xmm0, xmm1
+       addps xmm4, xmm3
+       addps xmm0, xmm2        /* have rsqH1 in xmm0 */
+       addps xmm4, xmm5        /* have rsqH2 in xmm4 */
+       
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinv H1 - j water */
+       mulps   xmm7, [esp + half] /* rinv H2 - j water */ 
+
+       mulps xmm0, [esp + krf] /* krsq */
+       mulps xmm4, [esp + krf] /* krsq */ 
+
+
+       /* assemble charges in xmm6 */
+       xorps   xmm6, xmm6
+       movss   xmm6, [esp + qqOH]
+       movhps  xmm6, [esp + qqHH]
+       movaps  xmm1, xmm0
+       movaps  xmm5, xmm4
+       addps   xmm0, xmm3      /* krsq+rinv */
+       addps   xmm4, xmm7      /* krsq+rinv */
+       subps xmm0, [esp + crf]
+       subps xmm4, [esp + crf]
+       mulps   xmm1, [esp + two]
+       mulps   xmm5, [esp + two]
+       mulps   xmm0, xmm6      /* vcoul */
+       mulps   xmm4, xmm6      /* vcoul */
+       addps   xmm4, xmm0              
+       addps   xmm4, [esp + vctot]
+       movaps  [esp + vctot], xmm4
+       movaps  xmm0, xmm3
+       movaps  xmm4, xmm7
+       mulps   xmm3, xmm3
+       mulps   xmm7, xmm7
+       subps   xmm0, xmm1
+       subps   xmm4, xmm5
+       mulps   xmm0, xmm6
+       mulps   xmm4, xmm6
+       mulps   xmm0, xmm3      /* fscal */
+       mulps   xmm7, xmm4      /* fscal */
+       
+       movaps  xmm1, xmm0
+       movaps  xmm2, xmm0
+       mulps   xmm0, [esp + dxH1O]
+       mulps   xmm1, [esp + dyH1O]
+       mulps   xmm2, [esp + dzH1O]
+       /* update forces H1 - j water */
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH1]
+       addps   xmm1, [esp + fiyH1]
+       addps   xmm2, [esp + fizH1]
+       movaps  [esp + fixH1], xmm0
+       movaps  [esp + fiyH1], xmm1
+       movaps  [esp + fizH1], xmm2
+       /* do forces H2 - j water */
+       movaps xmm0, xmm7
+       movaps xmm1, xmm7
+       movaps xmm2, xmm7
+       mulps   xmm0, [esp + dxH2O]
+       mulps   xmm1, [esp + dyH2O]
+       mulps   xmm2, [esp + dzH2O]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       mov     esi, [ebp + faction]
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH2]
+       addps   xmm1, [esp + fiyH2]
+       addps   xmm2, [esp + fizH2]
+       movaps  [esp + fixH2], xmm0
+       movaps  [esp + fiyH2], xmm1
+       movaps  [esp + fizH2], xmm2
+
+       /* update j water forces from local variables */
+       movlps  xmm0, [esi + eax*4]
+       movlps  xmm1, [esi + eax*4 + 12]
+       movhps  xmm1, [esi + eax*4 + 24]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       movaps  xmm6, xmm5
+       movaps  xmm7, xmm5
+       shufps  xmm6, xmm6, 0b10
+       shufps  xmm7, xmm7, 0b11
+       addss   xmm5, [esi + eax*4 + 8]
+       addss   xmm6, [esi + eax*4 + 20]
+       addss   xmm7, [esi + eax*4 + 32]
+       movss   [esi + eax*4 + 8], xmm5
+       movss   [esi + eax*4 + 20], xmm6
+       movss   [esi + eax*4 + 32], xmm7
+       movaps   xmm5, xmm3
+       unpcklps xmm3, xmm4
+       unpckhps xmm5, xmm4
+       addps    xmm0, xmm3
+       addps    xmm1, xmm5
+       movlps  [esi + eax*4], xmm0 
+       movlps  [esi + eax*4 + 12], xmm1 
+       movhps  [esi + eax*4 + 24], xmm1 
+       
+       dec dword ptr [esp + innerk]
+       jz    .i2130_updateouterdata
+       jmp   .i2130_single_loop
+.i2130_updateouterdata:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate  Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO] 
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000      
+ 
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* increment fshift force */ 
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i2130_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i2130_outer
+.i2130_end:
+       emms
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp, 1540
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+       
+
+.globl inl2020_sse
+       .type inl2020_sse,@function
+/*
+ * inl2020_sse: reaction-field Coulomb inner loop for three-site i
+ * molecules (O, H1, H2 stored as nine consecutive floats in pos[])
+ * against single j particles, processed four at a time with SSE.
+ *
+ * For every i-site/j pair the loops evaluate
+ *     vcoul = qq * (rinv + krf*rsq - crf)
+ *     fscal = qq * (rinv - 2*krf*rsq) * rinv*rinv
+ * add vcoul to Vc[gid], add fscal*dr to the i forces and to fshift[],
+ * and subtract fscal*dr from the j forces.
+ *
+ * Arguments are read from the caller's stack relative to ebp (offsets
+ * below).  facel, krf and crf are loaded as single floats, nri is used
+ * as the outer-loop counter, the others are dereferenced as pointers.
+ * The code assumes nri >= 1.  eax, ebx, ecx, edx, esi and edi are saved
+ * here and restored at .i2020_end.
+ */
+inl2020_sse:
+       /* argument offsets relative to ebp */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+       /*
+        * The krf/crf arguments need names of their own: "krf" and "crf"
+        * are redefined further down as stack-local offsets, and a
+        * redefined .equ symbol takes its latest value at the point of
+        * use, so [ebp + krf] would otherwise address ebp+624.
+        */
+.equ           argkrf,         60
+.equ           argcrf,         64
+       /* stack offsets for local variables (relative to esp) */
+       /* esp is rounded down to a 16-byte boundary below so that movaps can be used */
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           iqO,            144
+.equ           iqH,            160
+.equ           dxO,            176
+.equ           dyO,            192
+.equ           dzO,            208
+.equ           dxH1,           224
+.equ           dyH1,           240
+.equ           dzH1,           256
+.equ           dxH2,           272
+.equ           dyH2,           288
+.equ           dzH2,           304
+.equ           qqO,            320
+.equ           qqH,            336
+.equ           vctot,          352
+.equ           fixO,           384
+.equ           fiyO,           400
+.equ           fizO,           416
+.equ           fixH1,          432
+.equ           fiyH1,          448
+.equ           fizH1,          464
+.equ           fixH2,          480
+.equ           fiyH2,          496
+.equ           fizH2,          512
+.equ           fjx,            528
+.equ           fjy,            544
+.equ           fjz,            560
+.equ           half,           576
+.equ           three,          592
+.equ           two,            608
+.equ           krf,            624
+.equ           crf,            640
+.equ           krsqO,          656
+.equ           krsqH1,         672
+.equ           krsqH2,         688
+.equ           is3,            704
+.equ           ii3,            708
+.equ           innerjjnr,      712
+.equ           innerk,         716
+.equ           salign,         720
+       push ebp
+       mov ebp,esp
+       push eax
+       push ebx
+       push ecx
+       push edx
+       push esi
+       push edi
+       sub esp, 724            /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax            /* round esp down to a 16-byte boundary */
+       mov [esp + salign], eax /* keep the adjustment for the epilogue */
+
+       emms
+
+       /* copy the constants and the broadcast krf/crf arguments to the aligned frame */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movups xmm4, [sse_two]
+       movss xmm5, [ebp + argkrf]
+       movss xmm6, [ebp + argcrf]
+
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+       movaps [esp + two], xmm4
+       shufps xmm5, xmm5, 0
+       shufps xmm6, xmm6, 0
+       movaps [esp + krf], xmm5
+       movaps [esp + crf], xmm6
+
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       /*
+        * iqO = facel*charge[ii], iqH = facel*charge[ii+1], each broadcast
+        * to four lanes.  They are computed once, from the first i entry
+        * only, and reused for every outer iteration.
+        */
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]
+       movss xmm4, [edx + ebx*4 + 4]
+       movss xmm5, [ebp + facel]
+       mulss  xmm3, xmm5
+       mulss  xmm4, xmm5
+
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       movaps [esp + iqO], xmm3
+       movaps [esp + iqH], xmm4
+                       
+.i2020_outer:
+       /*
+        * One pass per i molecule: fetch its shift vector, build the
+        * shifted O/H1/H2 coordinates (each broadcast to all four lanes),
+        * clear the per-molecule accumulators and set up the j-list range
+        * taken from two consecutive jindex[] entries.
+        * Memory/immediate adds carry an explicit dword ptr so the operand
+        * size is not left to the assembler's default.
+        */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift], 4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]       /* xmm0-xmm2 = shift vector x,y,z */
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+
+       /* O position = shift + pos[ii3 .. ii3+2], broadcast to four lanes */
+       addss xmm3, [eax + ebx*4]
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       /*
+        * Copy the shift (low lane only) back into xmm3-xmm5, then
+        * H1 = shift + pos[ii3+3 .. ii3+5] in xmm0-xmm2 and
+        * H2 = shift + pos[ii3+6 .. ii3+8] in xmm3-xmm5.
+        */
+       movss xmm3, xmm0
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]
+       addss xmm3, [eax + ebx*4 + 24]
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+
+       /* clear vctot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* innerk = count - 4; mov leaves the flags of sub intact */
+       jge   .i2020_unroll_loop         /* at least four j particles: unrolled loop */
+       jmp   .i2020_odd_inner
+.i2020_unroll_loop:
+       /*
+        * Quad-unrolled inner loop: four j particles per pass.
+        * eax, ebx, ecx, edx hold the four j indices (3*index after the
+        * lea block) and correspond to SSE lanes 0, 1, 2, 3.
+        * Memory/immediate add and sub carry an explicit dword ptr so the
+        * operand size is not left to the assembler's default.
+        */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]
+       mov   ebx, [edx + 4]
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+
+       add   dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       shufps xmm3, xmm6, 0
+       shufps xmm4, xmm7, 0
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+       movaps xmm4, xmm3            /* and in xmm4 */
+       mulps  xmm3, [esp + iqO]
+       mulps  xmm4, [esp + iqH]
+
+       movaps  [esp + qqO], xmm3
+       movaps  [esp + qqH], xmm4
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]
+
+       /* move four coordinates to xmm0-xmm2 */
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = four jz */
+
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = four jx */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = four jy */
+
+       /* move ixO-izO to xmm4-xmm6 */
+       movaps xmm4, [esp + ixO]
+       movaps xmm5, [esp + iyO]
+       movaps xmm6, [esp + izO]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxO], xmm4
+       movaps [esp + dyO], xmm5
+       movaps [esp + dzO], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       movaps xmm7, xmm4
+       /* rsqO in xmm7 */
+
+       /* move ixH1-izH1 to xmm4-xmm6 */
+       movaps xmm4, [esp + ixH1]
+       movaps xmm5, [esp + iyH1]
+       movaps xmm6, [esp + izH1]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxH1], xmm4
+       movaps [esp + dyH1], xmm5
+       movaps [esp + dzH1], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm6, xmm5
+       addps xmm6, xmm4
+       /* rsqH1 in xmm6 */
+
+       /* move ixH2-izH2 to xmm3-xmm5 */
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+
+       /* calc dr */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+
+       /* store dr */
+       movaps [esp + dxH2], xmm3
+       movaps [esp + dyH2], xmm4
+       movaps [esp + dzH2], xmm5
+       /* square it */
+       mulps xmm3,xmm3
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       addps xmm5, xmm4
+       addps xmm5, xmm3
+       /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+       /* krsq = krf*rsq for the three sites */
+       movaps xmm0, xmm5
+       movaps xmm1, xmm6
+       movaps xmm2, xmm7
+
+       mulps  xmm0, [esp + krf]
+       mulps  xmm1, [esp + krf]
+       mulps  xmm2, [esp + krf]
+
+       movaps [esp + krsqH2], xmm0
+       movaps [esp + krsqH1], xmm1
+       movaps [esp + krsqO], xmm2
+
+       /*
+        * Inverse square roots: rsqrtps seed lu followed by one
+        * refinement step, rinv = half * lu * (three - rsq*lu*lu).
+        */
+       /* start with rsqO - seed in xmm2 */
+       rsqrtps xmm2, xmm7
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm7      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm7, xmm4      /* rinvO in xmm7 */
+       /* rsqH1 - seed in xmm2 */
+       rsqrtps xmm2, xmm6
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm6      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm6, xmm4      /* rinvH1 in xmm6 */
+       /* rsqH2 - seed in xmm2 */
+       rsqrtps xmm2, xmm5
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm5      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  xmm5, xmm4      /* rinvH2 in xmm5 */
+
+       /* do O interactions */
+       movaps  xmm4, xmm7
+       mulps   xmm4, xmm4      /* xmm7=rinv, xmm4=rinvsq */
+
+       movaps xmm0, xmm7
+       movaps xmm1, [esp + krsqO]
+       addps  xmm0, xmm1
+       subps  xmm0, [esp + crf] /* xmm0=rinv+krsq-crf */
+       mulps  xmm1, [esp + two]
+       subps  xmm7, xmm1       /* xmm7=rinv-2*krsq */
+       mulps  xmm0, [esp + qqO] /* vcoul */
+       mulps  xmm7, [esp + qqO]
+
+       mulps  xmm4, xmm7       /* total fsO in xmm4 */
+
+       addps  xmm0, [esp + vctot]
+       movaps [esp + vctot], xmm0
+
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update O forces */
+       movaps xmm3, [esp + fixO]
+       movaps xmm4, [esp + fiyO]
+       movaps xmm7, [esp + fizO]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixO], xmm3
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm7
+       /* update j forces with water O */
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* H1 interactions */
+       movaps  xmm4, xmm6
+       mulps   xmm4, xmm4      /* xmm6=rinv, xmm4=rinvsq */
+       movaps  xmm7, xmm6
+       movaps  xmm0, [esp + krsqH1]
+       addps   xmm6, xmm0      /* xmm6=rinv+krsq */
+       subps   xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+       mulps   xmm0, [esp + two]
+       subps   xmm7, xmm0      /* xmm7=rinv-2*krsq */
+       mulps   xmm6, [esp + qqH] /* vcoul */
+       mulps   xmm7, [esp + qqH]
+       mulps  xmm4, xmm7               /* total fsH1 in xmm4 */
+
+       addps  xmm6, [esp + vctot]
+
+       movaps xmm0, [esp + dxH1]
+       movaps xmm1, [esp + dyH1]
+       movaps xmm2, [esp + dzH1]
+       movaps [esp + vctot], xmm6
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update H1 forces */
+       movaps xmm3, [esp + fixH1]
+       movaps xmm4, [esp + fiyH1]
+       movaps xmm7, [esp + fizH1]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH1], xmm3
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm7
+       /* update j forces with water H1 */
+       addps  xmm0, [esp + fjx]
+       addps  xmm1, [esp + fjy]
+       addps  xmm2, [esp + fjz]
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* H2 interactions */
+       movaps  xmm4, xmm5
+       mulps   xmm4, xmm4      /* xmm5=rinv, xmm4=rinvsq */
+       movaps  xmm7, xmm5
+       movaps  xmm0, [esp + krsqH2]
+       addps   xmm5, xmm0      /* xmm5=rinv+krsq */
+       subps   xmm5, [esp + crf] /* xmm5=rinv+krsq-crf */
+       mulps   xmm0, [esp + two]
+       subps   xmm7, xmm0      /* xmm7=rinv-2*krsq */
+       mulps   xmm5, [esp + qqH] /* vcoul */
+       mulps   xmm7, [esp + qqH]
+       mulps  xmm4, xmm7               /* total fsH2 in xmm4 */
+
+       addps  xmm5, [esp + vctot]
+
+       movaps xmm0, [esp + dxH2]
+       movaps xmm1, [esp + dyH2]
+       movaps xmm2, [esp + dzH2]
+       movaps [esp + vctot], xmm5
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update H2 forces */
+       movaps xmm3, [esp + fixH2]
+       movaps xmm4, [esp + fiyH2]
+       movaps xmm7, [esp + fizH2]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH2], xmm3
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm7
+
+       mov edi, [ebp +faction]
+       /* update j forces: xmm0-xmm2 = summed O+H1+H2 contribution per j */
+       addps xmm0, [esp + fjx]
+       addps xmm1, [esp + fjy]
+       addps xmm2, [esp + fjz]
+
+       /* gather the x,y force pairs of the four j particles */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm7, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm7, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm7, 0b10001000
+       shufps xmm4, xmm7, 0b11011101
+       /* xmm3 has fjx, xmm4 has fjy */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       /* unpack them back for storing */
+       movaps xmm7, xmm3
+       unpcklps xmm7, xmm4
+       unpckhps xmm3, xmm4
+       movlps [edi + eax*4], xmm7
+       movlps [edi + ecx*4], xmm3
+       movhps [edi + ebx*4], xmm7
+       movhps [edi + edx*4], xmm3
+       /* finally z forces: rotate lanes 1, 2, 3 of xmm2 into lane 0 in turn */
+       movss  xmm0, [edi + eax*4 + 8]
+       movss  xmm1, [edi + ebx*4 + 8]
+       movss  xmm3, [edi + ecx*4 + 8]
+       movss  xmm4, [edi + edx*4 + 8]
+       subss  xmm0, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm1, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm3, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm4, xmm2
+       movss  [edi + eax*4 + 8], xmm0
+       movss  [edi + ebx*4 + 8], xmm1
+       movss  [edi + ecx*4 + 8], xmm3
+       movss  [edi + edx*4 + 8], xmm4
+
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk], 4
+       jl    .i2020_odd_inner
+       jmp   .i2020_unroll_loop
+.i2020_odd_inner:
+       /*
+        * Reached with innerk negative (fewer than four j particles left).
+        * Adding 4 back gives the number of leftover j particles; zero
+        * means the j list is exhausted.  The explicit dword ptr fixes the
+        * operand size of the memory/immediate add.
+        */
+       add   dword ptr [esp + innerk], 4
+       jnz   .i2020_odd_loop
+       jmp   .i2020_updateouterdata
+.i2020_odd_loop:
+       /*
+        * Leftover loop: one j particle per pass.  The three i sites are
+        * packed into a single register as lanes (O, unused, H1, H2).
+        * The unused lane gets a zero charge product, so it contributes
+        * nothing to the energy and is left out of the force sums.
+        */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]
+       add   dword ptr [esp + innerjjnr], 4
+
+       xorps xmm4, xmm4
+       movss xmm4, [esp + iqO]
+       mov esi, [ebp + charge]
+       movhps xmm4, [esp + iqH]        /* xmm4 = (iqO, 0, iqH, iqH) */
+       movss xmm3, [esi + eax*4]       /* charge in xmm3 */
+       shufps xmm3, xmm3, 0
+       mulps xmm3, xmm4
+       movaps [esp + qqO], xmm3        /* use oxygen qq for storage */
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]
+
+       /* move j coords to xmm0-xmm2 */
+       movss xmm0, [esi + eax*4]
+       movss xmm1, [esi + eax*4 + 4]
+       movss xmm2, [esi + eax*4 + 8]
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       /* build (O, 0, H1, H2) coordinate vectors in xmm3-xmm5 */
+       movss xmm3, [esp + ixO]
+       movss xmm4, [esp + iyO]
+       movss xmm5, [esp + izO]
+
+       movlps xmm6, [esp + ixH1]
+       movlps xmm7, [esp + ixH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm3, xmm6
+       movlps xmm6, [esp + iyH1]
+       movlps xmm7, [esp + iyH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm4, xmm6
+       movlps xmm6, [esp + izH1]
+       movlps xmm7, [esp + izH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm5, xmm6
+
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+
+       movaps [esp + dxO], xmm3
+       movaps [esp + dyO], xmm4
+       movaps [esp + dzO], xmm5
+
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       /* rsq in xmm4 */
+
+       movaps xmm0, xmm4
+       mulps xmm0, [esp + krf]
+       movaps [esp + krsqO], xmm0
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm4       /* xmm4=rinvsq */
+
+       movaps xmm1, xmm0       /* xmm1=rinv */
+       movaps xmm3, [esp + krsqO]
+       addps  xmm0, xmm3       /* xmm0=rinv+krsq */
+       subps  xmm0, [esp + crf] /* xmm0=rinv+krsq-crf */
+       mulps  xmm3, [esp + two]
+       subps  xmm1, xmm3       /* xmm1=rinv-2*krsq */
+       mulps  xmm0, [esp + qqO]        /* xmm0=vcoul */
+       mulps  xmm1, [esp + qqO]        /* xmm1=coul part of fs */
+
+
+       mulps  xmm4, xmm1       /* xmm4=total fscal */
+       addps  xmm0, [esp + vctot]
+       movaps [esp + vctot], xmm0
+
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       movss  xmm3, [esp + fixO]
+       movss  xmm4, [esp + fiyO]
+       movss  xmm5, [esp + fizO]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esp + fixO], xmm3
+       movss  [esp + fiyO], xmm4
+       movss  [esp + fizO], xmm5       /* updated the O force now do the H's */
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       shufps xmm3, xmm3, 0b11100110   /* bring lane 2 (H1) to lane 0 */
+       shufps xmm4, xmm4, 0b11100110
+       shufps xmm5, xmm5, 0b11100110
+       addss  xmm3, [esp + fixH1]
+       addss  xmm4, [esp + fiyH1]
+       addss  xmm5, [esp + fizH1]
+       movss  [esp + fixH1], xmm3
+       movss  [esp + fiyH1], xmm4
+       movss  [esp + fizH1], xmm5      /* updated the H1 force */
+
+       mov edi, [ebp + faction]
+       shufps xmm3, xmm3, 0b11100111   /* bring lane 3 (H2) to lane 0 */
+       shufps xmm4, xmm4, 0b11100111
+       shufps xmm5, xmm5, 0b11100111
+       addss  xmm3, [esp + fixH2]
+       addss  xmm4, [esp + fiyH2]
+       addss  xmm5, [esp + fizH2]
+       movss  [esp + fixH2], xmm3
+       movss  [esp + fiyH2], xmm4
+       movss  [esp + fizH2], xmm5      /* updated the H2 force */
+
+       /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+       xorps  xmm5, xmm5
+       movaps xmm3, xmm0
+       movlps xmm6, [edi + eax*4]
+       movss  xmm7, [edi + eax*4 + 8]
+       unpcklps xmm3, xmm1
+       movlhps  xmm3, xmm5     /* keep only the O lane pair, zero the unused lane */
+       unpckhps xmm0, xmm1
+       addps    xmm0, xmm3
+       movhlps  xmm3, xmm0
+       addps    xmm0, xmm3     /* x,y sum in xmm0 */
+
+       movhlps  xmm1, xmm2
+       addss    xmm2, xmm1
+       shufps   xmm1, xmm1, 1
+       addss    xmm2, xmm1    /* z sum in xmm2 */
+       subps    xmm6, xmm0
+       subss    xmm7, xmm2
+
+       movlps [edi + eax*4],     xmm6
+       movss  [edi + eax*4 + 8], xmm7
+
+       dec dword ptr [esp + innerk]
+       jz    .i2020_updateouterdata
+       jmp   .i2020_odd_loop
+.i2020_updateouterdata:
+       /*
+        * End of the j list for this i molecule: reduce the four-lane
+        * force accumulators of O, H1 and H2 to scalars, add them to
+        * faction[] at ii3 and to fshift[] at is3, add the reduced vctot
+        * to Vc[gid], then either loop for the next i molecule or finish.
+        * The memory/immediate add on gid carries an explicit dword ptr.
+        */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate  Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO]
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0-1 of xmm0-xmm2 now hold pairwise sums */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift: xmm6 lanes 0,1 = x,y; xmm7 lane 0 = z */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000
+
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0-1 of xmm0-xmm2 now hold pairwise sums */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0-1 of xmm0-xmm2 now hold pairwise sums */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000
+       addps   xmm6, xmm0
+
+       /* increment fshift force (only the low x,y pair and z are stored) */
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       /* get group index for this i particle */
+       mov   edx, [ebp + gid]
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid], 4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* finish if last: nri on the caller's stack doubles as the loop counter */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i2020_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i2020_outer
+.i2020_end:
+       /* Epilogue: undo the prologue in reverse order. */
+       emms
+       add  esp, [esp + salign]        /* drop the 16-byte alignment padding */
+       add  esp, 724                   /* release the local frame */
+       pop  edi                        /* restore the registers saved on entry */
+       pop  esi
+       pop  edx
+       pop  ecx
+       pop  ebx
+       pop  eax
+       leave
+       ret
+
+
+       
+.globl inl2030_sse
+       .type inl2030_sse,@function
+inl2030_sse:   
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+       /* The krf/crf ARGUMENT offsets carry an i2030_ prefix: the plain names */
+       /* krf and crf are .equ'ed again further down as 16-byte stack slots.   */
+       /* With one shared name the later value (1408/1424) was the one in      */
+       /* effect at the argument loads, which then read [ebp + 1408] and       */
+       /* [ebp + 1424] instead of the caller-supplied constants.               */
+.equ           i2030_krf,      60
+.equ           i2030_crf,      64
+       /* stack offsets for local variables */
+       /* the prologue rounds esp down to a 16-byte boundary, so each 16-byte */
+       /* slot below can be accessed with movaps */
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           jxO,            144
+.equ           jyO,            160
+.equ           jzO,            176
+.equ           jxH1,           192
+.equ           jyH1,           208
+.equ           jzH1,           224
+.equ           jxH2,           240
+.equ           jyH2,           256
+.equ           jzH2,           272
+.equ           dxOO,           288
+.equ           dyOO,           304
+.equ           dzOO,           320
+.equ           dxOH1,          336
+.equ           dyOH1,          352
+.equ           dzOH1,          368
+.equ           dxOH2,          384
+.equ           dyOH2,          400
+.equ           dzOH2,          416
+.equ           dxH1O,          432
+.equ           dyH1O,          448
+.equ           dzH1O,          464
+.equ           dxH1H1,         480
+.equ           dyH1H1,         496
+.equ           dzH1H1,         512
+.equ           dxH1H2,         528
+.equ           dyH1H2,         544
+.equ           dzH1H2,         560
+.equ           dxH2O,          576
+.equ           dyH2O,          592
+.equ           dzH2O,          608
+.equ           dxH2H1,         624
+.equ           dyH2H1,         640
+.equ           dzH2H1,         656
+.equ           dxH2H2,         672
+.equ           dyH2H2,         688
+.equ           dzH2H2,         704
+.equ           qqOO,           720
+.equ           qqOH,           736
+.equ           qqHH,           752
+.equ           vctot,          768
+.equ           fixO,           784
+.equ           fiyO,           800
+.equ           fizO,           816
+.equ           fixH1,          832
+.equ           fiyH1,          848
+.equ           fizH1,          864
+.equ           fixH2,          880
+.equ           fiyH2,          896
+.equ           fizH2,          912
+.equ           fjxO,           928
+.equ           fjyO,           944
+.equ           fjzO,           960
+.equ           fjxH1,          976
+.equ           fjyH1,          992
+.equ           fjzH1,         1008
+.equ           fjxH2,         1024
+.equ           fjyH2,         1040
+.equ           fjzH2,         1056
+.equ           half,          1072
+.equ           three,         1088
+.equ           rsqOO,         1104
+.equ           rsqOH1,        1120
+.equ           rsqOH2,        1136
+.equ           rsqH1O,        1152
+.equ           rsqH1H1,       1168
+.equ           rsqH1H2,       1184
+.equ           rsqH2O,        1200
+.equ           rsqH2H1,       1216
+.equ           rsqH2H2,       1232
+.equ           rinvOO,        1248
+.equ           rinvOH1,       1264
+.equ           rinvOH2,       1280
+.equ           rinvH1O,       1296
+.equ           rinvH1H1,      1312
+.equ           rinvH1H2,      1328
+.equ           rinvH2O,       1344
+.equ           rinvH2H1,      1360
+.equ           rinvH2H2,      1376
+.equ           two,           1392
+       /* stack copies of krf/crf, broadcast to all four lanes by the prologue */
+.equ           krf,           1408
+.equ           crf,           1424
+       /* 4-byte integer slots */
+.equ           is3,           1440
+.equ           ii3,           1444
+.equ           innerjjnr,     1448
+.equ           innerk,        1452
+.equ           salign,        1456
+       /* prologue: set up the frame pointer and save the integer registers */
+       push ebp
+       mov ebp,esp             /* arguments are addressed as [ebp + offset] from here on */
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 1460           /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf           /* eax = esp mod 16 */
+       sub esp, eax            /* round esp down to a 16-byte boundary */
+       mov [esp + salign], eax /* remember the adjustment in the salign slot */
+
+       emms
+
+       /* load constants: three vectors from memory plus the two scalar arguments */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_three]
+       movups xmm4, [sse_two]
+       movss xmm5, [ebp + i2030_krf]   /* krf argument (not the krf stack slot) */
+       movss xmm6, [ebp + i2030_crf]   /* crf argument (not the crf stack slot) */
+
+       movaps [esp + half],  xmm0
+       movaps [esp + three], xmm1
+       movaps [esp + two], xmm4
+       shufps xmm5, xmm5, 0            /* broadcast krf to all four lanes */
+       shufps xmm6, xmm6, 0            /* broadcast crf to all four lanes */
+       movaps [esp + krf], xmm5
+       movaps [esp + crf], xmm6
+
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       /* charge products, from the first two charges of the first i entry: */
+       /* qqOO = facel*q0*q0, qqOH = facel*q0*q1, qqHH = facel*q1*q1        */
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       /* xmm3 = q0 = charge[ii] */
+       movss xmm4, xmm3                /* xmm4 = q0 */
+       movss xmm5, [edx + ebx*4 + 4]   /* xmm5 = q1 = charge[ii+1] */
+       movss xmm6, [ebp + facel]
+       mulss  xmm3, xmm3               /* q0*q0 */
+       mulss  xmm4, xmm5               /* q0*q1 */
+       mulss  xmm5, xmm5               /* q1*q1 */
+       mulss  xmm3, xmm6
+       mulss  xmm4, xmm6
+       mulss  xmm5, xmm6
+       shufps xmm3, xmm3, 0            /* broadcast each product to all four lanes */
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + qqOO], xmm3
+       movaps [esp + qqOH], xmm4
+       movaps [esp + qqHH], xmm5
+
+.i2030_outer:          /* outer-loop body: set up one i water (shifted coordinates, zeroed accumulators, j-list range); the branch back here is outside this excerpt */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]       /* xmm0 = shiftvec[is3]   (shift x) */
+       movss xmm1, [eax + ebx*4 + 4]   /* xmm1 = shiftvec[is3+1] (shift y) */
+       movss xmm2, [eax + ebx*4 + 8]   /* xmm2 = shiftvec[is3+2] (shift z) */
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx  
+       
+       movaps xmm3, xmm0       /* copies of the shift vector for the first atom (O) */
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       addss xmm3, [eax + ebx*4]       /* shift + pos[ii3..ii3+2] */
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]           
+       shufps xmm3, xmm3, 0    /* broadcast the scalar to all four lanes */
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       movss xmm3, xmm0        /* low lane only: second copy of the shift, used for H2 */
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]  /* shift + pos[ii3+3..ii3+5] (H1) */
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]          
+       addss xmm3, [eax + ebx*4 + 24]  /* shift + pos[ii3+6..ii3+8] (H2) */
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]          
+
+       shufps xmm0, xmm0, 0    /* broadcast each scalar to all four lanes */
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+
+       /* clear vctot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2                     /* byte offset of jjnr[nj0] (4-byte entries) */
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4                    /* sets the flags tested by jge below (mov leaves them intact) */
+       mov   [esp + innerk], edx        /* number of innerloop atoms, minus 4 */
+       jge   .i2030_unroll_loop         /* at least 4 j entries: take the unrolled loop */
+       jmp   .i2030_single_check        /* fewer than 4: go straight to the remainder check */
+.i2030_unroll_loop:    /* one pass handles four j waters (a,b,c,d), one per SSE lane; this part gathers their coordinates into jx/jy/jz slots */
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4] 
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+       
+       /* move j coordinates to local temp variables */
+       movlps xmm2, [esi + eax*4]      /* x,y pairs of water a into the low halves */
+       movlps xmm3, [esi + eax*4 + 12]
+       movlps xmm4, [esi + eax*4 + 24]
+
+       movlps xmm5, [esi + ebx*4]      /* x,y pairs of water b into the low halves */
+       movlps xmm6, [esi + ebx*4 + 12]
+       movlps xmm7, [esi + ebx*4 + 24]
+
+       movhps xmm2, [esi + ecx*4]      /* x,y pairs of water c into the high halves */
+       movhps xmm3, [esi + ecx*4 + 12]
+       movhps xmm4, [esi + ecx*4 + 24]
+
+       movhps xmm5, [esi + edx*4]      /* x,y pairs of water d into the high halves */
+       movhps xmm6, [esi + edx*4 + 12]
+       movhps xmm7, [esi + edx*4 + 24]
+
+       /* current state: */    
+       /* xmm2= jxOa  jyOa  jxOc  jyOc */
+       /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+       /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+       /* xmm5= jxOb  jyOb  jxOd  jyOd */
+       /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+       /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+       
+       movaps xmm0, xmm2
+       movaps xmm1, xmm3
+       unpcklps xmm0, xmm5     /* xmm0= jxOa  jxOb  jyOa  jyOb */
+       unpcklps xmm1, xmm6     /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+       unpckhps xmm2, xmm5     /* xmm2= jxOc  jxOd  jyOc  jyOd */
+       unpckhps xmm3, xmm6     /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+       movaps xmm5, xmm4
+       movaps   xmm6, xmm0     /* keep jyOa jyOb (high half) before xmm0 is overwritten */
+       unpcklps xmm4, xmm7     /* xmm4= jxH2a jxH2b jyH2a jyH2b */             
+       unpckhps xmm5, xmm7     /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+       movaps   xmm7, xmm1     /* keep jyH1a jyH1b (high half) before xmm1 is overwritten */
+       movlhps  xmm0, xmm2     /* xmm0= jxOa  jxOb  jxOc  jxOd */
+       movaps [esp + jxO], xmm0
+       movhlps  xmm2, xmm6     /* xmm2= jyOa  jyOb  jyOc  jyOd */
+       movaps [esp + jyO], xmm2
+       movlhps  xmm1, xmm3     /* xmm1= jxH1a jxH1b jxH1c jxH1d */
+       movaps [esp + jxH1], xmm1
+       movhlps  xmm3, xmm7     /* xmm3= jyH1a jyH1b jyH1c jyH1d */
+       movaps   xmm6, xmm4     /* keep jyH2a jyH2b (high half) before xmm4 is overwritten */
+       movaps [esp + jyH1], xmm3
+       movlhps  xmm4, xmm5     /* xmm4= jxH2a jxH2b jxH2c jxH2d */
+       movaps [esp + jxH2], xmm4
+       movhlps  xmm5, xmm6     /* xmm5= jyH2a jyH2b jyH2c jyH2d */
+       movaps [esp + jyH2], xmm5
+
+       movss  xmm0, [esi + eax*4 + 8]  /* z of water a in lane 0 (movss from memory zeroes lanes 1-3) */
+       movss  xmm1, [esi + eax*4 + 20]
+       movss  xmm2, [esi + eax*4 + 32]
+
+       movss  xmm3, [esi + ecx*4 + 8]  /* z of water c in lane 0 */
+       movss  xmm4, [esi + ecx*4 + 20]
+       movss  xmm5, [esi + ecx*4 + 32]
+
+       movhps xmm0, [esi + ebx*4 + 4]  /* y,z of water b in lanes 2,3; only the z in lane 3 is used */
+       movhps xmm1, [esi + ebx*4 + 16]
+       movhps xmm2, [esi + ebx*4 + 28]
+       
+       movhps xmm3, [esi + edx*4 + 4]  /* y,z of water d in lanes 2,3; only the z in lane 3 is used */
+       movhps xmm4, [esi + edx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 28]
+       
+       shufps xmm0, xmm3, 0b11001100   /* take lanes 0,3 of each source: xmm0= jzOa jzOb jzOc jzOd */
+       shufps xmm1, xmm4, 0b11001100   /* xmm1= jzH1a jzH1b jzH1c jzH1d */
+       shufps xmm2, xmm5, 0b11001100   /* xmm2= jzH2a jzH2b jzH2c jzH2d */
+       movaps [esp + jzO],  xmm0
+       movaps [esp + jzH1],  xmm1
+       movaps [esp + jzH2],  xmm2
+
+       movaps xmm0, [esp + ixO]        /* distance phase: for each of the 9 atom pairs store d = i - j per component and rsq = dx*dx+dy*dy+dz*dz; this group does O-O and O-H1 */
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixO]
+       movaps xmm4, [esp + iyO]
+       movaps xmm5, [esp + izO]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxOH1], xmm3
+       movaps [esp + dyOH1], xmm4
+       movaps [esp + dzOH1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOO], xmm0
+       movaps [esp + rsqOH1], xmm3
+
+       movaps xmm0, [esp + ixO]        /* O-H2 (xmm0-2) and H1-O (xmm3-5) */
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       subps  xmm3, [esp + jxO]
+       subps  xmm4, [esp + jyO]
+       subps  xmm5, [esp + jzO]
+       movaps [esp + dxOH2], xmm0
+       movaps [esp + dyOH2], xmm1
+       movaps [esp + dzOH2], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1O], xmm3
+       movaps [esp + dyH1O], xmm4
+       movaps [esp + dzH1O], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOH2], xmm0
+       movaps [esp + rsqH1O], xmm3
+
+       movaps xmm0, [esp + ixH1]       /* H1-H1 (xmm0-2) and H1-H2 (xmm3-5) */
+       movaps xmm1, [esp + iyH1]
+       movaps xmm2, [esp + izH1]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH1]
+       subps  xmm1, [esp + jyH1]
+       subps  xmm2, [esp + jzH1]
+       subps  xmm3, [esp + jxH2]
+       subps  xmm4, [esp + jyH2]
+       subps  xmm5, [esp + jzH2]
+       movaps [esp + dxH1H1], xmm0
+       movaps [esp + dyH1H1], xmm1
+       movaps [esp + dzH1H1], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1H2], xmm3
+       movaps [esp + dyH1H2], xmm4
+       movaps [esp + dzH1H2], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqH1H1], xmm0
+       movaps [esp + rsqH1H2], xmm3
+
+       movaps xmm0, [esp + ixH2]       /* H2-O (xmm0-2) and H2-H1 (xmm3-5) */
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxH2O], xmm0
+       movaps [esp + dyH2O], xmm1
+       movaps [esp + dzH2O], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH2H1], xmm3
+       movaps [esp + dyH2H1], xmm4
+       movaps [esp + dzH2H1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm4, xmm3       /* unlike the groups above the sum is built in xmm4; it stays there as rsqH2H1 for the rsqrtps that follows */
+       addps  xmm4, xmm5
+       movaps [esp + rsqH2O], xmm0
+       movaps [esp + rsqH2H1], xmm4
+
+       movaps xmm0, [esp + ixH2]       /* H2-H2 */
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       movaps [esp + dxH2H2], xmm0
+       movaps [esp + dyH2H2], xmm1
+       movaps [esp + dzH2H2], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2        /* xmm0 = rsqH2H2, left in the register for the rsqrtps that follows */
+       movaps [esp + rsqH2H2], xmm0
+               
+       /* inverse square roots for all 9 pairs: rsqrtps estimate lu refined once as rinv = half*lu*(three - rsq*lu*lu); on entry xmm0=rsqH2H2, xmm4=rsqH2H1 */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1      /* xmm2 = lu (H2H2) */
+       movaps  xmm6, xmm5      /* xmm6 = lu (H2H1) */
+       mulps   xmm1, xmm1      /* lu*lu */
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0      /* rsq*lu*lu */
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1      /* three - rsq*lu*lu */
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2      /* ... * lu */
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvH2H2 */
+       mulps   xmm7, [esp + half] /* rinvH2H1 */
+       movaps  [esp + rinvH2H2], xmm3
+       movaps  [esp + rinvH2H1], xmm7
+       
+       rsqrtps xmm1, [esp + rsqOO]     /* same refinement for OO (xmm1-3) and OH1 (xmm5-7) */
+       rsqrtps xmm5, [esp + rsqOH1]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOO]
+       mulps   xmm5, [esp + rsqOH1]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvOO */
+       mulps   xmm7, [esp + half] /* rinvOH1 */
+       movaps  [esp + rinvOO], xmm3
+       movaps  [esp + rinvOH1], xmm7
+       
+       rsqrtps xmm1, [esp + rsqOH2]    /* OH2 (xmm1-3) and H1O (xmm5-7) */
+       rsqrtps xmm5, [esp + rsqH1O]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOH2]
+       mulps   xmm5, [esp + rsqH1O]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvOH2 */
+       mulps   xmm7, [esp + half] /* rinvH1O */
+       movaps  [esp + rinvOH2], xmm3
+       movaps  [esp + rinvH1O], xmm7
+       
+       rsqrtps xmm1, [esp + rsqH1H1]   /* H1H1 (xmm1-3) and H1H2 (xmm5-7) */
+       rsqrtps xmm5, [esp + rsqH1H2]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqH1H1]
+       mulps   xmm5, [esp + rsqH1H2]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvH1H1 */
+       mulps   xmm7, [esp + half] /* rinvH1H2 */
+       movaps  [esp + rinvH1H1], xmm3
+       movaps  [esp + rinvH1H2], xmm7
+       
+       rsqrtps xmm1, [esp + rsqH2O]    /* ninth pair (H2O), done on its own */
+       movaps  xmm2, xmm1
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, [esp + rsqH2O]
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2
+       mulps   xmm3, [esp + half] /* rinvH2O */
+       movaps  [esp + rinvH2O], xmm3
+
+       /* O-O interaction (first of nine): vcoul = qqOO*(rinv + krf*rsq - crf), fscal = qqOO*(rinv - 2*krf*rsq)*rinv*rinv */
+       movaps xmm0, [esp + rinvOO]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]
+       mulps  xmm0, xmm0       /* xmm0=rinvsq */
+
+       mulps  xmm5, [esp + rsqOO] /* xmm5=krsq */
+       movaps xmm6, xmm5
+       addps  xmm6, xmm7       /* xmm6=rinv+krsq */
+       subps  xmm6, [esp + crf]
+       mulps  xmm6, [esp + qqOO] /* xmm6=vcoul=qq*(rinv+krsq-crf) */
+       mulps xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqOO] /* xmm7 = coul part of fscal */
+       
+       addps  xmm6, [esp + vctot] /* local vctot summation variable; xmm6 keeps accumulating through the following pair interactions */
+       mulps  xmm0, xmm7       /* xmm0 = fscal (fsOO) */
+       
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       xorps xmm3, xmm3        /* zero xmm3-5: first contribution to the j O force accumulators */
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOO]        /* force components = fscal * d */
+       mulps xmm1, [esp + dyOO]
+       mulps xmm2, [esp + dzOO]
+       subps xmm3, xmm0        /* fj = -force */
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]        /* fi += force */
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H1 interaction: i atom O with j atom H1, charge product qqOH; xmm6 carries the running vctot on entry and exit */
+       movaps xmm0, [esp + rinvOH1]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is overwritten below before being read - looks redundant */
+       mulps  xmm5, [esp + rsqOH1] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf]
+       mulps  xmm0, xmm0       /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsOH1  */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       
+       xorps xmm3, xmm3        /* zero xmm3-5: first contribution to the j H1 force accumulators */
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH1]       /* force components = fscal * d */
+       mulps xmm1, [esp + dyOH1]
+       mulps xmm2, [esp + dzOH1]
+       subps xmm3, xmm0        /* fj = -force */
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]        /* fi (O) += force */
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H2 interaction: i atom O with j atom H2, charge product qqOH; xmm6 carries the running vctot on entry and exit */
+       movaps xmm0, [esp + rinvOH2]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is overwritten below before being read - looks redundant */
+       mulps  xmm5, [esp + rsqOH2] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf]
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsOH2 */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       xorps xmm3, xmm3        /* zero xmm3-5: first contribution to the j H2 force accumulators */
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH2]       /* force components = fscal * d */
+       mulps xmm1, [esp + dyOH2]
+       mulps xmm2, [esp + dzOH2]
+       subps xmm3, xmm0        /* fj = -force */
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]        /* fi (O) += force */
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* H1-O interaction: i atom H1 with j atom O, charge product qqOH; xmm6 carries the running vctot on entry and exit */
+       movaps xmm0, [esp + rinvH1O]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is overwritten below before being read - looks redundant */
+       mulps  xmm5, [esp + rsqH1O] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf]
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH1O */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       movaps xmm3, [esp + fjxO]       /* continue the j O force accumulators */
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH1O]       /* force components = fscal * d */
+       mulps xmm1, [esp + dyH1O]
+       mulps xmm2, [esp + dzH1O]
+       subps xmm3, xmm0        /* fj -= force */
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]       /* fi (H1) += force */
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H1 interaction: i atom H1 with j atom H1, charge product qqHH; xmm6 carries the running vctot on entry and exit */
+       movaps xmm0, [esp + rinvH1H1]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is overwritten below before being read - looks redundant */
+       mulps  xmm5, [esp + rsqH1H1] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf]
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH1H1 */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       movaps xmm3, [esp + fjxH1]      /* continue the j H1 force accumulators */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH1H1]      /* force components = fscal * d */
+       mulps xmm1, [esp + dyH1H1]
+       mulps xmm2, [esp + dzH1H1]
+       subps xmm3, xmm0        /* fj -= force */
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]       /* fi (H1) += force */
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H2 interaction: i atom H1 with j atom H2, charge product qqHH; xmm6 carries the running vctot on entry and exit */
+       movaps xmm0, [esp + rinvH1H2]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is overwritten below before being read - looks redundant */
+       mulps  xmm5, [esp + rsqH1H2] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf]
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH1H2 */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+       
+       movaps xmm3, [esp + fjxH2]      /* continue the j H2 force accumulators */
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH1H2]      /* force components = fscal * d */
+       mulps xmm1, [esp + dyH1H2]
+       mulps xmm2, [esp + dzH1H2]
+       subps xmm3, xmm0        /* fj -= force */
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]       /* fi (H1) += force */
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H2-O interaction: i atom H2 with j atom O, charge product qqOH; xmm6 carries the running vctot on entry and exit */
+       movaps xmm0, [esp + rinvH2O]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is overwritten below before being read - looks redundant */
+       mulps  xmm5, [esp + rsqH2O] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf]
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqOH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqOH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH2O */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       movaps xmm3, [esp + fjxO]       /* continue the j O force accumulators */
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH2O]       /* force components = fscal * d */
+       mulps xmm1, [esp + dyH2O]
+       mulps xmm2, [esp + dzH2O]
+       subps xmm3, xmm0        /* fj -= force */
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]       /* fi (H2) += force */
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H1 interaction: i atom H2 with j atom H1, charge product qqHH; xmm6 carries the running vctot on entry and exit */
+       movaps xmm0, [esp + rinvH2H1]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is overwritten below before being read - looks redundant */
+       mulps  xmm5, [esp + rsqH2H1] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf]
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH2H1 */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       movaps xmm3, [esp + fjxH1]      /* continue the j H1 force accumulators */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH2H1]      /* force components = fscal * d */
+       mulps xmm1, [esp + dyH2H1]
+       mulps xmm2, [esp + dzH2H1]
+       subps xmm3, xmm0        /* fj -= force */
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]       /* fi (H2) += force */
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H2 interaction (last of nine): i atom H2 with j atom H2, charge product qqHH; the accumulated vctot in xmm6 is written back to the stack here */
+       movaps xmm0, [esp + rinvH2H2]
+       movaps xmm7, xmm0       /* xmm7=rinv */
+       movaps xmm5, [esp + krf]        
+       movaps xmm1, xmm0       /* NOTE(review): xmm1 is overwritten below before being read - looks redundant */
+       mulps  xmm5, [esp + rsqH2H2] /* xmm5=krsq */
+       movaps xmm4, xmm5
+       addps  xmm4, xmm7       /* xmm4=rinv+krsq */
+       subps  xmm4, [esp + crf]
+       mulps xmm0, xmm0        /* xmm0=rinvsq */
+       mulps  xmm4, [esp + qqHH] /* xmm4=vcoul=qq*(rinv+krsq-crf) */
+       mulps  xmm5, [esp + two]
+       subps  xmm7, xmm5       /* xmm7=rinv-2*krsq */
+       mulps  xmm7, [esp + qqHH] /* xmm7 = coul part of fscal */
+       addps  xmm6, xmm4       /* add to local vctot */
+       mulps xmm0, xmm7        /* fsH2H2 */
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0
+
+       movaps xmm1, xmm0       /* NOTE(review): repeats the copy made three lines up */
+       movaps [esp + vctot], xmm6      /* store the vctot accumulated over all nine pairs */
+       movaps xmm2, xmm0       /* NOTE(review): repeats the copy made above */
+       
+       movaps xmm3, [esp + fjxH2]      /* continue the j H2 force accumulators */
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH2H2]      /* force components = fscal * d */
+       mulps xmm1, [esp + dyH2H2]
+       mulps xmm2, [esp + dzH2H2]
+       subps xmm3, xmm0        /* fj -= force */
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]       /* fi (H2) += force */
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       mov edi, [ebp +faction]         /* edi = base of faction[]; eax/ebx still hold j3 of waters a and b */
+               
+       /* Did all interactions - now update j forces */
+       /* 4 j waters with three atoms each - first do a & b j particles: transpose lanes 0,1 of the fj slots into 9 consecutive floats per water and add them to faction[] */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpcklps xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjxOb  fjyOb */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOb  fjyOb */ 
+       unpcklps xmm1, xmm2        /* xmm1= fjzOa  fjxH1a fjzOb  fjxH1b */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpcklps xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1b fjzH1b */
+       unpcklps xmm5, xmm6        /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjzOa  fjxH1a */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOb  fjyOb  fjzOb  fjxH1b */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+       movups   xmm1, [edi + eax*4]            /* current faction of water a, floats 0-3 */
+       movups   xmm2, [edi + eax*4 + 16]       /* water a, floats 4-7 */
+       movups   xmm5, [edi + ebx*4]            /* water b, floats 0-3 */
+       movups   xmm6, [edi + ebx*4 + 16]       /* water b, floats 4-7 */
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + eax*4 + 32]       /* ninth float (H2 z) of water a */
+       movss    xmm3, [edi + ebx*4 + 32]       /* ninth float (H2 z) of water b */
+       
+       movaps   xmm4, [esp + fjzH2]    /* lane 0 = fjzH2a */
+       movaps   xmm7, xmm4
+       shufps   xmm7, xmm7, 1          /* lane 0 = fjzH2b */
+       
+       movups   [edi + eax*4],     xmm1
+       movups   [edi + eax*4 + 16],xmm2
+       movups   [edi + ebx*4],     xmm5
+       movups   [edi + ebx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + eax*4 + 32], xmm0
+       movss    [edi + ebx*4 + 32], xmm3       
+
+       /* then do the second pair (c & d): same transpose-and-add using lanes 2,3 of the fj slots; ecx/edx hold j3 of waters c and d, edi = faction base */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpckhps xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjxOd  fjyOd */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOd  fjyOd */ 
+       unpckhps xmm1, xmm2        /* xmm1= fjzOc  fjxH1c fjzOd  fjxH1d */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpckhps xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1d fjzH1d */     
+       unpckhps xmm5, xmm6        /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjzOc  fjxH1c */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOd  fjyOd  fjzOd  fjxH1d */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c  */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+       movups   xmm1, [edi + ecx*4]            /* current faction of water c, floats 0-3 */
+       movups   xmm2, [edi + ecx*4 + 16]       /* water c, floats 4-7 */
+       movups   xmm5, [edi + edx*4]            /* water d, floats 0-3 */
+       movups   xmm6, [edi + edx*4 + 16]       /* water d, floats 4-7 */
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + ecx*4 + 32]       /* ninth float (H2 z) of water c */
+       movss    xmm3, [edi + edx*4 + 32]       /* ninth float (H2 z) of water d */
+       
+       movaps   xmm4, [esp + fjzH2]
+       movaps   xmm7, xmm4
+       shufps   xmm4, xmm4, 0b10       /* lane 0 = fjzH2c */
+       shufps   xmm7, xmm7, 0b11       /* lane 0 = fjzH2d */
+       movups   [edi + ecx*4],     xmm1
+       movups   [edi + ecx*4 + 16],xmm2
+       movups   [edi + edx*4],     xmm5
+       movups   [edi + edx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + ecx*4 + 32], xmm0
+       movss    [edi + edx*4 + 32], xmm3       
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  4
+       jl    .i2030_single_check
+       jmp   .i2030_unroll_loop
+.i2030_single_check:   /* decide whether any j waters remain after the 4-way unrolled loop */
+       add   dword ptr [esp + innerk],  4   /* undo the -4 bias; size made explicit (was ambiguous) */
+       jnz   .i2030_single_loop      /* nonzero remainder: process it one j water at a time */
+       jmp   .i2030_updateouterdata  /* nothing left for this i water */
+.i2030_single_loop:    /* one j water per pass: lanes are (O, unused, H1, H2) of that j water */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]                 /* eax = index of this j water's first atom */
+       add   dword ptr [esp + innerjjnr],  4   /* step to the next entry; operand size now explicit */
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]         /* eax = 3*jnr, offset of the first j coordinate */
+
+       /* fetch j coordinates */
+       xorps xmm3, xmm3
+       xorps xmm4, xmm4
+       xorps xmm5, xmm5
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + eax*4 + 4]
+       movss xmm5, [esi + eax*4 + 8]
+
+       movlps xmm6, [esi + eax*4 + 12]
+       movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+       /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+       movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+       shufps xmm6, xmm6, 0b11011000    /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+       movlhps xmm3, xmm6              /* xmm3= jxO   0  jxH1 jxH2 */
+       movaps  xmm0, [esp + ixO]
+       movaps  xmm1, [esp + iyO]
+       movaps  xmm2, [esp + izO]
+       shufps  xmm4, xmm6, 0b11100100 /* xmm4= jyO   0   jyH1 jyH2 */
+       shufps xmm5, xmm7, 0b11000100  /* xmm5= jzO   0   jzH1 jzH2 */
+       /* store all j coordinates in the jxO/jyO/jzO slots (lane 1 is padding) */
+       movaps [esp + jxO], xmm3
+       movaps [esp + jyO], xmm4
+       movaps [esp + jzO], xmm5
+       subps  xmm0, xmm3
+       subps  xmm1, xmm4
+       subps  xmm2, xmm5
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2        /* have rsq in xmm0 */
+
+       movaps xmm6, xmm0
+
+       /* do invsqrt: rsqrtps estimate refined by one Newton-Raphson step */
+       rsqrtps xmm1, xmm0
+       mulps   xmm6, [esp + krf] /* xmm6=krsq */
+       movaps  xmm2, xmm1
+       movaps  xmm7, xmm6         /* xmm7=krsq */
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, xmm0
+       subps   xmm3, xmm1         /* 3.0-rsq*lu*lu */
+       mulps   xmm3, xmm2
+       mulps   xmm3, [esp + half] /* rinv iO - j water */
+
+
+
+       addps   xmm6, xmm3      /* xmm6=rinv+krsq */
+       mulps   xmm7, [esp + two]
+       subps   xmm6, [esp + crf] /* xmm6=rinv+krsq-crf */
+
+       xorps   xmm1, xmm1
+       movaps  xmm0, xmm3
+       subps   xmm3, xmm7      /* xmm3=rinv-2*krsq */
+       xorps   xmm4, xmm4
+       mulps   xmm0, xmm0      /* xmm0=rinvsq */
+       /* fetch charges to xmm4 (temporary): lane 0 from qqOO, lane 1 zero, lanes 2-3 from qqOH */
+       movss   xmm4, [esp + qqOO]
+       movhps  xmm4, [esp + qqOH]
+
+       mulps xmm6, xmm4        /* vcoul */
+       mulps xmm3, xmm4        /* coul part of fs */
+
+
+       addps   xmm6, [esp + vctot]
+       mulps   xmm0, xmm3      /* total fscal */
+       movaps  [esp + vctot], xmm6
+
+       movaps  xmm1, xmm0
+       movaps  xmm2, xmm0
+       mulps   xmm0, [esp + dxOO]
+       mulps   xmm1, [esp + dyOO]
+       mulps   xmm2, [esp + dzOO]
+
+       /* initial update for j forces: fj = -f, accumulated in the fjxO/fjyO/fjzO slots */
+       xorps   xmm3, xmm3
+       xorps   xmm4, xmm4
+       xorps   xmm5, xmm5
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixO]
+       addps   xmm1, [esp + fiyO]
+       addps   xmm2, [esp + fizO]
+       movaps  [esp + fixO], xmm0
+       movaps  [esp + fiyO], xmm1
+       movaps  [esp + fizO], xmm2
+
+
+       /* done with i O. Now do i H1 & H2 simultaneously; first get i particle coords: */
+       movaps  xmm0, [esp + ixH1]
+       movaps  xmm1, [esp + iyH1]
+       movaps  xmm2, [esp + izH1]
+       movaps  xmm3, [esp + ixH2]
+       movaps  xmm4, [esp + iyH2]
+       movaps  xmm5, [esp + izH2]
+       subps   xmm0, [esp + jxO]
+       subps   xmm1, [esp + jyO]
+       subps   xmm2, [esp + jzO]
+       subps   xmm3, [esp + jxO]
+       subps   xmm4, [esp + jyO]
+       subps   xmm5, [esp + jzO]
+       movaps [esp + dxH1O], xmm0
+       movaps [esp + dyH1O], xmm1
+       movaps [esp + dzH1O], xmm2
+       movaps [esp + dxH2O], xmm3
+       movaps [esp + dyH2O], xmm4
+       movaps [esp + dzH2O], xmm5
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       mulps xmm3, xmm3
+       mulps xmm4, xmm4
+       mulps xmm5, xmm5
+       addps xmm0, xmm1
+       addps xmm4, xmm3
+       addps xmm0, xmm2        /* have rsqH1 in xmm0 */
+       addps xmm4, xmm5        /* have rsqH2 in xmm4 */
+
+       /* do invsqrt (same refinement as above, H1 and H2 interleaved) */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinv H1 - j water */
+       mulps   xmm7, [esp + half] /* rinv H2 - j water */
+
+       mulps xmm0, [esp + krf] /* krsq */
+       mulps xmm4, [esp + krf] /* krsq */
+
+       /* assemble charges in xmm6: lane 0 from qqOH, lane 1 zero, lanes 2-3 from qqHH */
+       xorps   xmm6, xmm6
+       movss   xmm6, [esp + qqOH]
+       movhps  xmm6, [esp + qqHH]
+       movaps  xmm1, xmm0
+       movaps  xmm5, xmm4
+       addps   xmm0, xmm3      /* krsq+rinv */
+       addps   xmm4, xmm7      /* krsq+rinv */
+       subps   xmm0, [esp + crf]
+       subps   xmm4, [esp + crf]
+       mulps   xmm1, [esp + two]
+       mulps   xmm5, [esp + two]
+       mulps   xmm0, xmm6      /* vcoul */
+       mulps   xmm4, xmm6      /* vcoul */
+       addps   xmm4, xmm0      /* H1 + H2 energies */
+       addps   xmm4, [esp + vctot]
+       movaps  [esp + vctot], xmm4
+       movaps  xmm0, xmm3
+       movaps  xmm4, xmm7
+       mulps   xmm3, xmm3      /* rinvsq H1 */
+       mulps   xmm7, xmm7      /* rinvsq H2 */
+       subps   xmm0, xmm1      /* rinv-2*krsq (H1) */
+       subps   xmm4, xmm5      /* rinv-2*krsq (H2) */
+       mulps   xmm0, xmm6
+       mulps   xmm4, xmm6
+       mulps   xmm0, xmm3      /* fscal H1 */
+       mulps   xmm7, xmm4      /* fscal H2 */
+
+       movaps  xmm1, xmm0
+       movaps  xmm2, xmm0
+       mulps   xmm0, [esp + dxH1O]
+       mulps   xmm1, [esp + dyH1O]
+       mulps   xmm2, [esp + dzH1O]
+       /* update forces H1 - j water */
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH1]
+       addps   xmm1, [esp + fiyH1]
+       addps   xmm2, [esp + fizH1]
+       movaps  [esp + fixH1], xmm0
+       movaps  [esp + fiyH1], xmm1
+       movaps  [esp + fizH1], xmm2
+       /* do forces H2 - j water */
+       movaps xmm0, xmm7
+       movaps xmm1, xmm7
+       movaps xmm2, xmm7
+       mulps   xmm0, [esp + dxH2O]
+       mulps   xmm1, [esp + dyH2O]
+       mulps   xmm2, [esp + dzH2O]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       mov     esi, [ebp + faction]    /* esi now addresses the force array for the j update below */
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH2]
+       addps   xmm1, [esp + fiyH2]
+       addps   xmm2, [esp + fizH2]
+       movaps  [esp + fixH2], xmm0
+       movaps  [esp + fiyH2], xmm1
+       movaps  [esp + fizH2], xmm2
+
+       /* update j water forces from local variables */
+       movlps  xmm0, [esi + eax*4]          /* f(jO) x,y */
+       movlps  xmm1, [esi + eax*4 + 12]     /* f(jH1) x,y */
+       movhps  xmm1, [esi + eax*4 + 24]     /* f(jH2) x,y */
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       movaps  xmm6, xmm5
+       movaps  xmm7, xmm5
+       shufps  xmm6, xmm6, 0b10             /* lane 0 = fjz of H1 */
+       shufps  xmm7, xmm7, 0b11             /* lane 0 = fjz of H2 */
+       addss   xmm5, [esi + eax*4 + 8]
+       addss   xmm6, [esi + eax*4 + 20]
+       addss   xmm7, [esi + eax*4 + 32]
+       movss   [esi + eax*4 + 8], xmm5
+       movss   [esi + eax*4 + 20], xmm6
+       movss   [esi + eax*4 + 32], xmm7
+       movaps   xmm5, xmm3
+       unpcklps xmm3, xmm4                  /* low half = fjx,fjy of O */
+       unpckhps xmm5, xmm4                  /* fjx,fjy of H1 then H2 */
+       addps    xmm0, xmm3
+       addps    xmm1, xmm5
+       movlps  [esi + eax*4], xmm0
+       movlps  [esi + eax*4 + 12], xmm1
+       movhps  [esi + eax*4 + 24], xmm1
+
+       dec dword ptr [esp + innerk]         /* one fewer j water outstanding */
+       jz    .i2030_updateouterdata
+       jmp   .i2030_single_loop
+.i2030_updateouterdata:        /* reduce the per-lane i forces and energy, write them back, loop on nri */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate  Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO]
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* partial sums now sit in lanes 0 and 1 of xmm0-xmm2 */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift: x,y in lanes 0,1 of xmm6, z in lane 0 of xmm7 */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000
+
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* partial sums now sit in lanes 0 and 1 of xmm0-xmm2 */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* partial sums now sit in lanes 0 and 1 of xmm0-xmm2 */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000
+       addps   xmm6, xmm0
+
+       /* increment fshift force (only x,y from xmm3's low half and z from xmm4 are stored) */
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4  /* advance pointer; operand size now explicit */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i2030_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i2030_outer
+.i2030_end:    /* epilogue: release the stack frame, restore registers, return */
+       emms    /* leave the MMX/x87 tag state empty for the caller */
+       mov eax, [esp + salign] /* presumably the alignment padding saved by the prologue (not visible here) */
+       add esp, eax
+       add esp, 1460   /* local stack space; must match the prologue's sub - TODO confirm */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave   /* mov esp, ebp / pop ebp */
+       ret
+
+       
+               
+
+.globl inl3000_sse
+       .type inl3000_sse,@function
+inl3000_sse:   /* entry: symbol offsets, ebp frame, aligned locals, constants staged on the stack */
+.equ           nri,            8   /* nri..VFtab: offsets added to ebp to read the stack arguments */
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           tabscale,       60
+.equ           VFtab,          64
+       /* stack offsets for local variables (added to esp after alignment) */ 
+       /* bottom of stack is 16-byte aligned (esp & 0xf is subtracted below) for sse use */
+.equ           ix,               0
+.equ           iy,              16
+.equ           iz,              32
+.equ           iq,              48
+.equ           dx,              64
+.equ           dy,              80
+.equ           dz,              96
+.equ           two,            112
+.equ           tsc,            128
+.equ           qq,             144     
+.equ           fs,             160
+.equ           vctot,          176
+.equ           fix,            192
+.equ           fiy,            208
+.equ           fiz,            224
+.equ           half,           240
+.equ           three,          256
+.equ           is3,            272
+.equ           ii3,            276
+.equ           innerjjnr,      280
+.equ           innerk,         284
+.equ           salign,         288  /* slot holding the alignment padding taken off esp below */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 292            /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf   /* eax = esp mod 16 */
+       sub esp, eax    /* round esp down to a 16-byte boundary so movaps on the locals is legal */
+       mov [esp + salign], eax /* remember the padding */
+
+       emms
+
+       movups xmm0, [sse_half] /* sse_half/sse_two/sse_three are defined outside this block */
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movss xmm3, [ebp + tabscale]
+       movaps [esp + half],  xmm0
+       movaps [esp + two], xmm1
+       movaps [esp + three],  xmm2
+       shufps xmm3, xmm3, 0    /* broadcast tabscale to all four lanes */
+       movaps [esp + tsc], xmm3
+
+       /* assume we have at least one i particle - start directly */   
+.i3000_outer:  /* per-i-particle setup: shifted coordinates, scaled charge, cleared accumulators */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step; operand size now explicit */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr],  4   /* advance pointer; operand size now explicit */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]
+       mulss xmm3, [ebp + facel]       /* iq = facel*charge[ii] */
+       shufps xmm3, xmm3, 0            /* broadcast iq to all lanes */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+
+       addss xmm0, [eax + ebx*4]       /* i coordinate = shift vector + position */
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+
+       /* clear vctot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4   /* advance pointer; operand size now explicit */
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4                    /* bias the count by -4 for the unrolled loop */
+       mov   [esp + innerk], edx        /* number of innerloop atoms minus 4 (mov keeps the sub flags) */
+       jge   .i3000_unroll_loop         /* at least four j atoms */
+       jmp   .i3000_finish_inner
+.i3000_unroll_loop:    /* four j atoms per pass, one per SSE lane; tabulated coulomb interaction */
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]
+       mov   ebx, [edx + 4]
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4); operand size now explicit */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0
+       shufps xmm4, xmm7, 0
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+       mulps  xmm3, xmm2
+
+       movaps [esp + qq], xmm3
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]
+
+       /* move four coordinates to xmm0-xmm2 */
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]       /* xmm4 = x1 y1 x2 y2 */
+       movhps xmm5, [esi + edx*4]       /* xmm5 = x3 y3 x4 y4 */
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000    /* xmm2 = z1 z2 z3 z4 */
+
+       shufps xmm0, xmm5, 0b10001000    /* xmm0 = x1 x2 x3 x4 */
+       shufps xmm1, xmm5, 0b11011101    /* xmm1 = y1 y2 y3 y4 */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* xmm4=r*tabscale */
+
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5      /* xmm6 = the four truncated indices as floats */
+       subps xmm4, xmm6
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 2            /* index*4: four floats are read per table point below */
+       pslld mm7, 2
+
+       movd mm0, eax           /* park the j3 offsets in mm0-mm3 while eax-edx hold table indices */
+       movd mm1, ebx
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6
+       psrlq mm6, 32
+       movd ecx, mm7
+       psrlq mm7, 32
+       movd ebx, mm6
+       movd edx, mm7
+
+       movlps xmm5, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm5, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000   /* xmm4 = first table value of each point */
+       shufps xmm5, xmm7, 0b11011101   /* xmm5 = second table value of each point */
+
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* coulomb table ready, in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5
+
+       xorps  xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3       /* xmm4 = -fijC*tabscale*rinv, the scalar force */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movd eax, mm0           /* restore the j3 offsets */
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101
+
+       /* now xmm3/xmm4 contain fjx and fjy (fjz is handled separately below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* lane 0 = tz of j2 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* lane 0 = tz of j3 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* lane 0 = tz of j4 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4   /* operand size now explicit */
+       jl    .i3000_finish_inner
+       jmp   .i3000_unroll_loop
+.i3000_finish_inner:   /* fewer than four j atoms remain: dispatch on the leftover count */
+       /* check if at least two particles remain */
+       add   dword ptr [esp + innerk],  4   /* undo the -4 bias; operand size now explicit */
+       mov   edx, [esp + innerk]
+       and   edx, 2                         /* bit 1 set: a pair is left */
+       jnz   .i3000_dopair
+       jmp   .i3000_checksingle
+.i3000_dopair: /* two leftover j atoms in lanes 0-1; lanes 2-3 carry zero charge */
+       mov esi, [ebp + charge]
+
+       mov   ecx, [esp + innerjjnr]
+
+       mov   eax, [ecx]
+       mov   ebx, [ecx + 4]
+       add   dword ptr [esp + innerjjnr],  8   /* consumed two entries; operand size now explicit */
+       xorps xmm7, xmm7
+       movss xmm3, [esi + eax*4]
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       mulps  xmm3, [esp + iq]
+       movlhps xmm3, xmm7              /* zero the upper two lanes */
+       movaps [esp + qq], xmm3
+
+       mov edi, [ebp + pos]
+
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]
+
+       movlhps xmm3, xmm7              /* redundant: xmm3 is reloaded from qq before its next use */
+
+       shufps xmm2, xmm0, 0
+
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* xmm2 = z1 z2 z1 z2 */
+
+       shufps xmm0, xmm0, 0b10001000   /* xmm0 = x1 x2 x1 x2 */
+       shufps xmm1, xmm1, 0b11011101   /* xmm1 = y1 y2 y1 y2 */
+
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3.0-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6           /* table indices go to ecx/edx so eax/ebx keep the j3 offsets */
+       psrlq mm6, 32
+       movd edx, mm6
+
+       movlps xmm5, [esi + ecx*4]
+       movhps xmm5, [esi + edx*4] /* got half coulomb table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm7, 0b11011101   /* upper lanes come from xmm7, zeroed above */
+
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8]
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5
+
+       xorps  xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3       /* xmm4 = -fijC*tabscale*rinv, the scalar force */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: first j atom from lane 0 */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+
+       shufps  xmm0, xmm0, 0b11100001  /* swap lanes 0 and 1: second j atom now in lane 0 */
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5
+
+.i3000_checksingle:    /* test bit 0 of innerk: is one last j atom left? */
+       mov   edx, [esp + innerk]
+       and   edx, 1    /* nonzero when the leftover count is odd */
+       jnz    .i3000_dosingle
+       jmp    .i3000_updateouterdata   /* no single atom left: go finish this i particle */
+.i3000_dosingle:
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        
+       xorps  xmm6, xmm6
+       movss xmm6, [esi + eax*4]       /* xmm6(0) has the charge */    
+       mulps  xmm6, [esp + iq]
+       movaps [esp + qq], xmm6
+               
+       lea   eax, [eax + eax*2]
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6
+       
+       movlps xmm4, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       xorps xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i3000_updateouterdata:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3000_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i3000_outer
+.i3000_end:
+       emms
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp, 292
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+.globl inl3010_sse
+       .type inl3010_sse,@function
+inl3010_sse:   /* table-lookup coulomb kernel; every outer entry runs nscoul consecutive i atoms */
+.equ           nri,            8       /* outer-entry count, decremented in place */
+.equ           iinr,           12      /* ptr: first i-atom index of each outer entry */
+.equ           jindex,         16      /* ptr: [n] and [n+1] bound the j list of entry n */
+.equ           jjnr,           20      /* base of the j-atom index list */
+.equ           shift,          24      /* ptr: shift index of each outer entry */
+.equ           shiftvec,       28      /* base of xyz shift vectors (floats) */
+.equ           fshift,         32      /* base of xyz shift forces, accumulated into */
+.equ           gid,            36      /* ptr: energy-group index of each outer entry */
+.equ           pos,            40      /* base of xyz coordinates (floats) */
+.equ           faction,        44      /* base of xyz forces, accumulated into */
+.equ           charge,         48      /* base of per-atom charges (floats) */
+.equ           facel,          52      /* float scalar multiplied into the i charge */
+.equ           Vc,             56      /* base of energies, indexed by group index */
+.equ           tabscale,       60      /* float scalar; r*tabscale is the table coordinate */
+.equ           VFtab,          64      /* base of the table, four floats per table point */
+.equ           nsatoms,        68      /* ptr: 12-byte records, only the dword at +8 is read */
+       /* stack offsets for local variables */ 
+       /* bottom of stack is 16-byte aligned so movaps can be used on the locals */
+.equ           ix,               0
+.equ           iy,              16
+.equ           iz,              32
+.equ           iq,              48
+.equ           dx,              64
+.equ           dy,              80
+.equ           dz,              96
+.equ           two,            112
+.equ           tsc,            128
+.equ           qq,             144     
+.equ           fscal,          160     /* NOTE(review): not referenced in this routine */
+.equ           vctot,          176
+.equ           fix,            192
+.equ           fiy,            208
+.equ           fiz,            224
+.equ           half,           240
+.equ           three,          256
+.equ           is3,            272
+.equ           ii3,            276
+.equ           shX,            280
+.equ           shY,            284
+.equ           shZ,            288
+.equ           ntia,           292     /* NOTE(review): not referenced in this routine */
+.equ           innerjjnr0,     296
+.equ           innerk0,        300
+.equ           innerjjnr,      304
+.equ           innerk,         308
+.equ           salign,         312     /* bytes subtracted from esp to align the frame */
+.equ           nscoul,         316     /* i atoms still to run for the current outer entry */
+.equ           solnr,          320     /* index of the next i atom to run */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 324            /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf           /* eax = misalignment of esp, 0..15 */
+       sub esp, eax            /* esp is now a multiple of 16 */
+       mov [esp + salign], eax /* remembered so the epilogue can undo it */
+
+       emms
+
+       movups xmm0, [sse_half]         /* constants live outside this routine; copy to aligned locals */
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movss xmm3, [ebp + tabscale]
+       movaps [esp + half],  xmm0
+       movaps [esp + two], xmm1
+       movaps [esp + three],  xmm2
+       shufps xmm3, xmm3, 0            /* replicate tabscale into all four lanes */
+       movaps [esp + tsc], xmm3
+
+       add   dword ptr [ebp + nsatoms],  8     /* point at the third dword of the first record */
+
+       /* assume we have at least one i particle - start directly */   
+.i3010_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]       /* shift vector x, y, z of this entry */
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+       movss [esp + shX], xmm0
+       movss [esp + shY], xmm1
+       movss [esp + shZ], xmm2
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   eax, [ebp + nsatoms]      /* pointer was pre-advanced by 8 at entry */
+       mov   ecx, [eax]                /* ecx = number of i atoms to run for this entry */
+       add   dword ptr [ebp + nsatoms],  12    /* step to the next 12-byte record */
+       mov   [esp + nscoul], ecx       
+
+       /* clear vctot */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       mov   [esp + solnr], ebx        /* first i atom of the entry */
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4      /* advance pointer */
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2                     /* byte offset of jjnr[nj0] */
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+
+       mov   ecx, [esp + nscoul]
+       test  ecx, ecx                   /* no i atoms in this entry: go straight to the energy update */
+       jnz  .i3010_mno_coul
+       jmp   .i3010_last_mno
+.i3010_mno_coul:
+       /* take the next i atom of this entry: ebx = solnr, then solnr++ */
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+       /* iq = facel * charge[ii], replicated into all four lanes */
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]
+       mulss xmm3, [ebp + facel]
+       shufps xmm3, xmm3, 0
+       movaps [esp + iq], xmm3
+
+       /* ii3 = 3*ii: index of this atom's xyz triple */
+       lea   ebx, [ebx + ebx*2]
+       mov   [esp + ii3], ebx
+       mov   eax, [ebp + pos]
+
+       /* ix/iy/iz = shift vector + position, one component at a time */
+       movss xmm0, [esp + shX]
+       addss xmm0, [eax + ebx*4]
+       shufps xmm0, xmm0, 0
+       movaps [esp + ix], xmm0
+
+       movss xmm1, [esp + shY]
+       addss xmm1, [eax + ebx*4 + 4]
+       shufps xmm1, xmm1, 0
+       movaps [esp + iy], xmm1
+
+       movss xmm2, [esp + shZ]
+       addss xmm2, [eax + ebx*4 + 8]
+       shufps xmm2, xmm2, 0
+       movaps [esp + iz], xmm2
+
+       /* zero the i-force accumulators */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       /* rewind the j list for this i atom; innerk = innerk0 - 4 */
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+       sub   edx,  4
+       mov   [esp + innerk], edx
+       jl    .i3010_finish_coul_inner  /* fewer than 4 j atoms; otherwise fall into the unrolled loop */
+.i3010_unroll_coul_loop:       
+       /* quad-unroll innerloop here: four j atoms per pass, one per SSE lane */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0            /* xmm3 = q1 q1 q2 q2 */
+       shufps xmm4, xmm7, 0            /* xmm4 = q3 q3 q4 q4 */
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */ 
+       mulps  xmm3, xmm2
+
+       movaps [esp + qq], xmm3         /* qq = iq*qj for the four pairs */
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]      /* xmm4 = x1 y1 (low half) */
+       movlps xmm5, [esi + ecx*4]      /* xmm5 = x3 y3 (low half) */
+       movss xmm2, [esi + eax*4 + 8]   /* z1 */
+       movss xmm6, [esi + ecx*4 + 8]   /* z3 */
+
+       movhps xmm4, [esi + ebx*4]      /* xmm4 = x1 y1 x2 y2 */
+       movhps xmm5, [esi + edx*4]      /* xmm5 = x3 y3 x4 y4 */
+
+       movss xmm0, [esi + ebx*4 + 8]   /* z2 */
+       movss xmm1, [esi + edx*4 + 8]   /* z4 */
+
+       shufps xmm2, xmm0, 0            /* z1 z1 z2 z2 */
+       shufps xmm6, xmm1, 0            /* z3 z3 z4 z4 */
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = z1 z2 z3 z4 */
+       
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = x1 x2 x3 x4 */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = y1 y2 y3 y4 */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5; one refinement step: rinv = half*lu*(three - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* xmm4=r*tabscale */
+
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5      /* xmm6 = the four truncated indices as floats */
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 2            /* index*4: four floats per table point */
+       pslld mm7, 2
+
+       movd mm0, eax           /* park the j3 indices in mm0-mm3 while eax-edx hold table indices */
+       movd mm1, ebx
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6
+       psrlq mm6, 32
+       movd ecx, mm7
+       psrlq mm7, 32
+       movd ebx, mm6
+       movd edx, mm7
+               
+       movlps xmm5, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm5, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000   /* xmm4 = first table entry of each of the four points */
+       shufps xmm5, xmm7, 0b11011101   /* xmm5 = second table entry of each point */
+
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000   /* xmm6 = third table entry of each point */
+       shufps xmm7, xmm3, 0b11011101   /* xmm7 = fourth table entry of each point */
+       /* coulomb table ready, in xmm4-xmm7 */         
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vctot - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       xorps  xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3       /* xmm4 = -(fijC*tabscale*rinv), the scalar force factor */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movd eax, mm0           /* restore the j3 indices */
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3/xmm4 contain fjx, fjy of the four j atoms (z is handled below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* bring tz of j2 into lane 0 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* bring tz of j3 into lane 0 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* bring tz of j4 into lane 0 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i3010_finish_coul_inner
+       jmp   .i3010_unroll_coul_loop
+.i3010_finish_coul_inner:
+       /* check if at least two particles remain */
+       add   dword ptr [esp + innerk],  4      /* undo the last -4: innerk = j atoms still left */
+       mov   edx, [esp + innerk]
+       and   edx, 2                    /* bit 1 set: handle a pair first */
+       jnz   .i3010_dopair_coul
+       jmp   .i3010_checksingle_coul
+.i3010_dopair_coul:    
+       mov esi, [ebp + charge]
+       /* two j atoms: lanes 0-1 carry real data, qq is zeroed in lanes 2-3 */
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   dword ptr [esp + innerjjnr],  8   /* advance pointer past the pair */
+       xorps xmm7, xmm7
+       movss xmm3, [esi + eax*4]               
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0 
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       mulps  xmm3, [esp + iq]
+       movlhps xmm3, xmm7      /* clear lanes 2-3 of qq */
+       movaps [esp + qq], xmm3
+
+       mov edi, [ebp + pos]    
+       
+       lea   eax, [eax + eax*2]        /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]      /* xmm1 = x1 y1 x2 y2 */
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       movlhps xmm3, xmm7      /* redundant: xmm3 is reloaded from qq before its next use */
+       
+       shufps xmm2, xmm0, 0            /* z1 z1 z2 z2 */
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* xmm2 = z1 z2 z1 z2 */
+       
+       shufps xmm0, xmm0, 0b10001000   /* xmm0 = x1 x2 x1 x2 */
+       shufps xmm1, xmm1, 0b11011101   /* xmm1 = y1 y2 y1 y2 */
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5; one refinement step: rinv = half*lu*(three - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* xmm4=r*tabscale */
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2            /* index*4: four floats per table point */
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6
+       psrlq mm6, 32
+       movd edx, mm6
+
+       movlps xmm5, [esi + ecx*4]
+       movhps xmm5, [esi + edx*4] /* got half coulomb table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm7, 0b11011101   /* xmm7 is zero here, so lanes 2-3 become zero */
+       
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8]
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vctot - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       xorps  xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3       /* xmm4 = -(fijC*tabscale*rinv), the scalar force factor */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: lane 0 is the first j atom */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       shufps  xmm0, xmm0, 0b11100001  /* swap lanes 0 and 1: second j atom now in lane 0 */
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i3010_checksingle_coul:
+       /* bit 0 of innerk set: one unpaired j atom is still left */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jz     .i3010_updateouterdata_coul      /* none left; otherwise fall into the single-atom code */
+.i3010_dosingle_coul:
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        
+       xorps  xmm6, xmm6
+       movss xmm6, [esi + eax*4]       /* xmm6(0) has the charge */    
+       mulps  xmm6, [esp + iq]
+       movaps [esp + qq], xmm6
+       /* only lane 0 carries the j atom; qq is zero in lanes 1-3 */
+       lea   eax, [eax + eax*2]
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5; one refinement step: rinv = half*lu*(three - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+       /* index*4: four floats per table point; only the lane-0 index is used */
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6
+       
+       movlps xmm4, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 (lane 0 of each) */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vctot - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       xorps xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3
+       mov    edi, [ebp + faction]
+       /* xmm4 = -(fijC*tabscale*rinv), the scalar force factor */
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       /* scalar ops: only lane 0 of tx-tz is subtracted from the j force */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i3010_updateouterdata_coul:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* Horizontal sums: lane 0 of xmm0/xmm1/xmm2 ends up holding the */
+       /* four-lane total of fix/fiy/fiz; the three components are independent. */
+       movaps xmm0, [esp + fix]
+       movhlps xmm3, xmm0
+       addps  xmm0, xmm3
+       movaps xmm3, xmm0
+       shufps xmm3, xmm3, 1
+       addss  xmm0, xmm3
+
+       movaps xmm1, [esp + fiy]
+       movhlps xmm4, xmm1
+       addps  xmm1, xmm4
+       movaps xmm4, xmm1
+       shufps xmm4, xmm4, 1
+       addss  xmm1, xmm4
+
+       movaps xmm2, [esp + fiz]
+       movhlps xmm5, xmm2
+       addps  xmm2, xmm5
+       movaps xmm5, xmm2
+       shufps xmm5, xmm5, 1
+       addss  xmm2, xmm5
+
+       /* faction[ii3 .. ii3+2] += total force on this i atom */
+       movss  xmm3, [edi + ecx*4]
+       addss  xmm3, xmm0
+       movss  [edi + ecx*4],     xmm3
+       movss  xmm4, [edi + ecx*4 + 4]
+       addss  xmm4, xmm1
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* fshift[is3 .. is3+2] += the same totals */
+       movss  xmm3, [esi + edx*4]
+       addss  xmm3, xmm0
+       movss  [esi + edx*4],     xmm3
+       movss  xmm4, [esi + edx*4 + 4]
+       addss  xmm4, xmm1
+       movss  [esi + edx*4 + 4], xmm4
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm5, xmm2
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* one i atom of this entry is done; go again while nscoul is non-zero */
+       dec  dword ptr [esp + nscoul]
+       jnz .i3010_mno_coul
+       /* count exhausted: fall through to the energy update */
+
+.i3010_last_mno:       
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate: horizontal sum of the four lanes into lane 0 */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3010_end        /* short-range jump only; the target follows directly below */
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i3010_outer
+.i3010_end:
+       emms
+       /* six registers were pushed right after the frame pointer was set up, so the */
+       /* saved block starts at ebp-24; this drops the locals and the alignment pad at once */
+       lea   esp, [ebp - 24]
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+
+.globl inl3020_sse
+       .type inl3020_sse,@function
+inl3020_sse:   /* three i sites (O, H1, H2) against single j atoms; coulomb energy/force from VFtab lookups */
+.equ           nri,            8       /* offsets from ebp to the caller-supplied arguments */
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           tabscale,       60      
+.equ           VFtab,          64      
+       /* stack offsets for local variables */ 
+       /* the prologue rounds esp down to a multiple of 16 so the 16-byte slots below can be used with movaps */
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           iqO,            144 
+.equ           iqH,            160 
+.equ           dxO,            176
+.equ           dyO,            192
+.equ           dzO,            208     
+.equ           dxH1,           224
+.equ           dyH1,           240
+.equ           dzH1,           256     
+.equ           dxH2,           272
+.equ           dyH2,           288
+.equ           dzH2,           304     
+.equ           qqO,            320
+.equ           qqH,            336
+.equ           rinvO,          352
+.equ           rinvH1,         368
+.equ           rinvH2,         384             
+.equ           rO,             400
+.equ           rH1,            416
+.equ           rH2,            432
+.equ           tsc,            448     
+.equ           two,            464
+.equ           vctot,          480
+.equ           fixO,           496
+.equ           fiyO,           512
+.equ           fizO,           528
+.equ           fixH1,          544
+.equ           fiyH1,          560
+.equ           fizH1,          576
+.equ           fixH2,          592
+.equ           fiyH2,          608
+.equ           fizH2,          624
+.equ           fjx,            640
+.equ           fjy,            656
+.equ           fjz,            672
+.equ           half,           688
+.equ           three,          704
+.equ           is3,            720     /* 4-byte integer slots from here on */
+.equ           ii3,            724
+.equ           innerjjnr,      728
+.equ           innerk,         732
+.equ           salign,         736     /* alignment padding taken off esp; added back in the epilogue */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 740            /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf           /* eax = esp mod 16 */
+       sub esp, eax            /* esp is now a multiple of 16 */
+       mov [esp + salign], eax /* remember the padding so .i3020_end can undo it */
+
+       emms
+
+       movups xmm0, [sse_half] /* sse_half/sse_two/sse_three are defined outside this routine; presumably 4-float constants - confirm */
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movss xmm3, [ebp +tabscale]
+       
+       movaps [esp + half],  xmm0
+       movaps [esp + two],  xmm1
+       movaps [esp + three],  xmm2
+       shufps xmm3, xmm3, 0    /* broadcast tabscale to all four lanes */
+       movaps [esp + tsc], xmm3
+       
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       /* charge[ii] */
+       movss xmm4, [edx + ebx*4 + 4]   /* charge[ii+1] */
+       movss xmm5, [ebp + facel]
+       mulss  xmm3, xmm5
+       mulss  xmm4, xmm5
+
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       movaps [esp + iqO], xmm3        /* iqO = facel*charge[ii] in all lanes; set once, before the outer loop */
+       movaps [esp + iqH], xmm4        /* iqH = facel*charge[ii+1] in all lanes */
+       
+.i3020_outer:  /* per-i-entry setup: shifted O/H1/H2 coordinates, cleared accumulators, j-list bounds */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step (explicit operand size) */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]       /* xmm0-xmm2 = shift vector x, y, z */
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer (explicit operand size) */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx
+
+       addss xmm3, [eax + ebx*4]       /* O coordinates (floats 0-2 at pos+ii3*4) plus shift */
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]           
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       movss xmm3, xmm0                /* reload the bare shift vector into lane 0 for H2 */
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]  /* H1 coordinates (floats 3-5) plus shift */
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]          
+       addss xmm3, [eax + ebx*4 + 24]  /* H2 coordinates (floats 6-8) plus shift */
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]          
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+       
+       /* clear vctot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4  /* advance pointer (explicit operand size) */
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* innerloop atoms minus 4; mov leaves the flags of the sub intact */
+       jge   .i3020_unroll_loop         /* at least four j atoms: take the unrolled loop */
+       jmp   .i3020_odd_inner
+.i3020_unroll_loop:    /* one pass handles four j atoms, jjnr[k..k+3], one per SSE lane */
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4); explicit operand size */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       shufps xmm3, xmm6, 0 
+       shufps xmm4, xmm7, 0 
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */ 
+       movaps xmm4, xmm3            /* and in xmm4 */
+       mulps  xmm3, [esp + iqO]
+       mulps  xmm4, [esp + iqH]
+
+       movd  mm0, eax          /* use mmx registers as temp storage (jnr; overwritten with j3 further down before being read) */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       movaps  [esp + qqO], xmm3
+       movaps  [esp + qqH], xmm4       
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = jz of the four atoms */
+       
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = jx of the four atoms */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = jy of the four atoms */
+
+       /* move ixO-izO to xmm4-xmm6 */
+       movaps xmm4, [esp + ixO]
+       movaps xmm5, [esp + iyO]
+       movaps xmm6, [esp + izO]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxO], xmm4
+       movaps [esp + dyO], xmm5
+       movaps [esp + dzO], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       movaps xmm7, xmm4
+       /* rsqO in xmm7 */
+
+       /* move ixH1-izH1 to xmm4-xmm6 */
+       movaps xmm4, [esp + ixH1]
+       movaps xmm5, [esp + iyH1]
+       movaps xmm6, [esp + izH1]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxH1], xmm4
+       movaps [esp + dyH1], xmm5
+       movaps [esp + dzH1], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm6, xmm5
+       addps xmm6, xmm4
+       /* rsqH1 in xmm6 */
+
+       /* move ixH2-izH2 to xmm3-xmm5 */ 
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+
+       /* calc dr */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+
+       /* store dr */
+       movaps [esp + dxH2], xmm3
+       movaps [esp + dyH2], xmm4
+       movaps [esp + dzH2], xmm5
+       /* square it */
+       mulps xmm3,xmm3
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       addps xmm5, xmm4
+       addps xmm5, xmm3
+       /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+       /* start with rsqO - seed to xmm2 */    
+       rsqrtps xmm2, xmm7
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm7      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  [esp + rinvO], xmm4     /* rinvO in xmm4 */
+       mulps   xmm7, xmm4      /* rsq*rinv = r */
+       movaps  [esp + rO], xmm7        
+
+       /* rsqH1 - seed in xmm2 */
+       rsqrtps xmm2, xmm6
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm6      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  [esp + rinvH1], xmm4    /* rinvH1 in xmm4 */
+       mulps   xmm6, xmm4
+       movaps  [esp + rH1], xmm6
+
+       /* rsqH2 - seed to xmm2 */
+       rsqrtps xmm2, xmm5
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm5      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  [esp + rinvH2], xmm4    /* rinvH2 in xmm4 */
+       mulps   xmm5, xmm4
+       movaps  [esp + rH2], xmm5
+
+       /* do O interactions */
+       /* rO is still in xmm7 */
+       mulps   xmm7, [esp + tsc]
+       movhlps xmm4, xmm7
+       cvttps2pi mm6, xmm7
+       cvttps2pi mm7, xmm4    /* mm6/mm7 contain lu indices */
+       
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm4, mm7
+        movlhps xmm3, xmm4
+       
+        subps xmm7, xmm3
+
+       movaps xmm1, xmm7       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2           /* index*4: each table point is four floats */
+        pslld mm7, 2
+               
+        movd mm0, eax          /* park j3 offsets in mm0-mm3 while eax-edx serve as table offsets */
+        movd mm1, ebx
+        movd mm2, ecx
+        movd mm3, edx
+
+        mov  esi, [ebp + VFtab]
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqO]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm0, xmm7 /* fijC=FF*qq */
+
+        /* at this point xmm5 contains vcoul and xmm0 fijC */
+        /* increment vcoul - then we can get rid of xmm5 */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 
+       xorps  xmm4, xmm4
+
+       mulps  xmm0, [esp + tsc]
+       mulps  xmm0, [esp + rinvO]      
+       subps  xmm4, xmm0       /* xmm4 = -fijC*tabscale*rinvO */
+
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4       /* tx in xmm0-xmm2 */
+
+       /* update O forces */
+       movaps xmm3, [esp + fixO]
+       movaps xmm4, [esp + fiyO]
+       movaps xmm7, [esp + fizO]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixO], xmm3
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm7
+       /* update j forces with water O */
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* Done with O interactions - now H1! */
+       movaps xmm7, [esp + rH1]
+       mulps   xmm7, [esp + tsc]
+       movhlps xmm4, xmm7
+       cvttps2pi mm6, xmm7
+       cvttps2pi mm7, xmm4    /* mm6/mm7 contain lu indices */
+       
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm4, mm7
+        movlhps xmm3, xmm4
+       
+        subps xmm7, xmm3
+       movaps xmm1, xmm7       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+               
+        movd eax, mm6          /* esi still holds the VFtab base loaded for O */
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm7, xmm0 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm7 fijC */
+        /* increment vcoul */
+       xorps  xmm4, xmm4
+        addps  xmm5, [esp + vctot]
+       mulps  xmm7, [esp + rinvH1]
+        movaps [esp + vctot], xmm5 
+       mulps  xmm7, [esp + tsc]
+       subps xmm4, xmm7
+
+       movaps xmm0, [esp + dxH1]
+       movaps xmm1, [esp + dyH1]
+       movaps xmm2, [esp + dzH1]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update H1 forces */
+       movaps xmm3, [esp + fixH1]
+       movaps xmm4, [esp + fiyH1]
+       movaps xmm7, [esp + fizH1]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH1], xmm3
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm7
+       /* update j forces with water H1 */
+       addps  xmm0, [esp + fjx]
+       addps  xmm1, [esp + fjy]
+       addps  xmm2, [esp + fjz]
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* Done with H1, finally we do H2 interactions */
+       movaps xmm7, [esp + rH2]
+       mulps   xmm7, [esp + tsc]
+       movhlps xmm4, xmm7
+       cvttps2pi mm6, xmm7
+       cvttps2pi mm7, xmm4    /* mm6/mm7 contain lu indices */
+       
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm4, mm7
+        movlhps xmm3, xmm4
+       
+        subps xmm7, xmm3
+       movaps xmm1, xmm7       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+               
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm7, xmm0 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm7 fijC */
+        /* increment vcoul */
+       xorps  xmm4, xmm4
+        addps  xmm5, [esp + vctot]
+       mulps  xmm7, [esp + rinvH2]
+        movaps [esp + vctot], xmm5 
+       mulps  xmm7, [esp + tsc]
+       subps  xmm4, xmm7
+
+       movaps xmm0, [esp + dxH2]
+       movaps xmm1, [esp + dyH2]
+       movaps xmm2, [esp + dzH2]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+        movd eax, mm0          /* restore the j3 offsets parked in mm0-mm3 */
+        movd ebx, mm1
+        movd ecx, mm2
+        movd edx, mm3
+       
+       /* update H2 forces */
+       movaps xmm3, [esp + fixH2]
+       movaps xmm4, [esp + fiyH2]
+       movaps xmm7, [esp + fizH2]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH2], xmm3
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm7
+
+       mov edi, [ebp +faction]
+       /* update j forces */
+       addps xmm0, [esp + fjx]
+       addps xmm1, [esp + fjy]
+       addps xmm2, [esp + fjz]
+
+       movlps xmm4, [edi + eax*4]
+       movlps xmm7, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm7, [edi + edx*4]
+       
+       movaps xmm3, xmm4
+       shufps xmm3, xmm7, 0b10001000
+       shufps xmm4, xmm7, 0b11011101                         
+       /* xmm3 has fjx, xmm4 has fjy */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       /* unpack them back for storing */
+       movaps xmm7, xmm3
+       unpcklps xmm7, xmm4
+       unpckhps xmm3, xmm4     
+       movlps [edi + eax*4], xmm7
+       movlps [edi + ecx*4], xmm3
+       movhps [edi + ebx*4], xmm7
+       movhps [edi + edx*4], xmm3
+       /* finally z forces */
+       movss  xmm0, [edi + eax*4 + 8]
+       movss  xmm1, [edi + ebx*4 + 8]
+       movss  xmm3, [edi + ecx*4 + 8]
+       movss  xmm4, [edi + edx*4 + 8]
+       subss  xmm0, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* bring lane 1 down to lane 0 */
+       subss  xmm1, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* bring lane 2 down to lane 0 */
+       subss  xmm3, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* bring lane 3 down to lane 0 */
+       subss  xmm4, xmm2
+       movss  [edi + eax*4 + 8], xmm0
+       movss  [edi + ebx*4 + 8], xmm1
+       movss  [edi + ecx*4 + 8], xmm3
+       movss  [edi + edx*4 + 8], xmm4
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4      /* explicit operand size */
+       jl    .i3020_odd_inner
+       jmp   .i3020_unroll_loop
+.i3020_odd_inner:      /* innerk was biased by -4 for the unrolled loop; undo that to get the leftover count */
+       add   dword ptr [esp + innerk],  4      /* explicit operand size; sets ZF when nothing is left */
+       jnz   .i3020_odd_loop
+       jmp   .i3020_updateouterdata
+.i3020_odd_loop:       /* one leftover j atom per pass; lanes 0/2/3 carry O/H1/H2, lane 1 has qq = 0 */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       add   dword ptr [esp + innerjjnr],  4   /* advance pointer; explicit operand size */
+
+       xorps xmm4, xmm4
+       movss xmm4, [esp + iqO]
+       mov esi, [ebp + charge] 
+       movhps xmm4, [esp + iqH]        /* xmm4 = (iqO, 0, iqH, iqH) */
+       movss xmm3, [esi + eax*4]       /* charge in xmm3 */
+       shufps xmm3, xmm3, 0
+       mulps xmm3, xmm4
+       movaps [esp + qqO], xmm3        /* use oxygen qq for storage */
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]  
+       
+       /* move j coords to xmm0-xmm2 */
+       movss xmm0, [esi + eax*4]
+       movss xmm1, [esi + eax*4 + 4]
+       movss xmm2, [esi + eax*4 + 8]
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       
+       movss xmm3, [esp + ixO]
+       movss xmm4, [esp + iyO]
+       movss xmm5, [esp + izO]
+               
+       movlps xmm6, [esp + ixH1]
+       movlps xmm7, [esp + ixH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm3, xmm6      /* xmm3 = (ixO, 0, ixH1, ixH2) */
+       movlps xmm6, [esp + iyH1]
+       movlps xmm7, [esp + iyH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm4, xmm6
+       movlps xmm6, [esp + izH1]
+       movlps xmm7, [esp + izH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm5, xmm6
+
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       
+       movaps [esp + dxO], xmm3
+       movaps [esp + dyO], xmm4
+       movaps [esp + dzO], xmm5
+
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       movaps [esp + rinvO], xmm0
+       
+       mulps xmm4, [esp + tsc]
+       movhlps xmm7, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm7    /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm7, mm7
+        movlhps xmm3, xmm7
+
+       subps   xmm4, xmm3      
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+       
+        movd mm0, eax          /* park eax (j3), ecx and edx while they serve as table offsets */
+        movd mm1, ecx
+        movd mm2, edx
+
+        mov  esi, [ebp + VFtab]
+        movd eax, mm6          /* table offset for lane 0 */
+        movd ecx, mm7          /* table offset for lane 2 */
+        psrlq mm7, 32
+        movd edx, mm7          /* table offset for lane 3; lane 1 is never looked up */
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqO]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm0, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm0 fijC */
+        /* increment vcoul - then we can get rid of xmm5 */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+
+       xorps xmm4, xmm4
+       mulps  xmm0, [esp + tsc]
+       mulps  xmm0, [esp + rinvO]      
+       subps  xmm4, xmm0
+               
+        movd eax, mm0          /* restore eax (j3), ecx and edx */
+        movd ecx, mm1
+        movd edx, mm2  
+               
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4 /* xmm0-xmm2 now contains tx-tz (partial force) */
+       movss  xmm3, [esp + fixO]       
+       movss  xmm4, [esp + fiyO]       
+       movss  xmm5, [esp + fizO]       
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esp + fixO], xmm3       
+       movss  [esp + fiyO], xmm4       
+       movss  [esp + fizO], xmm5       /* updated the O force now do the H's */
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       shufps xmm3, xmm3, 0b11100110   /* shift right: lane 2 (H1) into lane 0 */
+       shufps xmm4, xmm4, 0b11100110
+       shufps xmm5, xmm5, 0b11100110
+       addss  xmm3, [esp + fixH1]
+       addss  xmm4, [esp + fiyH1]
+       addss  xmm5, [esp + fizH1]
+       movss  [esp + fixH1], xmm3      
+       movss  [esp + fiyH1], xmm4      
+       movss  [esp + fizH1], xmm5      /* updated the H1 force */
+
+       mov edi, [ebp + faction]
+       shufps xmm3, xmm3, 0b11100111   /* shift right: lane 3 (H2) into lane 0 */
+       shufps xmm4, xmm4, 0b11100111
+       shufps xmm5, xmm5, 0b11100111
+       addss  xmm3, [esp + fixH2]
+       addss  xmm4, [esp + fiyH2]
+       addss  xmm5, [esp + fizH2]
+       movss  [esp + fixH2], xmm3      
+       movss  [esp + fiyH2], xmm4      
+       movss  [esp + fizH2], xmm5      /* updated the H2 force */
+
+       /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+       xorps  xmm5, xmm5
+       movaps xmm3, xmm0
+       movlps xmm6, [edi + eax*4]
+       movss  xmm7, [edi + eax*4 + 8]
+       unpcklps xmm3, xmm1
+       movlhps  xmm3, xmm5     /* zero the unused lane-1 pair so it drops out of the sum */
+       unpckhps xmm0, xmm1             
+       addps    xmm0, xmm3
+       movhlps  xmm3, xmm0     
+       addps    xmm0, xmm3     /* x,y sum in xmm0 */
+
+       movhlps  xmm1, xmm2
+       addss    xmm2, xmm1
+       shufps   xmm1, xmm1, 1 
+       addss    xmm2, xmm1    /* z sum in xmm2 */
+       subps    xmm6, xmm0
+       subss    xmm7, xmm2
+       
+       movlps [edi + eax*4],     xmm6
+       movss  [edi + eax*4 + 8], xmm7
+
+       dec dword ptr [esp + innerk]
+       jz    .i3020_updateouterdata
+       jmp   .i3020_odd_loop
+.i3020_updateouterdata:        /* reduce the 4-lane accumulators; update faction, fshift and Vc, then loop or exit */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate  Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO]
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000      /* lanes 0,1 = x,y */
+
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* increment fshift force */ 
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       mov   edx, [ebp + gid]          /* edx = pointer into gid[] */
+       mov   edx, [edx]                /* edx = group index used to address Vc[] */
+       add   dword ptr [ebp + gid],  4 /* advance pointer; explicit operand size */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+        
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3020_end        /* jecxz tests ecx itself, not the flags from dec */
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx    /* the remaining count is kept in the nri argument slot */
+       jmp .i3020_outer
+.i3020_end:
+       emms
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp, 740
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+       
+
+       
+.globl inl3030_sse
+       .type inl3030_sse,@function
+inl3030_sse:   
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           tabscale,       60      
+.equ           VFtab,          64
+       /* stack offsets for local variables */ 
+       /* bottom of stack is 16-byte aligned for sse use (movaps) */
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           jxO,            144
+.equ           jyO,            160
+.equ           jzO,            176
+.equ           jxH1,           192
+.equ           jyH1,           208
+.equ           jzH1,           224
+.equ           jxH2,           240
+.equ           jyH2,           256
+.equ           jzH2,           272
+.equ           dxOO,           288
+.equ           dyOO,           304
+.equ           dzOO,           320     
+.equ           dxOH1,          336
+.equ           dyOH1,          352
+.equ           dzOH1,          368     
+.equ           dxOH2,          384
+.equ           dyOH2,          400
+.equ           dzOH2,          416     
+.equ           dxH1O,          432
+.equ           dyH1O,          448
+.equ           dzH1O,          464     
+.equ           dxH1H1,         480
+.equ           dyH1H1,         496
+.equ           dzH1H1,         512     
+.equ           dxH1H2,         528
+.equ           dyH1H2,         544
+.equ           dzH1H2,         560     
+.equ           dxH2O,          576
+.equ           dyH2O,          592
+.equ           dzH2O,          608     
+.equ           dxH2H1,         624
+.equ           dyH2H1,         640
+.equ           dzH2H1,         656     
+.equ           dxH2H2,         672
+.equ           dyH2H2,         688
+.equ           dzH2H2,         704
+.equ           qqOO,           720
+.equ           qqOH,           736
+.equ           qqHH,           752
+.equ           two,            768
+.equ           tsc,            784
+.equ           vctot,          800
+.equ           fixO,           816
+.equ           fiyO,           832
+.equ           fizO,           848
+.equ           fixH1,          864
+.equ           fiyH1,          880
+.equ           fizH1,          896
+.equ           fixH2,          912
+.equ           fiyH2,          928
+.equ           fizH2,          944
+.equ           fjxO,           960
+.equ           fjyO,           976
+.equ           fjzO,           992
+.equ           fjxH1,         1008
+.equ           fjyH1,         1024
+.equ           fjzH1,         1040
+.equ           fjxH2,         1056
+.equ           fjyH2,         1072
+.equ           fjzH2,         1088
+.equ           half,          1104
+.equ           three,         1120
+.equ           rsqOO,         1136
+.equ           rsqOH1,        1152
+.equ           rsqOH2,        1168
+.equ           rsqH1O,        1184
+.equ           rsqH1H1,       1200
+.equ           rsqH1H2,       1216
+.equ           rsqH2O,        1232
+.equ           rsqH2H1,       1248
+.equ           rsqH2H2,       1264
+.equ           rinvOO,        1280
+.equ           rinvOH1,       1296
+.equ           rinvOH2,       1312
+.equ           rinvH1O,       1328
+.equ           rinvH1H1,      1344
+.equ           rinvH1H2,      1360
+.equ           rinvH2O,       1376
+.equ           rinvH2H1,      1392
+.equ           rinvH2H2,      1408     
+.equ           is3,           1424
+.equ           ii3,           1428
+.equ           innerjjnr,     1432
+.equ           innerk,        1436
+.equ           salign,        1440                                                     
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 1444           /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movss xmm3, [ebp +tabscale]
+       movaps [esp + half],  xmm0
+       movaps [esp + two],  xmm1
+       movaps [esp + three], xmm2
+       shufps xmm3, xmm3, 0
+       movaps [esp + tsc],  xmm3
+
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       movss xmm4, xmm3        
+       movss xmm5, [edx + ebx*4 + 4]   
+       movss xmm6, [ebp + facel]
+       mulss  xmm3, xmm3
+       mulss  xmm4, xmm5
+       mulss  xmm5, xmm5
+       mulss  xmm3, xmm6
+       mulss  xmm4, xmm6
+       mulss  xmm5, xmm6
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + qqOO], xmm3
+       movaps [esp + qqOH], xmm4
+       movaps [esp + qqHH], xmm5               
+
+.i3030_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx  
+       
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       addss xmm3, [eax + ebx*4]
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]           
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       movss xmm3, xmm0
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]          
+       addss xmm3, [eax + ebx*4 + 24]
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]          
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+
+       /* clear vctot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3030_unroll_loop
+       jmp   .i3030_single_check
+.i3030_unroll_loop:    
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4] 
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+       
+       /* move j coordinates to local temp variables */
+       movlps xmm2, [esi + eax*4]
+       movlps xmm3, [esi + eax*4 + 12]
+       movlps xmm4, [esi + eax*4 + 24]
+
+       movlps xmm5, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 12]
+       movlps xmm7, [esi + ebx*4 + 24]
+
+       movhps xmm2, [esi + ecx*4]
+       movhps xmm3, [esi + ecx*4 + 12]
+       movhps xmm4, [esi + ecx*4 + 24]
+
+       movhps xmm5, [esi + edx*4]
+       movhps xmm6, [esi + edx*4 + 12]
+       movhps xmm7, [esi + edx*4 + 24]
+
+       /* current state: */    
+       /* xmm2= jxOa  jyOa  jxOc  jyOc */
+       /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+       /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+       /* xmm5= jxOb  jyOb  jxOd  jyOd */
+       /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+       /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+       
+       movaps xmm0, xmm2
+       movaps xmm1, xmm3
+       unpcklps xmm0, xmm5     /* xmm0= jxOa  jxOb  jyOa  jyOb */
+       unpcklps xmm1, xmm6     /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+       unpckhps xmm2, xmm5     /* xmm2= jxOc  jxOd  jyOc  jyOd */
+       unpckhps xmm3, xmm6     /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+       movaps xmm5, xmm4
+       movaps   xmm6, xmm0
+       unpcklps xmm4, xmm7     /* xmm4= jxH2a jxH2b jyH2a jyH2b */             
+       unpckhps xmm5, xmm7     /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+       movaps   xmm7, xmm1
+       movlhps  xmm0, xmm2     /* xmm0= jxOa  jxOb  jxOc  jxOd */
+       movaps [esp + jxO], xmm0
+       movhlps  xmm2, xmm6     /* xmm2= jyOa  jyOb  jyOc  jyOd */
+       movaps [esp + jyO], xmm2
+       movlhps  xmm1, xmm3
+       movaps [esp + jxH1], xmm1
+       movhlps  xmm3, xmm7
+       movaps   xmm6, xmm4
+       movaps [esp + jyH1], xmm3
+       movlhps  xmm4, xmm5
+       movaps [esp + jxH2], xmm4
+       movhlps  xmm5, xmm6
+       movaps [esp + jyH2], xmm5
+
+       movss  xmm0, [esi + eax*4 + 8]
+       movss  xmm1, [esi + eax*4 + 20]
+       movss  xmm2, [esi + eax*4 + 32]
+
+       movss  xmm3, [esi + ecx*4 + 8]
+       movss  xmm4, [esi + ecx*4 + 20]
+       movss  xmm5, [esi + ecx*4 + 32]
+
+       movhps xmm0, [esi + ebx*4 + 4]
+       movhps xmm1, [esi + ebx*4 + 16]
+       movhps xmm2, [esi + ebx*4 + 28]
+       
+       movhps xmm3, [esi + edx*4 + 4]
+       movhps xmm4, [esi + edx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 28]
+       
+       shufps xmm0, xmm3, 0b11001100
+       shufps xmm1, xmm4, 0b11001100
+       shufps xmm2, xmm5, 0b11001100
+       movaps [esp + jzO],  xmm0
+       movaps [esp + jzH1],  xmm1
+       movaps [esp + jzH2],  xmm2
+
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixO]
+       movaps xmm4, [esp + iyO]
+       movaps xmm5, [esp + izO]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxOH1], xmm3
+       movaps [esp + dyOH1], xmm4
+       movaps [esp + dzOH1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOO], xmm0
+       movaps [esp + rsqOH1], xmm3
+
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       subps  xmm3, [esp + jxO]
+       subps  xmm4, [esp + jyO]
+       subps  xmm5, [esp + jzO]
+       movaps [esp + dxOH2], xmm0
+       movaps [esp + dyOH2], xmm1
+       movaps [esp + dzOH2], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1O], xmm3
+       movaps [esp + dyH1O], xmm4
+       movaps [esp + dzH1O], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOH2], xmm0
+       movaps [esp + rsqH1O], xmm3
+
+       movaps xmm0, [esp + ixH1]
+       movaps xmm1, [esp + iyH1]
+       movaps xmm2, [esp + izH1]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH1]
+       subps  xmm1, [esp + jyH1]
+       subps  xmm2, [esp + jzH1]
+       subps  xmm3, [esp + jxH2]
+       subps  xmm4, [esp + jyH2]
+       subps  xmm5, [esp + jzH2]
+       movaps [esp + dxH1H1], xmm0
+       movaps [esp + dyH1H1], xmm1
+       movaps [esp + dzH1H1], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1H2], xmm3
+       movaps [esp + dyH1H2], xmm4
+       movaps [esp + dzH1H2], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqH1H1], xmm0
+       movaps [esp + rsqH1H2], xmm3
+
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxH2O], xmm0
+       movaps [esp + dyH2O], xmm1
+       movaps [esp + dzH2O], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH2H1], xmm3
+       movaps [esp + dyH2H1], xmm4
+       movaps [esp + dzH2H1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       movaps [esp + rsqH2O], xmm0
+       movaps [esp + rsqH2H1], xmm4
+
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       movaps [esp + dxH2H2], xmm0
+       movaps [esp + dyH2H2], xmm1
+       movaps [esp + dzH2H2], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2
+       movaps [esp + rsqH2H2], xmm0
+               
+       /* start doing invsqrt use rsq values in xmm0, xmm4 */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvH2H2 */
+       mulps   xmm7, [esp + half] /* rinvH2H1 */
+       movaps  [esp + rinvH2H2], xmm3
+       movaps  [esp + rinvH2H1], xmm7
+               
+       rsqrtps xmm1, [esp + rsqOO]
+       rsqrtps xmm5, [esp + rsqOH1]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOO]
+       mulps   xmm5, [esp + rsqOH1]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvOO], xmm3
+       movaps  [esp + rinvOH1], xmm7
+       
+       rsqrtps xmm1, [esp + rsqOH2]
+       rsqrtps xmm5, [esp + rsqH1O]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOH2]
+       mulps   xmm5, [esp + rsqH1O]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvOH2], xmm3
+       movaps  [esp + rinvH1O], xmm7
+       
+       rsqrtps xmm1, [esp + rsqH1H1]
+       rsqrtps xmm5, [esp + rsqH1H2]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqH1H1]
+       mulps   xmm5, [esp + rsqH1H2]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvH1H1], xmm3
+       movaps  [esp + rinvH1H2], xmm7
+       
+       rsqrtps xmm1, [esp + rsqH2O]
+       movaps  xmm2, xmm1
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, [esp + rsqH2O]
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2
+       mulps   xmm3, [esp + half] 
+       movaps  [esp + rinvH2O], xmm3
+
+       /* start with OO interaction */
+       movaps xmm0, [esp + rinvOO]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqOO] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]
+               
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld   mm6, 2
+       pslld   mm7, 2
+       
+        movd mm0, eax
+        movd mm1, ebx
+        movd mm2, ecx
+        movd mm3, edx
+
+        mov  esi, [ebp + VFtab]
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOO]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+        /* increment vcoul - then we can get rid of xmm5 */
+        /* update vctot */
+        addps  xmm5, [esp + vctot]
+       xorps  xmm2, xmm2
+        movaps [esp + vctot], xmm5
+       mulps  xmm3, [esp + tsc]
+       
+       subps  xmm2, xmm3
+       mulps  xmm0, xmm2
+       
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0               
+
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOO]
+       mulps xmm1, [esp + dyOO]
+       mulps xmm2, [esp + dzOO]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H1 interaction */
+       movaps xmm0, [esp + rinvOH1]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqOH1] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+       
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH1]
+       mulps xmm1, [esp + dyOH1]
+       mulps xmm2, [esp + dzOH1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H2 interaction */ 
+       movaps xmm0, [esp + rinvOH2]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqOH2] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH2]
+       mulps xmm1, [esp + dyOH2]
+       mulps xmm2, [esp + dzOH2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* H1-O interaction */
+       movaps xmm0, [esp + rinvH1O]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH1O] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH1O]
+       mulps xmm1, [esp + dyH1O]
+       mulps xmm2, [esp + dzH1O]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H1 interaction */
+       movaps xmm0, [esp + rinvH1H1]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH1H1] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH1H1]
+       mulps xmm1, [esp + dyH1H1]
+       mulps xmm2, [esp + dzH1H1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H2 interaction */
+       movaps xmm0, [esp + rinvH1H2]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH1H2] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH1H2]
+       mulps xmm1, [esp + dyH1H2]
+       mulps xmm2, [esp + dzH1H2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H2-O interaction */
+       movaps xmm0, [esp + rinvH2O]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH2O] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH2O]
+       mulps xmm1, [esp + dyH2O]
+       mulps xmm2, [esp + dzH2O]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H1 interaction */
+       movaps xmm0, [esp + rinvH2H1]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH2H1] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH2H1]
+       mulps xmm1, [esp + dyH2H1]
+       mulps xmm2, [esp + dzH2H1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H2 interaction */
+       movaps xmm0, [esp + rinvH2H2]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH2H2] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH2H2]
+       mulps xmm1, [esp + dyH2H2]
+       mulps xmm2, [esp + dzH2H2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       mov edi, [ebp +faction]
+
+       movd eax, mm0
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+       
+       /* Did all interactions - now update j forces */
+       /* 4 j waters with three atoms each - first do a & b j particles */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpcklps xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjxOb  fjyOb */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOb  fjyOb */ 
+       unpcklps xmm1, xmm2        /* xmm1= fjzOa  fjxH1a fjzOb  fjxH1b */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpcklps xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1b fjzH1b */
+       unpcklps xmm5, xmm6        /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjzOa  fjxH1a */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOb  fjyOb  fjzOb  fjxH1b */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+       movups   xmm1, [edi + eax*4]
+       movups   xmm2, [edi + eax*4 + 16]
+       movups   xmm5, [edi + ebx*4]
+       movups   xmm6, [edi + ebx*4 + 16]
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + eax*4 + 32]
+       movss    xmm3, [edi + ebx*4 + 32]
+       
+       movaps   xmm4, [esp + fjzH2]
+       movaps   xmm7, xmm4
+       shufps   xmm7, xmm7, 1
+       
+       movups   [edi + eax*4],     xmm1
+       movups   [edi + eax*4 + 16],xmm2
+       movups   [edi + ebx*4],     xmm5
+       movups   [edi + ebx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + eax*4 + 32], xmm0
+       movss    [edi + ebx*4 + 32], xmm3       
+
+       /* then do the second pair (c & d) */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpckhps xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjxOd  fjyOd */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOd  fjyOd */ 
+       unpckhps xmm1, xmm2        /* xmm1= fjzOc  fjxH1c fjzOd  fjxH1d */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpckhps xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1d fjzH1d */     
+       unpckhps xmm5, xmm6        /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjzOc  fjxH1c */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOd  fjyOd  fjzOd  fjxH1d */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c  */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+       movups   xmm1, [edi + ecx*4]
+       movups   xmm2, [edi + ecx*4 + 16]
+       movups   xmm5, [edi + edx*4]
+       movups   xmm6, [edi + edx*4 + 16]
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + ecx*4 + 32]
+       movss    xmm3, [edi + edx*4 + 32]
+       
+       movaps   xmm4, [esp + fjzH2]
+       movaps   xmm7, xmm4
+       shufps   xmm4, xmm4, 0b10
+       shufps   xmm7, xmm7, 0b11
+       movups   [edi + ecx*4],     xmm1
+       movups   [edi + ecx*4 + 16],xmm2
+       movups   [edi + edx*4],     xmm5
+       movups   [edi + edx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + ecx*4 + 32], xmm0
+       movss    [edi + edx*4 + 32], xmm3       
+       
+       /* should we do one more iteration? innerk counts the j entries left; the unrolled loop consumes 4 per pass */
+       sub   dword ptr [esp + innerk],  4     /* explicit size: a memory/immediate form has no register to infer it from */
+       jl    .i3030_single_check              /* went negative: fewer than 4 were left */
+       jmp   .i3030_unroll_loop
+.i3030_single_check:
+       add   dword ptr [esp + innerk],  4     /* undo the speculative subtract; ZF set if nothing is left */
+       jnz   .i3030_single_loop               /* leftovers are handled one j entry at a time */
+       jmp   .i3030_updateouterdata
+.i3030_single_loop:
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]                 /* eax = jjnr[k], the j index for this pass */
+       add   dword ptr [esp + innerjjnr],  4  /* step the stored pointer to the next 4-byte entry */
+
+       mov esi, [ebp + pos]             /* esi = pos argument; j coordinates are read from [esi + eax*4] below */
+       lea   eax, [eax + eax*2]         /* eax = 3*jnr; the *4 scale in later addressing makes it a byte offset */
+
+       /* fetch j coordinates */
+       xorps xmm3, xmm3
+       xorps xmm4, xmm4
+       xorps xmm5, xmm5
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + eax*4 + 4]
+       movss xmm5, [esi + eax*4 + 8]
+
+       movlps xmm6, [esi + eax*4 + 12]
+       movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+       /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+       movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+       shufps xmm6, xmm6, 0b11011000    /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+       movlhps xmm3, xmm6              /* xmm3= jxO   0  jxH1 jxH2 */
+       movaps  xmm0, [esp + ixO]     
+       movaps  xmm1, [esp + iyO]
+       movaps  xmm2, [esp + izO]       
+       shufps  xmm4, xmm6, 0b11100100 /* xmm4= jyO   0   jyH1 jyH2 */
+       shufps xmm5, xmm7, 0b11000100  /* xmm5= jzO   0   jzH1 jzH2 */
+       /* store all j coordinates in jO */ 
+       movaps [esp + jxO], xmm3
+       movaps [esp + jyO], xmm4
+       movaps [esp + jzO], xmm5
+       subps  xmm0, xmm3
+       subps  xmm1, xmm4
+       subps  xmm2, xmm5
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2        /* have rsq in xmm0 */
+       
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       movaps  xmm2, xmm1      
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, xmm0
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2                                                      
+       mulps   xmm3, [esp + half] /* rinv iO - j water */
+
+       movaps  xmm1, xmm3
+       mulps   xmm1, xmm0      /* xmm1=r */
+       movaps  xmm0, xmm3      /* xmm0=rinv */
+       mulps  xmm1, [esp + tsc]
+       
+       movhlps xmm2, xmm1      
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld   mm6, 2
+       pslld   mm7, 2
+       
+        movd ebx, mm6
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd edx, mm7          /* table indices in ebx,ecx,edx */
+
+       mov esi, [ebp + VFtab]
+       
+        movlps xmm5, [esi + ebx*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + ebx*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+
+       xorps  xmm3, xmm3
+       /* fetch charges to xmm3 (temporary) */
+       movss   xmm3, [esp + qqOO]
+       movhps  xmm3, [esp + qqOH]
+               
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+       
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm2, xmm2
+       mulps  xmm3, [esp + tsc]
+
+       subps  xmm2, xmm3
+       mulps  xmm0, xmm2
+       
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0                       
+
+       mulps   xmm0, [esp + dxOO]
+       mulps   xmm1, [esp + dyOO]
+       mulps   xmm2, [esp + dzOO]
+       /* initial update for j forces */
+       xorps   xmm3, xmm3
+       xorps   xmm4, xmm4
+       xorps   xmm5, xmm5
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixO]
+       addps   xmm1, [esp + fiyO]
+       addps   xmm2, [esp + fizO]
+       movaps  [esp + fixO], xmm0
+       movaps  [esp + fiyO], xmm1
+       movaps  [esp + fizO], xmm2
+
+       
+       /* Done with i O. Now do i H1 & H2 simultaneously; first get the i particle coords: */
+       movaps  xmm0, [esp + ixH1]
+       movaps  xmm1, [esp + iyH1]
+       movaps  xmm2, [esp + izH1]      
+       movaps  xmm3, [esp + ixH2] 
+       movaps  xmm4, [esp + iyH2] 
+       movaps  xmm5, [esp + izH2] 
+       subps   xmm0, [esp + jxO]
+       subps   xmm1, [esp + jyO]
+       subps   xmm2, [esp + jzO]
+       subps   xmm3, [esp + jxO]
+       subps   xmm4, [esp + jyO]
+       subps   xmm5, [esp + jzO]
+       movaps [esp + dxH1O], xmm0
+       movaps [esp + dyH1O], xmm1
+       movaps [esp + dzH1O], xmm2
+       movaps [esp + dxH2O], xmm3
+       movaps [esp + dyH2O], xmm4
+       movaps [esp + dzH2O], xmm5
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       mulps xmm3, xmm3
+       mulps xmm4, xmm4
+       mulps xmm5, xmm5
+       addps xmm0, xmm1
+       addps xmm4, xmm3
+       addps xmm0, xmm2        /* have rsqH1 in xmm0 */
+       addps xmm4, xmm5        /* have rsqH2 in xmm4 */
+
+       /* start with H1, save H2 data */
+       movaps [esp + rsqH2O], xmm4
+       
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinv H1 - j water */
+       mulps   xmm7, [esp + half] /* rinv H2 - j water */ 
+
+       /* start with H1, save H2 data */
+       movaps [esp + rinvH2O], xmm7
+
+       movaps xmm1, xmm3
+       mulps  xmm1, xmm0       /* xmm1=r */
+       movaps xmm0, xmm3       /* xmm0=rinv */
+       mulps  xmm1, [esp + tsc]
+       
+       movhlps xmm2, xmm1      
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd ebx, mm6
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd edx, mm7          /* table indices in ebx,ecx,edx */
+
+        movlps xmm5, [esi + ebx*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + ebx*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+
+       xorps  xmm3, xmm3
+       /* fetch charges to xmm3 (temporary) */
+       movss   xmm3, [esp + qqOH]
+       movhps  xmm3, [esp + qqHH]
+               
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5     
+
+        xorps  xmm1, xmm1
+
+        mulps xmm3, [esp + tsc]
+        mulps xmm3, xmm0
+        subps  xmm1, xmm3
+       
+       movaps  xmm0, xmm1
+       movaps  xmm2, xmm1
+       mulps   xmm0, [esp + dxH1O]
+       mulps   xmm1, [esp + dyH1O]
+       mulps   xmm2, [esp + dzH1O]
+       /* update forces H1 - j water */
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH1]
+       addps   xmm1, [esp + fiyH1]
+       addps   xmm2, [esp + fizH1]
+       movaps  [esp + fixH1], xmm0
+       movaps  [esp + fiyH1], xmm1
+       movaps  [esp + fizH1], xmm2
+       /* do table for H2 - j water interaction */
+       movaps xmm0, [esp + rinvH2O]
+       movaps xmm1, [esp + rsqH2O]
+       mulps  xmm1, xmm0       /* xmm0=rinv, xmm1=r */
+       mulps  xmm1, [esp + tsc]
+       
+       movhlps xmm2, xmm1      
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd ebx, mm6
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd edx, mm7          /* table indices in ebx,ecx,edx */
+
+        movlps xmm5, [esi + ebx*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + ebx*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+
+       xorps  xmm3, xmm3
+       /* fetch charges to xmm3 (temporary) */
+       movss   xmm3, [esp + qqOH]
+       movhps  xmm3, [esp + qqHH]
+               
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5     
+
+        xorps  xmm1, xmm1
+
+        mulps xmm3, [esp + tsc]
+        mulps xmm3, xmm0
+        subps  xmm1, xmm3
+       
+       movaps  xmm0, xmm1
+       movaps  xmm2, xmm1
+       
+       mulps   xmm0, [esp + dxH2O]
+       mulps   xmm1, [esp + dyH2O]
+       mulps   xmm2, [esp + dzH2O]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       mov     esi, [ebp + faction]
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH2]
+       addps   xmm1, [esp + fiyH2]
+       addps   xmm2, [esp + fizH2]
+       movaps  [esp + fixH2], xmm0
+       movaps  [esp + fiyH2], xmm1
+       movaps  [esp + fizH2], xmm2
+
+       /* update j water forces from local variables */
+       movlps  xmm0, [esi + eax*4]
+       movlps  xmm1, [esi + eax*4 + 12]
+       movhps  xmm1, [esi + eax*4 + 24]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       movaps  xmm6, xmm5
+       movaps  xmm7, xmm5
+       shufps  xmm6, xmm6, 0b10
+       shufps  xmm7, xmm7, 0b11
+       addss   xmm5, [esi + eax*4 + 8]
+       addss   xmm6, [esi + eax*4 + 20]
+       addss   xmm7, [esi + eax*4 + 32]
+       movss   [esi + eax*4 + 8], xmm5
+       movss   [esi + eax*4 + 20], xmm6
+       movss   [esi + eax*4 + 32], xmm7
+       movaps   xmm5, xmm3
+       unpcklps xmm3, xmm4
+       unpckhps xmm5, xmm4
+       addps    xmm0, xmm3
+       addps    xmm1, xmm5
+       movlps  [esi + eax*4], xmm0 
+       movlps  [esi + eax*4 + 12], xmm1 
+       movhps  [esi + eax*4 + 24], xmm1 
+       
+       dec dword ptr [esp + innerk]
+       jz    .i3030_updateouterdata
+       jmp   .i3030_single_loop
+.i3030_updateouterdata:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate  Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO] 
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000      
+
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* increment fshift force */ 
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3030_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i3030_outer
+.i3030_end:
+       emms
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp, 1444
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+       
+
+
+
+/*
+ * inl3100_sse: SSE inner loop combining a table-driven Coulomb term
+ * with a Lennard-Jones (c6/c12) term, per the comments in the body.
+ *
+ * 32-bit code. Arguments are read from the caller's stack through ebp
+ * using the offsets nri..VFtab defined below. eax, ebx, ecx, edx, esi
+ * and edi are pushed on entry and popped again at .i3100_end.
+ * MMX registers are used as scratch; emms is issued on entry and exit.
+ */
+.globl inl3100_sse
+       .type inl3100_sse,@function
+inl3100_sse:   
+       /* offsets (relative to ebp) of the stack arguments */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+       /* stack offsets (relative to esp) for local variables */ 
+       /* esp is rounded down to a 16-byte boundary in the prologue, */
+       /* so the 16-byte slots (ix..three) can be accessed with movaps */
+.equ           ix,               0
+.equ           iy,              16
+.equ           iz,              32
+.equ           iq,              48
+.equ           dx,              64
+.equ           dy,              80
+.equ           dz,              96
+.equ           two,            112
+.equ           six,            128
+.equ           twelve,         144
+.equ           tsc,            160
+.equ           qq,             176     
+.equ           c6,             192
+.equ           c12,            208
+.equ           fscal,          224
+.equ           vctot,          240
+.equ           vnbtot,         256
+.equ           fix,            272
+.equ           fiy,            288
+.equ           fiz,            304
+.equ           half,           320
+.equ           three,          336
+       /* 4-byte scalar slots */
+.equ           is3,            352
+.equ           ii3,            356
+.equ           ntia,           360     
+.equ           innerjjnr,      364
+.equ           innerk,         368
+.equ           salign,         372
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 376            /* local stack space */
+       /* round esp down to a multiple of 16; the adjustment (0-15) is */
+       /* kept in salign so the epilogue can undo it */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       /* copy the constants sse_half..sse_twelve (defined outside this */
+       /* function) into aligned local slots */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movups xmm3, [sse_six]
+       movups xmm4, [sse_twelve]
+       movss xmm5, [ebp + tabscale]
+       movaps [esp + half],  xmm0
+       movaps [esp + two], xmm1
+       movaps [esp + three],  xmm2
+       movaps [esp + six],  xmm3
+       movaps [esp + twelve],  xmm4
+       shufps xmm5, xmm5, 0    /* broadcast tabscale to all four lanes */
+       movaps [esp + tsc], xmm5
+
+       /* assume we have at least one i particle - start directly */   
+.i3100_outer:
+       /* Set up one i particle: shifted position, scaled charge, */
+       /* LJ parameter row offset, zeroed accumulators and the j-list range. */
+       /* Memory/immediate operations carry an explicit "dword ptr" because */
+       /* neither operand otherwise fixes the operand size. */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       /* xmm0-xmm2 = shift vector x,y,z */
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       /* xmm3 = charge[ii]*facel, broadcast to all lanes */
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       mulss xmm3, [ebp + facel]
+       shufps xmm3, xmm3, 0
+
+       /* ntia = 2*ntype*type[ii] */
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       /* add the i position to the shift vector */
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+       
+       /* broadcast each coordinate to all four lanes */
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear vctot, vnbtot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms minus 4 */
+       /* flags still come from "sub edx, 4": mov does not modify them */
+       jge   .i3100_unroll_loop
+       jmp   .i3100_finish_inner
+.i3100_unroll_loop:    
+       /* quad-unroll innerloop here: four j particles per pass, one per lane. */
+       /* Memory/immediate operations carry an explicit "dword ptr" because */
+       /* neither operand otherwise fixes the operand size. */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0 
+       shufps xmm4, xmm7, 0 
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */ 
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       mulps  xmm3, xmm2
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       movaps [esp + qq], xmm3
+       
+       /* look up the parameter pair at nbfp[ntia + 2*type[j]] for each j */
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       /* transpose: first element of each pair to xmm4, second to xmm6 */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm6, xmm7, 0b11011101
+       
+       /* restore jnr1-4 from the mmx temporaries */
+       movd  eax, mm0          
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = four z coordinates */
+       
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = four x coordinates */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = four y coordinates */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rinv = half*lu*(three - rsq*lu*lu), lu being the rsqrtps estimate */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* xmm4=r*tabscale */
+
+       /* split r*tabscale into a truncated integer index and remainder eps */
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 2            /* index*4: each table point is four floats */
+       pslld mm7, 2
+
+       /* park j3 values in mmx registers while eax-edx hold table indices */
+       movd mm0, eax   
+       movd mm1, ebx
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6
+       psrlq mm6, 32
+       movd ecx, mm7
+       psrlq mm7, 32
+       movd ebx, mm6
+       movd edx, mm7
+
+       movlps xmm5, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm5, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* coulomb table ready, in xmm4-xmm7 */         
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* L-J */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm0       /* xmm4=rinvsq */
+
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+
+       movaps xmm6, xmm4
+       mulps  xmm6, xmm4
+
+       movaps [esp + vctot], xmm5 
+
+       mulps  xmm6, xmm4       /* xmm6=rinvsix */
+       movaps xmm4, xmm6
+       mulps  xmm4, xmm4       /* xmm4=rinvtwelve */
+       mulps  xmm6, [esp + c6]         /* xmm6=c6*rinvsix */
+       mulps  xmm4, [esp + c12]        /* xmm4=c12*rinvtwelve */
+       movaps xmm7, [esp + vnbtot]
+       addps  xmm7, xmm4
+       mulps  xmm4, [esp + twelve]
+       subps  xmm7, xmm6       /* vnbtot += c12*rinvtwelve - c6*rinvsix */
+       mulps  xmm3, [esp + tsc]        /* fijC*tabscale */
+       mulps  xmm6, [esp + six]
+       movaps [esp + vnbtot], xmm7
+       subps  xmm4, xmm6
+       mulps  xmm4, xmm0
+       subps  xmm4, xmm3
+       mulps  xmm4, xmm0       /* xmm4=total scalar force factor */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       /* restore j3 values */
+       movd eax, mm0   
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3/xmm4 contain fjx, fjy (z is handled separately below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* tz of j2 to lane 0 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* tz of j3 to lane 0 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* tz of j4 to lane 0 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i3100_finish_inner
+       jmp   .i3100_unroll_loop
+.i3100_finish_inner:
+       /* innerk holds (remaining j particles - 4) on both paths that reach */
+       /* this label; add 4 back to get the real remainder. */
+       /* "dword ptr" is explicit: neither operand otherwise fixes the size. */
+       /* check if at least two particles remain (bit 1 of the remainder) */
+       add   dword ptr [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   .i3100_dopair
+       jmp   .i3100_checksingle
+.i3100_dopair: 
+       /* Handle two leftover j particles in lanes 0-1. qq, c6 and c12 are */
+       /* zeroed in lanes 2-3, so those lanes add nothing to the energy */
+       /* and force accumulators. */
+       mov esi, [ebp + charge]
+        mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       /* "dword ptr" is explicit: neither operand otherwise fixes the size */
+       add   dword ptr [esp + innerjjnr],  8     
+       xorps xmm7, xmm7
+       movss xmm3, [esi + eax*4]               
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0 
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       mulps  xmm3, [esp + iq]
+       movlhps xmm3, xmm7      /* zero lanes 2-3 */
+       movaps [esp + qq], xmm3
+
+       /* look up the parameter pair at nbfp[ntia + 2*type[j]] for both j */
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]
+       movhps xmm6, [esi + edx*4]
+       mov edi, [ebp + pos]    
+       
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       
+       shufps xmm6, xmm6, 0b1101
+       movlhps xmm4, xmm7      /* zero lanes 2-3 */
+       movlhps xmm6, xmm7
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       movlhps xmm3, xmm7
+       
+       shufps xmm2, xmm0, 0
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* z coordinates */
+       
+       shufps xmm0, xmm0, 0b10001000   /* x coordinates */
+       shufps xmm1, xmm1, 0b11011101   /* y coordinates */
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rinv = half*lu*(three - rsq*lu*lu), lu being the rsqrtps estimate */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* xmm4=r*tabscale */
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2            /* index*4: each table point is four floats */
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6
+       psrlq mm6, 32
+       movd edx, mm6
+
+       movlps xmm5, [esi + ecx*4]
+       movhps xmm5, [esi + edx*4] /* got half coulomb table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm7, 0b11011101   /* xmm7 is zero: lanes 2-3 cleared */
+       
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8]
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* L-J */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm0       /* xmm4=rinvsq */
+
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+
+       movaps xmm6, xmm4
+       mulps  xmm6, xmm4
+
+       movaps [esp + vctot], xmm5 
+
+       mulps  xmm6, xmm4       /* xmm6=rinvsix */
+       movaps xmm4, xmm6
+       mulps  xmm4, xmm4       /* xmm4=rinvtwelve */
+       mulps  xmm6, [esp + c6]         /* xmm6=c6*rinvsix */
+       mulps  xmm4, [esp + c12]        /* xmm4=c12*rinvtwelve */
+       movaps xmm7, [esp + vnbtot]
+       addps  xmm7, xmm4
+       mulps  xmm4, [esp + twelve]
+       subps  xmm7, xmm6       /* vnbtot += c12*rinvtwelve - c6*rinvsix */
+       mulps  xmm3, [esp + tsc]        /* fijC*tabscale */
+       mulps  xmm6, [esp + six]
+       movaps [esp + vnbtot], xmm7
+       subps  xmm4, xmm6
+       mulps  xmm4, xmm0
+       subps  xmm4, xmm3
+       mulps  xmm4, xmm0       /* xmm4=total scalar force factor */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's - first j particle (lane 0) */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       /* swap lanes 0 and 1 so the second particle's force is in lane 0 */
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i3100_checksingle:                            
+       /* bit 0 of the leftover count: is there one last j particle? */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz    .i3100_dosingle
+       jmp    .i3100_updateouterdata
+.i3100_dosingle:
+       /* Handle the last single j particle; only lane 0 carries data. */
+       /* qq, c6 and c12 are built with zeros in lanes 1-3. */
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        
+       xorps  xmm6, xmm6
+       movss xmm6, [esi + eax*4]       /* xmm6(0) has the charge */    
+       mulps  xmm6, [esp + iq]
+       movaps [esp + qq], xmm6
+
+       /* look up the parameter pair at nbfp[ntia + 2*type[j]] */
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]
+       movlps xmm6, [esi + ecx*4]
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* lane 0 = first element of the pair */
+       shufps xmm6, xmm6, 0b11111101   /* lane 0 = second element of the pair */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]        /* replace jnr with j3 */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rinv = half*lu*(three - rsq*lu*lu), lu being the rsqrtps estimate */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* xmm4=r*tabscale */
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2            /* index*4: each table point is four floats */
+
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6
+       
+       movlps xmm4, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1    /* second float of each loaded pair to lane 0 */
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* L-J */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm0       /* xmm4=rinvsq */
+
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+
+       movaps xmm6, xmm4
+       mulps  xmm6, xmm4
+
+       movaps [esp + vctot], xmm5 
+
+       mulps  xmm6, xmm4       /* xmm6=rinvsix */
+       movaps xmm4, xmm6
+       mulps  xmm4, xmm4       /* xmm4=rinvtwelve */
+       mulps  xmm6, [esp + c6]         /* xmm6=c6*rinvsix */
+       mulps  xmm4, [esp + c12]        /* xmm4=c12*rinvtwelve */
+       movaps xmm7, [esp + vnbtot]
+       addps  xmm7, xmm4
+       mulps  xmm4, [esp + twelve]
+       subps  xmm7, xmm6       /* vnbtot += c12*rinvtwelve - c6*rinvsix */
+       mulps  xmm3, [esp + tsc]        /* fijC*tabscale */
+       mulps  xmm6, [esp + six]
+       movaps [esp + vnbtot], xmm7
+       subps  xmm4, xmm6
+       mulps  xmm4, xmm0
+       subps  xmm4, xmm3
+       mulps  xmm4, xmm0       /* xmm4=total scalar force factor */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i3100_updateouterdata:
+       /* Reduce the four-lane i-force and energy accumulators to scalars, */
+       /* add them to faction[ii3..], fshift[is3..], Vc[gid] and Vnb[gid], */
+       /* then loop back for the next i particle or finish. */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       /* "dword ptr" is explicit: neither operand otherwise fixes the size */
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last: the nri argument slot doubles as the countdown */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3100_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i3100_outer
+.i3100_end:
+       /* epilogue: leave MMX state, release the locals and restore registers */
+       emms
+       mov eax, [esp + salign]         /* alignment adjustment saved by the prologue */
+       add esp, eax
+       add esp, 376                    /* matches "sub esp, 376" in the prologue */
+       /* pops mirror the prologue pushes in reverse order */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+
+.globl inl3110_sse
+       .type inl3110_sse,@function
+inl3110_sse:   
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+.equ           nsatoms,        84                      
+       /* stack offsets for local variables */ 
+       /* offsets are relative to esp after it has been aligned down to 16 bytes for sse (movaps) use */
+.equ           ix,               0
+.equ           iy,              16
+.equ           iz,              32
+.equ           iq,              48
+.equ           dx,              64
+.equ           dy,              80
+.equ           dz,              96
+.equ           two,            112
+.equ           tsc,            128
+.equ           qq,             144     
+.equ           c6,             160
+.equ           c12,            176
+.equ           six,            192
+.equ           twelve,         208
+.equ           fscal,          224
+.equ           vctot,          240
+.equ           vnbtot,         256
+.equ           fix,            272
+.equ           fiy,            288
+.equ           fiz,            304
+.equ           half,           320
+.equ           three,          336
+.equ           is3,            352
+.equ           ii3,            356
+.equ           shX,            360
+.equ           shY,            364
+.equ           shZ,            368
+.equ           ntia,           372     
+.equ           innerjjnr0,     376
+.equ           innerk0,        380     
+.equ           innerjjnr,      384
+.equ           innerk,         388
+.equ           salign,         392                                                     
+.equ           nsvdwc,         396
+.equ           nscoul,         400
+.equ           nsvdw,          404
+.equ           solnr,          408             
+       push ebp
+       mov ebp,esp     
+       push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 412            /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movups xmm3, [sse_six]
+       movups xmm4, [sse_twelve]
+       movss xmm5, [ebp + tabscale]
+       movaps [esp + half],  xmm0
+       movaps [esp + two], xmm1
+       movaps [esp + three], xmm2
+       movaps [esp + six],  xmm3
+       movaps [esp + twelve], xmm4
+       shufps xmm5, xmm5, 0
+       movaps [esp + tsc], xmm5
+
+       /* assume we have at least one i particle - start directly */   
+.i3110_outer:
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movlps xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 8] 
+       movlps [esp + shX], xmm0
+       movss [esp + shZ], xmm1
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   eax, [ebp + nsatoms]
+       add   [ebp + nsatoms],  12
+       mov   ecx, [eax]        
+       mov   edx, [eax + 4]
+       mov   eax, [eax + 8]    
+       sub   ecx, eax
+       sub   eax, edx
+       
+       mov   [esp + nsvdwc], edx
+       mov   [esp + nscoul], eax
+       mov   [esp + nsvdw], ecx
+               
+       /* clear potential */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       mov   [esp + solnr],  ebx
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+
+       mov   ecx, [esp + nsvdwc]
+       cmp   ecx,  0
+       jnz   .i3110_mno_vdwc
+       jmp   .i3110_testcoul
+.i3110_mno_vdwc:
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       mulss xmm3, [ebp + facel]
+       shufps xmm3, xmm3, 0
+
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+       
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear i forces */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+       
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3110_unroll_vdwc_loop
+       jmp   .i3110_finish_vdwc_inner
+.i3110_unroll_vdwc_loop:       
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0 
+       shufps xmm4, xmm7, 0 
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */ 
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       mulps  xmm3, xmm2
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       movaps [esp + qq], xmm3
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm6, xmm7, 0b11011101
+       
+       movd  eax, mm0          
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000
+       
+       shufps xmm0, xmm5, 0b10001000
+       shufps xmm1, xmm5, 0b11011101           
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 2
+       pslld mm7, 2
+
+       movd mm0, eax   
+       movd mm1, ebx
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6
+       psrlq mm6, 32
+       movd ecx, mm7
+       psrlq mm7, 32
+       movd ebx, mm6
+       movd edx, mm7
+
+       movlps xmm5, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm5, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* coulomb table ready, in xmm4-xmm7 */         
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* L-J */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm0       /* xmm4=rinvsq */
+
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+
+       movaps xmm6, xmm4
+       mulps  xmm6, xmm4
+
+       movaps [esp + vctot], xmm5 
+
+       mulps  xmm6, xmm4       /* xmm6=rinvsix */
+       movaps xmm4, xmm6
+       mulps  xmm4, xmm4       /* xmm4=rinvtwelve */
+       mulps  xmm6, [esp + c6]
+       mulps  xmm4, [esp + c12]
+       movaps xmm7, [esp + vnbtot]
+       addps  xmm7, xmm4
+       mulps  xmm4, [esp + twelve]
+       subps  xmm7, xmm6
+       mulps  xmm3, [esp + tsc]
+       mulps  xmm6, [esp + six]
+       movaps [esp + vnbtot], xmm7
+       subps  xmm4, xmm6
+       mulps  xmm4, xmm0
+       subps  xmm4, xmm3
+       mulps  xmm4, xmm0
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movd eax, mm0   
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3 and xmm4 contain fjx and fjy (fjz is updated separately below) */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  4
+       jl    .i3110_finish_vdwc_inner
+       jmp   .i3110_unroll_vdwc_loop
+.i3110_finish_vdwc_inner:
+       /* check if at least two particles remain */
+       add   [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   .i3110_dopair_vdwc
+       jmp   .i3110_checksingle_vdwc
+.i3110_dopair_vdwc:    
+       mov esi, [ebp + charge]
+
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   [esp + innerjjnr],  8     
+       xorps xmm7, xmm7
+       movss xmm3, [esi + eax*4]               
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0 
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       mulps  xmm3, [esp + iq]
+       movlhps xmm3, xmm7
+       movaps [esp + qq], xmm3
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]
+       movhps xmm6, [esi + edx*4]
+       mov edi, [ebp + pos]    
+       
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       
+       shufps xmm6, xmm6, 0b1101
+       movlhps xmm4, xmm7
+       movlhps xmm6, xmm7
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       movlhps xmm3, xmm7
+       
+       shufps xmm2, xmm0, 0
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000
+       
+       shufps xmm0, xmm0, 0b10001000
+       shufps xmm1, xmm1, 0b11011101
+                       
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6
+       psrlq mm6, 32
+       movd edx, mm6
+
+       movlps xmm5, [esi + ecx*4]
+       movhps xmm5, [esi + edx*4] /* got half coulomb table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+       
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8]
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* L-J */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm0       /* xmm4=rinvsq */
+
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+
+       movaps xmm6, xmm4
+       mulps  xmm6, xmm4
+
+       movaps [esp + vctot], xmm5 
+
+       mulps  xmm6, xmm4       /* xmm6=rinvsix */
+       movaps xmm4, xmm6
+       mulps  xmm4, xmm4       /* xmm4=rinvtwelve */
+       mulps  xmm6, [esp + c6]
+       mulps  xmm4, [esp + c12]
+       movaps xmm7, [esp + vnbtot]
+       addps  xmm7, xmm4
+       mulps  xmm4, [esp + twelve]
+       subps  xmm7, xmm6
+       mulps  xmm3, [esp + tsc]
+       mulps  xmm6, [esp + six]
+       movaps [esp + vnbtot], xmm7
+       subps  xmm4, xmm6
+       mulps  xmm4, xmm0
+       subps  xmm4, xmm3
+       mulps  xmm4, xmm0
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       mov    edi, [ebp + faction]
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i3110_checksingle_vdwc:                               
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz    .i3110_dosingle_vdwc
+       jmp    .i3110_updateouterdata_vdwc
+.i3110_dosingle_vdwc:
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        
+       xorps  xmm6, xmm6
+       movss xmm6, [esi + eax*4]       /* xmm6(0) has the charge */    
+       mulps  xmm6, [esp + iq]
+       movaps [esp + qq], xmm6
+
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]
+       movlps xmm6, [esi + ecx*4]
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   
+       shufps xmm6, xmm6, 0b11111101   
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6
+                                               
+       movlps xmm4, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* L-J */
+       movaps xmm4, xmm0
+       mulps  xmm4, xmm0       /* xmm4=rinvsq */
+
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+
+       movaps xmm6, xmm4
+       mulps  xmm6, xmm4
+
+       movaps [esp + vctot], xmm5 
+
+       mulps  xmm6, xmm4       /* xmm6=rinvsix */
+       movaps xmm4, xmm6
+       mulps  xmm4, xmm4       /* xmm4=rinvtwelve */
+       mulps  xmm6, [esp + c6]
+       mulps  xmm4, [esp + c12]
+       movaps xmm7, [esp + vnbtot]
+       addps  xmm7, xmm4
+       mulps  xmm4, [esp + twelve]
+       subps  xmm7, xmm6
+       mulps  xmm3, [esp + tsc]
+       mulps  xmm6, [esp + six]
+       movaps [esp + vnbtot], xmm7
+       subps  xmm4, xmm6
+       mulps  xmm4, xmm0
+       subps  xmm4, xmm3
+       mulps  xmm4, xmm0
+
+       mov edi, [ebp +faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i3110_updateouterdata_vdwc:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+
+       /* loop back to mno */
+       dec  dword ptr [esp + nsvdwc]
+       jz  .i3110_testcoul
+       jmp .i3110_mno_vdwc
+.i3110_testcoul:
+       mov  ecx, [esp + nscoul]
+       cmp  ecx,  0
+       jnz  .i3110_mno_coul
+       jmp  .i3110_testvdw
+.i3110_mno_coul:
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       mulss xmm3, [ebp + facel]
+       shufps xmm3, xmm3, 0
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+       
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear i forces */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3110_unroll_coul_loop
+       jmp   .i3110_finish_coul_inner
+
+.i3110_unroll_coul_loop:       
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0 
+       shufps xmm4, xmm7, 0 
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */ 
+       mulps  xmm3, xmm2
+
+       movaps [esp + qq], xmm3 
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000
+       
+       shufps xmm0, xmm5, 0b10001000
+       shufps xmm1, xmm5, 0b11011101           
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 2
+       pslld mm7, 2
+
+       movd mm0, eax   
+       movd mm1, ebx
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6
+       psrlq mm6, 32
+       movd ecx, mm7
+       psrlq mm7, 32
+       movd ebx, mm6
+       movd edx, mm7
+
+       movlps xmm5, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm5, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* coulomb table ready, in xmm4-xmm7 */         
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       xorps  xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movd eax, mm0   
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3/xmm4 contain fjx and fjy; fjz is handled separately below */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  4
+       jl    .i3110_finish_coul_inner
+       jmp   .i3110_unroll_coul_loop
+.i3110_finish_coul_inner:
+       /* check if at least two particles remain */
+       add   [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   .i3110_dopair_coul
+       jmp   .i3110_checksingle_coul
+.i3110_dopair_coul:    
+       mov esi, [ebp + charge]
+
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   [esp + innerjjnr],  8     
+       xorps xmm7, xmm7
+       movss xmm3, [esi + eax*4]               
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0 
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       mulps  xmm3, [esp + iq]
+       movlhps xmm3, xmm7
+       movaps [esp + qq], xmm3
+
+       mov edi, [ebp + pos]    
+       
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       movlhps xmm3, xmm7
+       
+       shufps xmm2, xmm0, 0
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000
+       
+       shufps xmm0, xmm0, 0b10001000
+       shufps xmm1, xmm1, 0b11011101
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6
+       psrlq mm6, 32
+       movd edx, mm6
+
+       movlps xmm5, [esi + ecx*4]
+       movhps xmm5, [esi + edx*4] /* got half coulomb table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+       
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8]
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       xorps  xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i3110_checksingle_coul:                               
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz    .i3110_dosingle_coul
+       jmp    .i3110_updateouterdata_coul
+.i3110_dosingle_coul:
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        
+       xorps  xmm6, xmm6
+       movss xmm6, [esi + eax*4]       /* xmm6(0) has the charge */    
+       mulps  xmm6, [esp + iq]
+       movaps [esp + qq], xmm6
+               
+       lea   eax, [eax + eax*2]
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6
+       
+       movlps xmm4, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       xorps xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i3110_updateouterdata_coul:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* loop back to mno */
+       dec  dword ptr [esp + nscoul]
+       jz  .i3110_testvdw
+       jmp .i3110_mno_coul
+.i3110_testvdw:
+       mov  ecx, [esp + nsvdw]
+       cmp  ecx,  0
+       jnz  .i3110_mno_vdw
+       jmp  .i3110_last_mno
+.i3110_mno_vdw:
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx
+
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+       
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3110_unroll_vdw_loop
+       jmp   .i3110_finish_vdw_inner
+.i3110_unroll_vdw_loop:        
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm6, xmm7, 0b11011101
+       
+       movd  eax, mm0          
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000
+       
+       shufps xmm0, xmm5, 0b10001000
+       shufps xmm1, xmm5, 0b11011101           
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5 */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3/xmm4 contain fjx and fjy; fjz is handled separately below */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  4
+       jl    .i3110_finish_vdw_inner
+       jmp   .i3110_unroll_vdw_loop
+.i3110_finish_vdw_inner:
+       /* check if at least two particles remain */
+       add   [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   .i3110_dopair_vdw
+       jmp   .i3110_checksingle_vdw
+.i3110_dopair_vdw:     
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   [esp + innerjjnr],  8     
+       xorps xmm7, xmm7
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]
+       movhps xmm6, [esi + edx*4]
+       mov edi, [ebp + pos]    
+       
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       
+       shufps xmm6, xmm6, 0b1101
+       movlhps xmm4, xmm7
+       movlhps xmm6, xmm7
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       movlhps xmm3, xmm7
+       
+       shufps xmm2, xmm0, 0
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000
+       
+       shufps xmm0, xmm0, 0b10001000
+       shufps xmm1, xmm1, 0b11011101
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5 */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i3110_checksingle_vdw:                                
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz    .i3110_dosingle_vdw
+       jmp    .i3110_updateouterdata_vdw
+.i3110_dosingle_vdw:
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        
+       xorps  xmm6, xmm6
+
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]
+       movlps xmm6, [esi + ecx*4]
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   
+       shufps xmm6, xmm6, 0b11111101   
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rcpps xmm5, xmm4
+       /* 1/x lookup seed in xmm5 */
+       movaps xmm0, [esp + two]
+       mulps xmm4, xmm5
+       subps xmm0, xmm4
+       mulps xmm0, xmm5        /* xmm0=rinvsq */
+       movaps xmm4, xmm0
+       
+       movaps xmm1, xmm0
+       mulps  xmm1, xmm0
+       mulps  xmm1, xmm0       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm5, xmm2
+       subps  xmm5, xmm1       /* vnb=vnb12-vnb6 */
+       addps  xmm5, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       subps  xmm2, xmm1
+       mulps  xmm4, xmm2       /* xmm4=total fscal */
+       
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movaps [esp + vnbtot], xmm5
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       mov edi, [ebp +faction]
+
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+.i3110_updateouterdata_vdw:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+       
+       /* loop back to mno */
+       dec dword ptr [esp + nsvdw]
+       jz  .i3110_last_mno
+       jmp .i3110_mno_vdw
+.i3110_last_mno:       
+       /* Per-i-particle energy bookkeeping: reduce the four-lane vctot and */
+       /* vnbtot accumulators to scalars, add them to Vc[ggid] and Vnb[ggid], */
+       /* then either start the next outer iteration or leave the routine. */
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* edx = current pointer into gid[] */
+       mov   edx, [edx]            /* edx = group index it points at */
+       /* explicit operand size: a memory destination with an immediate is */
+       /* otherwise ambiguous to the assembler */
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* horizontal add of the four lanes */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1    /* lane 1 -> lane 0 */
+       addss  xmm7, xmm6       /* lane 0 = sum of all four lanes */
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* horizontal add of the four lanes */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1    /* lane 1 -> lane 0 */
+       addss  xmm7, xmm6       /* lane 0 = sum of all four lanes */
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last: the nri argument slot is used as a down-counter */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3110_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i3110_outer
+.i3110_end:
+       /* Epilogue: leave MMX state, release the local frame and restore */
+       /* the saved registers in reverse order before returning. */
+       emms                    /* clear MMX state before returning */
+       /* NOTE(review): salign presumably holds the alignment padding the */
+       /* prologue subtracted from esp - prologue not visible here, confirm */
+       mov eax, [esp + salign]
+       add esp, eax
+       /* NOTE(review): 412 presumably matches the local stack size reserved */
+       /* by the prologue of this routine - confirm against the prologue */
+       add esp, 412
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax                /* eax is restored too, so no value is returned in it */
+       leave
+       ret
+
+
+
+
+/*
+ * inl3120_sse - entry point, frame layout and one-time setup.
+ *
+ * Arguments are read from the caller's stack through ebp (first argument
+ * at ebp+8), in the order given by the .equ list below: nri, iinr, jindex,
+ * jjnr, shift, shiftvec, fshift, gid, pos, faction, charge, facel, Vc,
+ * type, ntype, nbfp, Vnb, tabscale, VFtab.
+ *
+ * The i particle is handled as three sites (O, H1, H2). Judging by the
+ * comments in the inner loop, Coulomb is evaluated from the VFtab table
+ * and Lennard-Jones analytically, with L-J applied to the O site only.
+ *
+ * Locals live in a 824-byte frame whose base is rounded down to a 16-byte
+ * boundary so that movaps can be used on every 16-byte slot.
+ */
+.globl inl3120_sse
+       .type inl3120_sse,@function
+inl3120_sse:   
+       /* offsets of the stack arguments relative to ebp */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72      
+.equ           tabscale,       76      
+.equ           VFtab,          80      
+       /* stack offsets for local variables (relative to the aligned esp) */ 
+       /* bottom of stack is 16-byte aligned for sse use */
+       /* slots 0..784 are 16 bytes each (one xmm register) */
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           iqO,            144 
+.equ           iqH,            160 
+.equ           dxO,            176
+.equ           dyO,            192
+.equ           dzO,            208     
+.equ           dxH1,           224
+.equ           dyH1,           240
+.equ           dzH1,           256     
+.equ           dxH2,           272
+.equ           dyH2,           288
+.equ           dzH2,           304     
+.equ           qqO,            320
+.equ           qqH,            336
+.equ           rinvO,          352
+.equ           rinvH1,         368
+.equ           rinvH2,         384             
+.equ           rO,             400
+.equ           rH1,            416
+.equ           rH2,            432
+.equ           tsc,            448     
+.equ           two,            464
+.equ           c6,             480
+.equ           c12,            496
+.equ           six,            512
+.equ           twelve,         528
+.equ           vctot,          544
+.equ           vnbtot,         560
+.equ           fixO,           576
+.equ           fiyO,           592
+.equ           fizO,           608
+.equ           fixH1,          624
+.equ           fiyH1,          640
+.equ           fizH1,          656
+.equ           fixH2,          672
+.equ           fiyH2,          688
+.equ           fizH2,          704
+.equ           fjx,            720
+.equ           fjy,            736
+.equ           fjz,            752
+.equ           half,           768
+.equ           three,          784
+       /* the remaining slots are 4-byte integers */
+.equ           is3,            800
+.equ           ii3,            804
+.equ           ntia,           808     
+.equ           innerjjnr,      812
+.equ           innerk,         816
+.equ           salign,         820                                                             
+       /* prologue: set up the frame pointer and save all general registers */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 824            /* local stack space */
+       /* round esp down to a multiple of 16 and remember the padding in */
+       /* salign so that the adjustment can be undone later */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms                    /* start from a clean MMX state; mm regs are used as scratch */
+
+       /* fetch the packed constants (sse_half ... sse_twelve are defined */
+       /* outside this routine) and the scalar tabscale argument */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movups xmm3, [sse_six]
+       movups xmm4, [sse_twelve]
+       movss xmm5, [ebp +tabscale]
+       
+       /* keep them in aligned local slots; tabscale is broadcast to all lanes */
+       movaps [esp + half],  xmm0
+       movaps [esp + two],  xmm1
+       movaps [esp + three],  xmm2
+       movaps [esp + six],  xmm3
+       movaps [esp + twelve],  xmm4
+       shufps xmm5, xmm5, 0
+       movaps [esp + tsc], xmm5
+       
+       /* assume we have at least one i particle - start directly */
+       /* the charge products and ntia below are taken from the first i */
+       /* particle only and are not recomputed in the outer loop */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       /* charge[ii]   */
+       movss xmm4, [edx + ebx*4 + 4]   /* charge[ii+1] */
+       movss xmm5, [ebp + facel]
+       mulss  xmm3, xmm5               /* iqO = facel*charge[ii]   */
+       mulss  xmm4, xmm5               /* iqH = facel*charge[ii+1] */
+
+       shufps xmm3, xmm3, 0            /* broadcast to all four lanes */
+       shufps xmm4, xmm4, 0
+       movaps [esp + iqO], xmm3
+       movaps [esp + iqH], xmm4
+       
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]
+       shl   ecx, 1
+       imul  ecx, [ebp + ntype]      /* ecx = ntia = 2*ntype*type[ii0] */
+       mov   [esp + ntia], ecx         
+.i3120_outer:
+       /* Outer loop body, once per i particle: fetch the shift vector, build */
+       /* the shifted and broadcast O/H1/H2 coordinates, zero the */
+       /* accumulators and set up the j-neighbour range for the inner loops. */
+       /* The pointer arguments shift, iinr and jindex are advanced in place; */
+       /* the explicit "dword ptr" removes the operand-size ambiguity of a */
+       /* memory destination combined with an immediate. */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       /* xmm0-xmm2 = shift vector x, y, z (lane 0) */
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx
+
+       /* O site: pos[ii3 .. ii3+2] + shift, broadcast to all four lanes */
+       addss xmm3, [eax + ebx*4]
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]           
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       /* reload the shift into lane 0 of xmm3-xmm5; only lane 0 matters */
+       /* because the values are broadcast again after the add */
+       movss xmm3, xmm0
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       /* H1 site at byte offsets 12..20, H2 site at 24..32 after the O site */
+       addss xmm0, [eax + ebx*4 + 12]
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]          
+       addss xmm3, [eax + ebx*4 + 24]
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]          
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+       
+       /* clear vctot, vnbtot and the i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+       
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2                     /* byte offset of jjnr[nj0] */
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms minus 4 */
+       /* mov leaves the flags alone, so jge still tests the result of the */
+       /* sub above: at least four j atoms -> unrolled loop */
+       jge   .i3120_unroll_loop
+       jmp   .i3120_odd_inner
+.i3120_unroll_loop:
+       /* quad-unroll innerloop here */
+       /* Four j atoms per pass, one per SSE lane in the order eax, ebx, ecx, */
+       /* edx. For each of the three i sites (O, H1, H2) the tabulated */
+       /* Coulomb term is evaluated; the O site additionally gets the */
+       /* analytical L-J term. Forces go to the fixX/fiyX/fizX accumulators */
+       /* and are subtracted from faction[] of the j atoms. */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+
+       /* explicit operand size: memory destination with an immediate */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       shufps xmm3, xmm6, 0 
+       shufps xmm4, xmm7, 0 
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */ 
+       movaps xmm4, xmm3            /* and in xmm4 */
+       mulps  xmm3, [esp + iqO]
+       mulps  xmm4, [esp + iqH]
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       movaps  [esp + qqO], xmm3
+       movaps  [esp + qqH], xmm4
+       
+       /* nbfp index for each j atom: 2*type[jnr] + ntia */
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       /* load the four (c6,c12) pairs and transpose them */
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000   /* even elements: c6  */
+       shufps xmm6, xmm7, 0b11011101   /* odd elements:  c12 */
+       
+       /* get the j atom numbers back */
+       movd  eax, mm0          
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = jz */
+       
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = jx */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = jy */
+
+       /* move ixO-izO to xmm4-xmm6 */
+       movaps xmm4, [esp + ixO]
+       movaps xmm5, [esp + iyO]
+       movaps xmm6, [esp + izO]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxO], xmm4
+       movaps [esp + dyO], xmm5
+       movaps [esp + dzO], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       movaps xmm7, xmm4
+       /* rsqO in xmm7 */
+
+       /* move ixH1-izH1 to xmm4-xmm6 */
+       movaps xmm4, [esp + ixH1]
+       movaps xmm5, [esp + iyH1]
+       movaps xmm6, [esp + izH1]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxH1], xmm4
+       movaps [esp + dyH1], xmm5
+       movaps [esp + dzH1], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm6, xmm5
+       addps xmm6, xmm4
+       /* rsqH1 in xmm6 */
+
+       /* move ixH2-izH2 to xmm3-xmm5 */ 
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+
+       /* calc dr */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+
+       /* store dr */
+       movaps [esp + dxH2], xmm3
+       movaps [esp + dyH2], xmm4
+       movaps [esp + dzH2], xmm5
+       /* square it */
+       mulps xmm3,xmm3
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       addps xmm5, xmm4
+       addps xmm5, xmm3
+       /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+       /* Each rinv below is the rsqrtps seed lu refined by one */
+       /* Newton-Raphson step: rinv = half*lu*(three - rsq*lu*lu) */
+
+       /* start with rsqO - seed to xmm2 */    
+       rsqrtps xmm2, xmm7
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm7      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  [esp + rinvO], xmm4     /* rinvO in xmm4 */
+       mulps   xmm7, xmm4              /* rO = rsqO*rinvO */
+       movaps  [esp + rO], xmm7        
+
+       /* rsqH1 - seed in xmm2 */
+       rsqrtps xmm2, xmm6
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm6      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  [esp + rinvH1], xmm4    /* rinvH1 in xmm4 */
+       mulps   xmm6, xmm4              /* rH1 = rsqH1*rinvH1 */
+       movaps  [esp + rH1], xmm6
+
+       /* rsqH2 - seed to xmm2 */
+       rsqrtps xmm2, xmm5
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm5      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  [esp + rinvH2], xmm4    /* rinvH2 in xmm4 */
+       mulps   xmm5, xmm4              /* rH2 = rsqH2*rinvH2 */
+       movaps  [esp + rH2], xmm5
+
+       /* do O interactions */
+       /* rO is still in xmm7 */
+       mulps   xmm7, [esp + tsc]       /* r*tabscale */
+       movhlps xmm4, xmm7
+       cvttps2pi mm6, xmm7
+       cvttps2pi mm7, xmm4    /* mm6/mm7 contain lu indices */
+       
+       /* convert the truncated indices back to float to get the fraction */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm4, mm7
+        movlhps xmm3, xmm4
+       
+        subps xmm7, xmm3
+
+       movaps xmm1, xmm7       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2           /* index*4: four floats per table point */
+        pslld mm7, 2
+               
+       /* park the j3 indices in mm0-mm3; eax-edx are needed for the table */
+        movd mm0, eax   
+        movd mm1, ebx
+        movd mm2, ecx
+        movd mm3, edx
+
+        mov  esi, [ebp + VFtab]
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqO]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm0, xmm7 /* fijC=FF*qq */
+
+       /* do nontable L-J */
+       movaps xmm2, [esp + rinvO]
+       mulps  xmm2, xmm2       /* xmm2=rinvsq */
+
+        /* at this point xmm5 contains vcoul and xmm0 fijC */
+        /* increment vcoul - then we can get rid of xmm5 */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 
+
+       movaps xmm1, xmm2
+       mulps  xmm1, xmm1
+       mulps  xmm1, xmm2       /* xmm1=rinvsix */
+       movaps xmm4, xmm1
+       mulps  xmm4, xmm4       /* xmm4=rinvtwelve */
+       mulps  xmm1, [esp + c6]
+       mulps  xmm4, [esp + c12]
+       movaps xmm3, xmm4
+       subps  xmm3, xmm1       /* xmm3=vnb12-vnb6 */
+       mulps  xmm1, [esp + six]
+       mulps  xmm4, [esp + twelve]
+       subps  xmm4, xmm1
+       addps  xmm3, [esp + vnbtot]
+       mulps  xmm4, [esp + rinvO]
+       mulps  xmm0, [esp + tsc]
+       subps  xmm4, xmm0
+       movaps [esp + vnbtot], xmm3
+       /* xmm4 = ((twelve*vnb12-six*vnb6)*rinvO - fijC*tabscale)*rinvO */
+       mulps  xmm4, [esp + rinvO]      
+
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4       /* tx in xmm0-xmm2 */
+
+       /* update O forces */
+       movaps xmm3, [esp + fixO]
+       movaps xmm4, [esp + fiyO]
+       movaps xmm7, [esp + fizO]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixO], xmm3
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm7
+       /* update j forces with water O */
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* Done with O interactions - now H1! */
+       /* esi still holds the VFtab base loaded for the O site */
+       movaps xmm7, [esp + rH1]
+       mulps   xmm7, [esp + tsc]
+       movhlps xmm4, xmm7
+       cvttps2pi mm6, xmm7
+       cvttps2pi mm7, xmm4    /* mm6/mm7 contain lu indices */
+       
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm4, mm7
+        movlhps xmm3, xmm4
+       
+        subps xmm7, xmm3
+       movaps xmm1, xmm7       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+               
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm7, xmm0 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm7 fijC */
+        /* increment vcoul */
+       xorps  xmm4, xmm4
+        addps  xmm5, [esp + vctot]
+       mulps  xmm7, [esp + rinvH1]
+        movaps [esp + vctot], xmm5 
+       mulps  xmm7, [esp + tsc]
+       subps xmm4, xmm7        /* xmm4 = -fijC*rinvH1*tabscale */
+
+       movaps xmm0, [esp + dxH1]
+       movaps xmm1, [esp + dyH1]
+       movaps xmm2, [esp + dzH1]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update H1 forces */
+       movaps xmm3, [esp + fixH1]
+       movaps xmm4, [esp + fiyH1]
+       movaps xmm7, [esp + fizH1]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH1], xmm3
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm7
+       /* update j forces with water H1 */
+       addps  xmm0, [esp + fjx]
+       addps  xmm1, [esp + fjy]
+       addps  xmm2, [esp + fjz]
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* Done with H1, finally we do H2 interactions */
+       movaps xmm7, [esp + rH2]
+       mulps   xmm7, [esp + tsc]
+       movhlps xmm4, xmm7
+       cvttps2pi mm6, xmm7
+       cvttps2pi mm7, xmm4    /* mm6/mm7 contain lu indices */
+       
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm4, mm7
+        movlhps xmm3, xmm4
+       
+        subps xmm7, xmm3
+       movaps xmm1, xmm7       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+               
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm7, xmm0 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm7 fijC */
+        /* increment vcoul */
+       xorps  xmm4, xmm4
+        addps  xmm5, [esp + vctot]
+       mulps  xmm7, [esp + rinvH2]
+        movaps [esp + vctot], xmm5 
+       mulps  xmm7, [esp + tsc]
+       subps  xmm4, xmm7       /* xmm4 = -fijC*rinvH2*tabscale */
+
+       movaps xmm0, [esp + dxH2]
+       movaps xmm1, [esp + dyH2]
+       movaps xmm2, [esp + dzH2]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* table lookups are done: restore the j3 indices */
+        movd eax, mm0   
+        movd ebx, mm1
+        movd ecx, mm2
+        movd edx, mm3
+       
+       /* update H2 forces */
+       movaps xmm3, [esp + fixH2]
+       movaps xmm4, [esp + fiyH2]
+       movaps xmm7, [esp + fizH2]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH2], xmm3
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm7
+
+       mov edi, [ebp +faction]
+       /* update j forces: xmm0-xmm2 = O + H1 + H2 contributions */
+       addps xmm0, [esp + fjx]
+       addps xmm1, [esp + fjy]
+       addps xmm2, [esp + fjz]
+
+       /* gather the x,y force pairs of the four j atoms and transpose */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm7, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm7, [edi + edx*4]
+       
+       movaps xmm3, xmm4
+       shufps xmm3, xmm7, 0b10001000
+       shufps xmm4, xmm7, 0b11011101                         
+       /* xmm3 has fjx, xmm4 has fjy */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       /* unpack them back for storing */
+       movaps xmm7, xmm3
+       unpcklps xmm7, xmm4     /* x,y pairs of the 1st and 2nd j atom */
+       unpckhps xmm3, xmm4     /* x,y pairs of the 3rd and 4th j atom */
+       movlps [edi + eax*4], xmm7
+       movlps [edi + ecx*4], xmm3
+       movhps [edi + ebx*4], xmm7
+       movhps [edi + edx*4], xmm3
+       /* finally z forces */
+       movss  xmm0, [edi + eax*4 + 8]
+       movss  xmm1, [edi + ebx*4 + 8]
+       movss  xmm3, [edi + ecx*4 + 8]
+       movss  xmm4, [edi + edx*4 + 8]
+       /* each shufps rotates the next lane of xmm2 into lane 0 */
+       subss  xmm0, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm1, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm3, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm4, xmm2
+       movss  [edi + eax*4 + 8], xmm0
+       movss  [edi + ebx*4 + 8], xmm1
+       movss  [edi + ecx*4 + 8], xmm3
+       movss  [edi + edx*4 + 8], xmm4
+       
+       /* should we do one more iteration? */
+       /* innerk goes negative once fewer than four j atoms remain */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i3120_odd_inner
+       jmp   .i3120_unroll_loop
+.i3120_odd_inner:      
+       /* innerk was biased by -4 for the unrolled loop; adding 4 back gives */
+       /* the number of leftover j atoms. Nonzero -> handle them one at a */
+       /* time, zero -> go straight to the outer-loop update. */
+       /* explicit operand size: memory destination with an immediate */
+       add   dword ptr [esp + innerk],  4
+       jnz   .i3120_odd_loop
+       jmp   .i3120_updateouterdata
+.i3120_odd_loop:
+       /* Remainder loop: one j atom per pass. The three i sites are packed */
+       /* into one register as lane 0 = O, lane 2 = H1, lane 3 = H2. Lane 1 */
+       /* carries zero charge and zero c6/c12 and is dropped when the j */
+       /* force is summed. */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       /* explicit operand size: memory destination with an immediate */
+       add   dword ptr [esp + innerjjnr],  4     
+
+       xorps xmm4, xmm4
+       movss xmm4, [esp + iqO]
+       mov esi, [ebp + charge] 
+       movhps xmm4, [esp + iqH]        /* xmm4 = iqO, 0, iqH, iqH */
+       movss xmm3, [esi + eax*4]       /* charge in xmm3 */
+       shufps xmm3, xmm3, 0
+       mulps xmm3, xmm4
+       movaps [esp + qqO], xmm3        /* use oxygen qq for storage */
+
+       xorps xmm6, xmm6
+       mov esi, [ebp + type]
+       mov ebx, [esi + eax*4]
+       mov esi, [ebp + nbfp]
+       shl ebx, 1      
+       add ebx, [esp + ntia]
+       movlps xmm6, [esi + ebx*4]      /* xmm6 = c6, c12, 0, 0 */
+       movaps xmm7, xmm6
+       shufps xmm6, xmm6, 0b11111100   /* c6 in lane 0, zeros elsewhere */
+       shufps xmm7, xmm7, 0b11111101   /* c12 in lane 0, zeros elsewhere */
+       movaps [esp + c6], xmm6
+       movaps [esp + c12], xmm7
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]        /* replace jnr with j3 */
+       
+       /* move j coords to xmm0-xmm2 */
+       movss xmm0, [esi + eax*4]
+       movss xmm1, [esi + eax*4 + 4]
+       movss xmm2, [esi + eax*4 + 8]
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0    
+       /* lane 0 = O coordinate (movss from memory clears lanes 1-3) */
+       movss xmm3, [esp + ixO]
+       movss xmm4, [esp + iyO]
+       movss xmm5, [esp + izO]
+               
+       /* lanes 2-3 = H1, H2 coordinates */
+       movlps xmm6, [esp + ixH1]
+       movlps xmm7, [esp + ixH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm3, xmm6
+       movlps xmm6, [esp + iyH1]
+       movlps xmm7, [esp + iyH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm4, xmm6
+       movlps xmm6, [esp + izH1]
+       movlps xmm7, [esp + izH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm5, xmm6
+
+       /* dr = i - j for all packed sites */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       
+       movaps [esp + dxO], xmm3
+       movaps [esp + dyO], xmm4
+       movaps [esp + dzO], xmm5
+
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       /* one Newton-Raphson step: rinv = half*lu*(three - rsq*lu*lu) */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       movaps [esp + rinvO], xmm0
+       
+       mulps xmm4, [esp + tsc]
+       movhlps xmm7, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm7    /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm7, mm7
+        movlhps xmm3, xmm7
+
+       subps   xmm4, xmm3      
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+       
+       /* save eax (j3), ecx and edx while they serve as table indices */
+        movd mm0, eax   
+        movd mm1, ecx
+        movd mm2, edx
+
+        mov  esi, [ebp + VFtab]
+        /* only lanes 0, 2 and 3 are looked up */
+        movd eax, mm6
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqO]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm0, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm0 fijC */
+        /* increment vcoul - then we can get rid of xmm5 */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+
+       /* do nontable L-J */
+       /* c6/c12 are nonzero in lane 0 only, so only the O site gets L-J */
+       movaps xmm2, [esp + rinvO]
+       mulps  xmm2, xmm2
+       movaps xmm1, xmm2
+       mulps  xmm1, xmm1
+       mulps  xmm1, xmm2       /* xmm1=rinvsix */
+       movaps xmm4, xmm1
+       mulps  xmm4, xmm4       /* xmm4=rinvtwelve */
+       mulps  xmm1, [esp + c6]
+       mulps  xmm4, [esp + c12]
+       movaps xmm3, xmm4
+       subps  xmm3, xmm1       /* xmm3=vnb12-vnb6 */
+       mulps  xmm1, [esp + six]
+       mulps  xmm4, [esp + twelve]
+       subps  xmm4, xmm1
+       addps  xmm3, [esp + vnbtot]
+       mulps  xmm4, [esp + rinvO]
+       mulps  xmm0, [esp + tsc]
+       subps  xmm4, xmm0
+       movaps [esp + vnbtot], xmm3
+       mulps  xmm4, [esp + rinvO]      
+               
+       /* restore the registers parked before the table lookup */
+        movd eax, mm0   
+        movd ecx, mm1
+        movd edx, mm2  
+               
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4 /* xmm0-xmm2 now contains tx-tz (partial force) */
+       movss  xmm3, [esp + fixO]       
+       movss  xmm4, [esp + fiyO]       
+       movss  xmm5, [esp + fizO]       
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esp + fixO], xmm3       
+       movss  [esp + fiyO], xmm4       
+       movss  [esp + fizO], xmm5       /* updated the O force now do the H's */
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       shufps xmm3, xmm3, 0b11100110   /* bring lane 2 (H1) to lane 0 */
+       shufps xmm4, xmm4, 0b11100110
+       shufps xmm5, xmm5, 0b11100110
+       addss  xmm3, [esp + fixH1]
+       addss  xmm4, [esp + fiyH1]
+       addss  xmm5, [esp + fizH1]
+       movss  [esp + fixH1], xmm3      
+       movss  [esp + fiyH1], xmm4      
+       movss  [esp + fizH1], xmm5      /* updated the H1 force */
+
+       mov edi, [ebp + faction]
+       shufps xmm3, xmm3, 0b11100111   /* bring lane 3 (H2) to lane 0 */
+       shufps xmm4, xmm4, 0b11100111
+       shufps xmm5, xmm5, 0b11100111
+       addss  xmm3, [esp + fixH2]
+       addss  xmm4, [esp + fiyH2]
+       addss  xmm5, [esp + fizH2]
+       movss  [esp + fixH2], xmm3      
+       movss  [esp + fiyH2], xmm4      
+       movss  [esp + fizH2], xmm5      /* updated the H2 force */
+
+       /* the fj's - sum the O, H1 and H2 lanes of tx/ty into xmm0 and of */
+       /* tz into xmm2, then subtract the sums from the j atom's force */
+       xorps  xmm5, xmm5
+       movaps xmm3, xmm0
+       movlps xmm6, [edi + eax*4]
+       movss  xmm7, [edi + eax*4 + 8]
+       unpcklps xmm3, xmm1
+       movlhps  xmm3, xmm5     /* keep the O x,y pair, zero the lane-1 pair */
+       unpckhps xmm0, xmm1     /* H1 and H2 x,y pairs */
+       addps    xmm0, xmm3
+       movhlps  xmm3, xmm0     
+       addps    xmm0, xmm3     /* x,y sum in xmm0 */
+
+       movhlps  xmm1, xmm2
+       addss    xmm2, xmm1
+       shufps   xmm1, xmm1, 1 
+       addss    xmm2, xmm1    /* z sum in xmm2 */
+       subps    xmm6, xmm0
+       subss    xmm7, xmm2
+       
+       movlps [edi + eax*4],     xmm6
+       movss  [edi + eax*4 + 8], xmm7
+
+       /* one leftover j atom done; repeat until innerk reaches zero */
+       dec dword ptr [esp + innerk]
+       jz    .i3120_updateouterdata
+       jmp   .i3120_odd_loop
+.i3120_updateouterdata:        /* inner loops done for this i: flush forces and energies to memory */
+       mov   ecx, [esp + ii3]          /* ecx = float index of the i O atom in faction[] */
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]          /* edx = float index into fshift[] */
+
+       /* Oi: reduce the four lanes of fixO/fiyO/fizO to one value each (xmm0, xmm1, xmm2) */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO]
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0              /* lanes 2-3 -> lanes 0-1 */
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 hold the two half sums */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1            /* lane 1 -> lane 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force: faction[ii3 + 0..2] += (fx, fy, fz) of O */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* start the fshift sum: xmm6 lanes 0-1 = (fx, fy), xmm7 lane 0 = fz */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1              /* lane 2 = fy */
+       shufps  xmm6, xmm6, 0b1000      /* lane 0 = fx, lane 1 = old lane 2 = fy */
+
+       /* H1i: reduce the four lanes of fixH1/fiyH1/fizH1 into xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 hold the two half sums */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force: faction[ii3 + 3..5] += (fx, fy, fz) of H1 */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* add the H1 force to the fshift sum in xmm6/xmm7 */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      /* lanes 0-1 = (fx, fy) */
+       addps   xmm6, xmm0
+
+       /* H2i: reduce the four lanes of fixH2/fiyH2/fizH2 into xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 hold the two half sums */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force: faction[ii3 + 6..8] += (fx, fy, fz) of H2 */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* add the H2 force to the fshift sum in xmm6/xmm7 */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      /* lanes 0-1 = (fx, fy) */
+       addps   xmm6, xmm0
+
+       /* increment fshift force: fshift[is3 + 0..2] += summed O+H1+H2 force */
+       movlps  xmm3, [esi + edx*4]     /* only lanes 0-1 are loaded and stored back */
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       mov   edx, [ebp + gid]          /* edx = current pointer into gid[] */
+       mov   edx, [edx]                /* edx = energy group index */
+       add   dword ptr [ebp + gid],  4 /* advance the 32-bit gid pointer; size made explicit */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* horizontal sum of the four lanes */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* horizontal sum of the four lanes */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* finish if last: nri is used as a down-counter of remaining i entries */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3120_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i3120_outer
+.i3120_end:    /* epilogue: release the local frame, restore registers and return */
+       emms                    /* leave the MMX state empty; mm registers were used as scratch above */
+       mov eax, [esp + salign] /* NOTE(review): presumably the alignment padding saved by the prologue (not visible here) */
+       add esp, eax
+       add esp, 824            /* NOTE(review): must equal the prologue's local frame size - confirm */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax                /* eax is restored as well, so no value is returned in eax */
+       leave
+       ret
+       
+
+       
+.globl inl3130_sse
+       .type inl3130_sse,@function
+inl3130_sse:   /* entry: save registers, build an aligned frame, precompute constants, fall through into .i3130_outer */
+.equ           nri,            8       /* arguments: byte offsets from ebp; the first stack argument is at ebp+8 */
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52      /* read directly with movss, i.e. passed by value */
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76      /* read directly with movss, i.e. passed by value */
+.equ           VFtab,          80
+       /* stack offsets for local variables (relative to esp once the prologue has run) */ 
+       /* esp is rounded down to a 16-byte boundary so the 16-byte slots below can be used with movaps */
+.equ           ixO,              0     /* i atom coordinates, each broadcast to all four lanes */
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           jxO,            144     /* j atom coordinates, one lane per unrolled j entry */
+.equ           jyO,            160
+.equ           jzO,            176
+.equ           jxH1,           192
+.equ           jyH1,           208
+.equ           jzH1,           224 
+.equ           jxH2,           240
+.equ           jyH2,           256
+.equ           jzH2,           272
+.equ           dxOO,           288     /* i-j coordinate differences for the nine atom pairs */
+.equ           dyOO,           304
+.equ           dzOO,           320     
+.equ           dxOH1,          336
+.equ           dyOH1,          352
+.equ           dzOH1,          368     
+.equ           dxOH2,          384
+.equ           dyOH2,          400
+.equ           dzOH2,          416     
+.equ           dxH1O,          432
+.equ           dyH1O,          448
+.equ           dzH1O,          464     
+.equ           dxH1H1,         480
+.equ           dyH1H1,         496
+.equ           dzH1H1,         512     
+.equ           dxH1H2,         528
+.equ           dyH1H2,         544
+.equ           dzH1H2,         560     
+.equ           dxH2O,          576
+.equ           dyH2O,          592
+.equ           dzH2O,          608     
+.equ           dxH2H1,         624
+.equ           dyH2H1,         640
+.equ           dzH2H1,         656     
+.equ           dxH2H2,         672
+.equ           dyH2H2,         688
+.equ           dzH2H2,         704
+.equ           qqOO,           720     /* facel-scaled charge products, broadcast */
+.equ           qqOH,           736
+.equ           qqHH,           752
+.equ           two,            768     /* broadcast constants and parameters */
+.equ           tsc,            784     /* tabscale broadcast to four lanes */
+.equ           c6,             800
+.equ           c12,            816              
+.equ           six,            832
+.equ           twelve,         848              
+.equ           vctot,          864     /* per-lane energy accumulators */
+.equ           vnbtot,         880
+.equ           fixO,           896     /* per-lane i force accumulators */
+.equ           fiyO,           912
+.equ           fizO,           928
+.equ           fixH1,          944
+.equ           fiyH1,          960
+.equ           fizH1,          976
+.equ           fixH2,          992
+.equ           fiyH2,         1008
+.equ           fizH2,         1024
+.equ           fjxO,          1040     /* j force temporaries */
+.equ           fjyO,          1056
+.equ           fjzO,          1072
+.equ           fjxH1,         1088
+.equ           fjyH1,         1104
+.equ           fjzH1,         1120
+.equ           fjxH2,         1136
+.equ           fjyH2,         1152
+.equ           fjzH2,         1168
+.equ           half,          1184     /* constants used by the rsqrtps refinement step */
+.equ           three,         1200
+.equ           rsqOO,         1216     /* squared distances for the nine atom pairs */
+.equ           rsqOH1,        1232
+.equ           rsqOH2,        1248
+.equ           rsqH1O,        1264
+.equ           rsqH1H1,       1280
+.equ           rsqH1H2,       1296
+.equ           rsqH2O,        1312
+.equ           rsqH2H1,       1328
+.equ           rsqH2H2,       1344
+.equ           rinvOO,        1360     /* refined inverse distances for the nine atom pairs */
+.equ           rinvOH1,       1376
+.equ           rinvOH2,       1392
+.equ           rinvH1O,       1408
+.equ           rinvH1H1,      1424
+.equ           rinvH1H2,      1440
+.equ           rinvH2O,       1456
+.equ           rinvH2H1,      1472
+.equ           rinvH2H2,      1488
+.equ           fstmp,         1504     
+.equ           is3,           1520     /* 4-byte scalars from here on */
+.equ           ii3,           1524
+.equ           innerjjnr,     1528
+.equ           innerk,        1532
+.equ           salign,        1536     /* number of bytes subtracted from esp to align it */
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 1540           /* local stack space: last slot (salign) is at 1536, plus its 4 bytes */
+       mov  eax, esp
+       and  eax, 0xf           /* eax = esp modulo 16 */
+       sub esp, eax            /* esp is now 16-byte aligned */
+       mov [esp + salign], eax /* remember the padding so it can be undone later */
+
+       emms                    /* empty the MMX state; mm registers serve as scratch in the inner loop */
+
+       movups xmm0, [sse_half] /* sse_* are defined outside this chunk; unaligned loads are used for them */
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movups xmm3, [sse_six]
+       movups xmm4, [sse_twelve]
+       movss xmm5, [ebp +tabscale]
+       movaps [esp + half],  xmm0
+       movaps [esp + two],  xmm1
+       movaps [esp + three], xmm2
+       movaps [esp + six], xmm3
+       movaps [esp + twelve], xmm4
+       shufps xmm5, xmm5, 0    /* broadcast tabscale to all four lanes */
+       movaps [esp + tsc],  xmm5
+
+       /* assume we have at least one i particle - its charges and type are read once, before the outer loop */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       /* xmm3 = charge[ii], used as the O charge */
+       movss xmm4, xmm3        
+       movss xmm5, [edx + ebx*4 + 4]   /* xmm5 = charge[ii+1], used as the H charge */
+       movss xmm6, [ebp + facel]
+       mulss  xmm3, xmm3       /* qO*qO */
+       mulss  xmm4, xmm5       /* qO*qH */
+       mulss  xmm5, xmm5       /* qH*qH */
+       mulss  xmm3, xmm6       /* scale all three products by facel */
+       mulss  xmm4, xmm6
+       mulss  xmm5, xmm6
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + qqOO], xmm3
+       movaps [esp + qqOH], xmm4
+       movaps [esp + qqHH], xmm5
+               
+       xorps xmm0, xmm0
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]        /* ecx = type[ii] */
+       shl   ecx, 1
+       mov   edx, ecx                  /* edx = 2*type[ii] */
+       imul  ecx, [ebp + ntype]      /* ecx = ntia = 2*ntype*type[ii0] */
+       add   edx, ecx                  /* edx = 2*type[ii]*(ntype+1) */
+       mov   eax, [ebp + nbfp]
+       movlps xmm0, [eax + edx*4]      /* two consecutive floats: lane 0 -> c6, lane 1 -> c12 */
+       movaps xmm1, xmm0
+       shufps xmm0, xmm0, 0            /* broadcast lane 0 */
+       shufps xmm1, xmm1, 0b01010101   /* broadcast lane 1 */
+       movaps [esp + c6], xmm0
+       movaps [esp + c12], xmm1
+
+.i3130_outer:  /* per-i setup: shifted i coordinates, cleared accumulators, j-list bounds */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step (32-bit, size made explicit) */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]       /* xmm0-xmm2 = shift vector x, y, z */
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr],  4   /* advance pointer (32-bit, size made explicit) */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+
+       movaps xmm3, xmm0               /* O: shift + pos[ii3 + 0..2], then broadcast */
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       addss xmm3, [eax + ebx*4]
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       movss xmm3, xmm0                /* reload lane 0 of xmm3-xmm5 with the shift for H2 */
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]  /* H1: shift + pos[ii3 + 3..5] */
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]
+       addss xmm3, [eax + ebx*4 + 24]  /* H2: shift + pos[ii3 + 6..8] */
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]
+
+       shufps xmm0, xmm0, 0            /* broadcast lane 0 to all four lanes */
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+
+       /* clear vctot, vnbtot and the i force accumulators */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4 /* advance pointer (32-bit, size made explicit) */
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2                     /* byte offset of jjnr[nj0] */
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4                    /* flags from this sub decide the branch; mov leaves them intact */
+       mov   [esp + innerk], edx        /* innerk = number of innerloop atoms minus 4 */
+       jge   .i3130_unroll_loop         /* at least four j entries: take the unrolled loop */
+       jmp   .i3130_single_check
+.i3130_unroll_loop:    
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4] 
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+       
+       /* move j coordinates to local temp variables */
+       movlps xmm2, [esi + eax*4]
+       movlps xmm3, [esi + eax*4 + 12]
+       movlps xmm4, [esi + eax*4 + 24]
+
+       movlps xmm5, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 12]
+       movlps xmm7, [esi + ebx*4 + 24]
+
+       movhps xmm2, [esi + ecx*4]
+       movhps xmm3, [esi + ecx*4 + 12]
+       movhps xmm4, [esi + ecx*4 + 24]
+
+       movhps xmm5, [esi + edx*4]
+       movhps xmm6, [esi + edx*4 + 12]
+       movhps xmm7, [esi + edx*4 + 24]
+
+       /* current state: */    
+       /* xmm2= jxOa  jyOa  jxOc  jyOc */
+       /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+       /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+       /* xmm5= jxOb  jyOb  jxOd  jyOd */
+       /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+       /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+       
+       movaps xmm0, xmm2
+       movaps xmm1, xmm3
+       unpcklps xmm0, xmm5     /* xmm0= jxOa  jxOb  jyOa  jyOb */
+       unpcklps xmm1, xmm6     /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+       unpckhps xmm2, xmm5     /* xmm2= jxOc  jxOd  jyOc  jyOd */
+       unpckhps xmm3, xmm6     /* xmm3= jxH1c jxH1d jyH1c jyH1d */
+       movaps xmm5, xmm4
+       movaps   xmm6, xmm0
+       unpcklps xmm4, xmm7     /* xmm4= jxH2a jxH2b jyH2a jyH2b */             
+       unpckhps xmm5, xmm7     /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+       movaps   xmm7, xmm1
+       movlhps  xmm0, xmm2     /* xmm0= jxOa  jxOb  jxOc  jxOd */
+       movaps [esp + jxO], xmm0
+       movhlps  xmm2, xmm6     /* xmm2= jyOa  jyOb  jyOc  jyOd */
+       movaps [esp + jyO], xmm2
+       movlhps  xmm1, xmm3
+       movaps [esp + jxH1], xmm1
+       movhlps  xmm3, xmm7
+       movaps   xmm6, xmm4
+       movaps [esp + jyH1], xmm3
+       movlhps  xmm4, xmm5
+       movaps [esp + jxH2], xmm4
+       movhlps  xmm5, xmm6
+       movaps [esp + jyH2], xmm5
+
+       movss  xmm0, [esi + eax*4 + 8]
+       movss  xmm1, [esi + eax*4 + 20]
+       movss  xmm2, [esi + eax*4 + 32]
+
+       movss  xmm3, [esi + ecx*4 + 8]
+       movss  xmm4, [esi + ecx*4 + 20]
+       movss  xmm5, [esi + ecx*4 + 32]
+
+       movhps xmm0, [esi + ebx*4 + 4]
+       movhps xmm1, [esi + ebx*4 + 16]
+       movhps xmm2, [esi + ebx*4 + 28]
+       
+       movhps xmm3, [esi + edx*4 + 4]
+       movhps xmm4, [esi + edx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 28]
+       
+       shufps xmm0, xmm3, 0b11001100
+       shufps xmm1, xmm4, 0b11001100
+       shufps xmm2, xmm5, 0b11001100
+       movaps [esp + jzO],  xmm0
+       movaps [esp + jzH1],  xmm1
+       movaps [esp + jzH2],  xmm2
+
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixO]
+       movaps xmm4, [esp + iyO]
+       movaps xmm5, [esp + izO]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxOH1], xmm3
+       movaps [esp + dyOH1], xmm4
+       movaps [esp + dzOH1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOO], xmm0
+       movaps [esp + rsqOH1], xmm3
+
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       subps  xmm3, [esp + jxO]
+       subps  xmm4, [esp + jyO]
+       subps  xmm5, [esp + jzO]
+       movaps [esp + dxOH2], xmm0
+       movaps [esp + dyOH2], xmm1
+       movaps [esp + dzOH2], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1O], xmm3
+       movaps [esp + dyH1O], xmm4
+       movaps [esp + dzH1O], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOH2], xmm0
+       movaps [esp + rsqH1O], xmm3
+
+       movaps xmm0, [esp + ixH1]
+       movaps xmm1, [esp + iyH1]
+       movaps xmm2, [esp + izH1]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH1]
+       subps  xmm1, [esp + jyH1]
+       subps  xmm2, [esp + jzH1]
+       subps  xmm3, [esp + jxH2]
+       subps  xmm4, [esp + jyH2]
+       subps  xmm5, [esp + jzH2]
+       movaps [esp + dxH1H1], xmm0
+       movaps [esp + dyH1H1], xmm1
+       movaps [esp + dzH1H1], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1H2], xmm3
+       movaps [esp + dyH1H2], xmm4
+       movaps [esp + dzH1H2], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqH1H1], xmm0
+       movaps [esp + rsqH1H2], xmm3
+
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxH2O], xmm0
+       movaps [esp + dyH2O], xmm1
+       movaps [esp + dzH2O], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH2H1], xmm3
+       movaps [esp + dyH2H1], xmm4
+       movaps [esp + dzH2H1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       movaps [esp + rsqH2O], xmm0
+       movaps [esp + rsqH2H1], xmm4
+
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       movaps [esp + dxH2H2], xmm0
+       movaps [esp + dyH2H2], xmm1
+       movaps [esp + dzH2H2], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2
+       movaps [esp + rsqH2H2], xmm0
+               
+       /* start doing invsqrt use rsq values in xmm0, xmm4 */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvH2H2 */
+       mulps   xmm7, [esp + half] /* rinvH2H1 */
+       movaps  [esp + rinvH2H2], xmm3
+       movaps  [esp + rinvH2H1], xmm7
+               
+       rsqrtps xmm1, [esp + rsqOO]
+       rsqrtps xmm5, [esp + rsqOH1]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOO]
+       mulps   xmm5, [esp + rsqOH1]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvOO], xmm3
+       movaps  [esp + rinvOH1], xmm7
+       
+       rsqrtps xmm1, [esp + rsqOH2]
+       rsqrtps xmm5, [esp + rsqH1O]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOH2]
+       mulps   xmm5, [esp + rsqH1O]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvOH2], xmm3
+       movaps  [esp + rinvH1O], xmm7
+       
+       rsqrtps xmm1, [esp + rsqH1H1]
+       rsqrtps xmm5, [esp + rsqH1H2]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqH1H1]
+       mulps   xmm5, [esp + rsqH1H2]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvH1H1], xmm3
+       movaps  [esp + rinvH1H2], xmm7
+       
+       rsqrtps xmm1, [esp + rsqH2O]
+       movaps  xmm2, xmm1
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, [esp + rsqH2O]
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2
+       mulps   xmm3, [esp + half] 
+       movaps  [esp + rinvH2O], xmm3
+
+       /* start with OO interaction */
+       movaps xmm0, [esp + rinvOO]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqOO] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]
+               
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld   mm6, 2
+       pslld   mm7, 2
+       
+        movd mm0, eax
+        movd mm1, ebx
+        movd mm2, ecx
+        movd mm3, edx
+
+        mov  esi, [ebp + VFtab]
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOO]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point mm5 contains vcoul and mm3 fijC */
+        /* increment vcoul - then we can get rid of mm5 */
+        /* update vctot */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       mulps  xmm3, [esp + tsc]
+       
+       /* start doing lj */
+       movaps xmm2, xmm0
+       mulps  xmm2, xmm2
+       movaps xmm1, xmm2
+       mulps  xmm1, xmm2
+       mulps  xmm1, xmm2       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulps  xmm1, [esp + c6]
+       mulps  xmm2, [esp + c12]
+       movaps xmm4, xmm2
+       subps  xmm4, xmm1
+       addps  xmm4, [esp + vnbtot]
+       mulps  xmm1, [esp + six]
+       mulps  xmm2, [esp + twelve]
+       movaps [esp + vnbtot], xmm4
+       subps  xmm2, xmm1
+       mulps  xmm2, xmm0
+
+       subps  xmm2, xmm3
+       mulps  xmm0, xmm2
+       
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0               
+
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOO]
+       mulps xmm1, [esp + dyOO]
+       mulps xmm2, [esp + dzOO]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H1 interaction */
+       movaps xmm0, [esp + rinvOH1]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqOH1] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+       
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point mm5 contains vcoul and mm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH1]
+       mulps xmm1, [esp + dyOH1]
+       mulps xmm2, [esp + dzOH1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H2 interaction */ 
+       movaps xmm0, [esp + rinvOH2]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqOH2] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOH2]
+       mulps xmm1, [esp + dyOH2]
+       mulps xmm2, [esp + dzOH2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* H1-O interaction */
+       movaps xmm0, [esp + rinvH1O]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH1O] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH1O]
+       mulps xmm1, [esp + dyH1O]
+       mulps xmm2, [esp + dzH1O]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H1 interaction */
+       movaps xmm0, [esp + rinvH1H1]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH1H1] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH1H1]
+       mulps xmm1, [esp + dyH1H1]
+       mulps xmm2, [esp + dzH1H1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H2 interaction */
+       movaps xmm0, [esp + rinvH1H2]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH1H2] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH1H2]
+       mulps xmm1, [esp + dyH1H2]
+       mulps xmm2, [esp + dzH1H2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H2-O interaction */
+       movaps xmm0, [esp + rinvH2O]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH2O] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO]
+       mulps xmm0, [esp + dxH2O]
+       mulps xmm1, [esp + dyH2O]
+       mulps xmm2, [esp + dzH2O]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H1 interaction */
+       movaps xmm0, [esp + rinvH2H1]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH2H1] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       mulps xmm0, [esp + dxH2H1]
+       mulps xmm1, [esp + dyH2H1]
+       mulps xmm2, [esp + dzH2H1]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H2 interaction */
+       movaps xmm0, [esp + rinvH2H2]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH2H2] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]        
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1
+       
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2]
+       mulps xmm0, [esp + dxH2H2]
+       mulps xmm1, [esp + dyH2H2]
+       mulps xmm2, [esp + dzH2H2]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2]
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       mov edi, [ebp +faction]
+
+       movd eax, mm0
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+       
+       /* Did all interactions - now update j forces */
+       /* 4 j waters with three atoms each - first do a & b j particles */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpcklps xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjxOb  fjyOb */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOb  fjyOb */ 
+       unpcklps xmm1, xmm2        /* xmm1= fjzOa  fjxH1a fjzOb  fjxH1b */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpcklps xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1b fjzH1b */
+       unpcklps xmm5, xmm6        /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjzOa  fjxH1a */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOb  fjyOb  fjzOb  fjxH1b */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+       movups   xmm1, [edi + eax*4]
+       movups   xmm2, [edi + eax*4 + 16]
+       movups   xmm5, [edi + ebx*4]
+       movups   xmm6, [edi + ebx*4 + 16]
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + eax*4 + 32]
+       movss    xmm3, [edi + ebx*4 + 32]
+       
+       movaps   xmm4, [esp + fjzH2]
+       movaps   xmm7, xmm4
+       shufps   xmm7, xmm7, 1
+       
+       movups   [edi + eax*4],     xmm1
+       movups   [edi + eax*4 + 16],xmm2
+       movups   [edi + ebx*4],     xmm5
+       movups   [edi + ebx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + eax*4 + 32], xmm0
+       movss    [edi + ebx*4 + 32], xmm3       
+
+       /* then do the second pair (c & d) */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpckhps xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjxOd  fjyOd */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOd  fjyOd */ 
+       unpckhps xmm1, xmm2        /* xmm1= fjzOc  fjxH1c fjzOd  fjxH1d */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpckhps xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1d fjzH1d */     
+       unpckhps xmm5, xmm6        /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjzOc  fjxH1c */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOd  fjyOd  fjzOd  fjxH1d */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c  */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+       movups   xmm1, [edi + ecx*4]
+       movups   xmm2, [edi + ecx*4 + 16]
+       movups   xmm5, [edi + edx*4]
+       movups   xmm6, [edi + edx*4 + 16]
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + ecx*4 + 32]
+       movss    xmm3, [edi + edx*4 + 32]
+       
+       movaps   xmm4, [esp + fjzH2]
+       movaps   xmm7, xmm4
+       shufps   xmm4, xmm4, 0b10
+       shufps   xmm7, xmm7, 0b11
+       movups   [edi + ecx*4],     xmm1
+       movups   [edi + ecx*4 + 16],xmm2
+       movups   [edi + edx*4],     xmm5
+       movups   [edi + edx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + ecx*4 + 32], xmm0
+       movss    [edi + edx*4 + 32], xmm3       
+       
+       /* should we do one more iteration? */
+       sub   [esp + innerk],  4
+       jl    .i3130_single_check
+       jmp   .i3130_unroll_loop
+.i3130_single_check:
+       add   [esp + innerk],  4
+       jnz   .i3130_single_loop
+       jmp   .i3130_updateouterdata
+.i3130_single_loop:
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       add   [esp + innerjjnr],  4     
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]  
+
+       /* fetch j coordinates */
+       xorps xmm3, xmm3
+       xorps xmm4, xmm4
+       xorps xmm5, xmm5
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + eax*4 + 4]
+       movss xmm5, [esi + eax*4 + 8]
+
+       movlps xmm6, [esi + eax*4 + 12]
+       movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+       /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+       movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+       shufps xmm6, xmm6, 0b11011000    /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+       movlhps xmm3, xmm6              /* xmm3= jxO   0  jxH1 jxH2 */
+       movaps  xmm0, [esp + ixO]     
+       movaps  xmm1, [esp + iyO]
+       movaps  xmm2, [esp + izO]       
+       shufps  xmm4, xmm6, 0b11100100 /* xmm4= jyO   0   jyH1 jyH2 */
+       shufps xmm5, xmm7, 0b11000100  /* xmm5= jzO   0   jzH1 jzH2 */
+       /* store all j coordinates in jO */ 
+       movaps [esp + jxO], xmm3
+       movaps [esp + jyO], xmm4
+       movaps [esp + jzO], xmm5
+       subps  xmm0, xmm3
+       subps  xmm1, xmm4
+       subps  xmm2, xmm5
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2        /* have rsq in xmm0 */
+       
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       movaps  xmm2, xmm1      
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, xmm0
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2                                                      
+       mulps   xmm3, [esp + half] /* rinv iO - j water */
+
+       movaps  xmm1, xmm3
+       mulps   xmm1, xmm0      /* xmm1=r */
+       movaps  xmm0, xmm3      /* xmm0=rinv */
+       mulps  xmm1, [esp + tsc]
+       
+       movhlps xmm2, xmm1      
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld   mm6, 2
+       pslld   mm7, 2
+       
+        movd ebx, mm6
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd edx, mm7          /* table indices in ebx,ecx,edx */
+
+       mov esi, [ebp + VFtab]
+       
+        movlps xmm5, [esi + ebx*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + ebx*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+
+       xorps  xmm3, xmm3
+       /* fetch charges to xmm3 (temporary) */
+       movss   xmm3, [esp + qqOO]
+       movhps  xmm3, [esp + qqOH]
+               
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+       
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+
+       mulps  xmm3, [esp + tsc]
+       
+       /* start doing lj */
+       xorps  xmm2, xmm2
+       movss  xmm2, xmm0
+       mulss  xmm2, xmm2
+       movaps xmm1, xmm2
+       mulss  xmm1, xmm2
+       mulss  xmm1, xmm2       /* xmm1=rinvsix */
+       movaps xmm2, xmm1
+       mulss  xmm2, xmm2       /* xmm2=rinvtwelve */
+       mulss  xmm1, [esp + c6]
+       mulss  xmm2, [esp + c12]
+       movaps xmm4, xmm2
+       subss  xmm4, xmm1
+       addps  xmm4, [esp + vnbtot]
+       mulss  xmm1, [esp + six]
+       mulss  xmm2, [esp + twelve]
+       movaps [esp + vnbtot], xmm4
+       subss  xmm2, xmm1
+       mulss  xmm2, xmm0
+
+       subps  xmm2, xmm3
+       mulps  xmm0, xmm2
+       
+       movaps xmm1, xmm0
+       movaps xmm2, xmm0                       
+
+       mulps   xmm0, [esp + dxOO]
+       mulps   xmm1, [esp + dyOO]
+       mulps   xmm2, [esp + dzOO]
+       /* initial update for j forces */
+       xorps   xmm3, xmm3
+       xorps   xmm4, xmm4
+       xorps   xmm5, xmm5
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixO]
+       addps   xmm1, [esp + fiyO]
+       addps   xmm2, [esp + fizO]
+       movaps  [esp + fixO], xmm0
+       movaps  [esp + fiyO], xmm1
+       movaps  [esp + fizO], xmm2
+
+       
+       /* done with i O. Now do i H1 & H2 simultaneously; first get i particle coords: */
+       movaps  xmm0, [esp + ixH1]
+       movaps  xmm1, [esp + iyH1]
+       movaps  xmm2, [esp + izH1]      
+       movaps  xmm3, [esp + ixH2] 
+       movaps  xmm4, [esp + iyH2] 
+       movaps  xmm5, [esp + izH2] 
+       subps   xmm0, [esp + jxO]
+       subps   xmm1, [esp + jyO]
+       subps   xmm2, [esp + jzO]
+       subps   xmm3, [esp + jxO]
+       subps   xmm4, [esp + jyO]
+       subps   xmm5, [esp + jzO]
+       movaps [esp + dxH1O], xmm0
+       movaps [esp + dyH1O], xmm1
+       movaps [esp + dzH1O], xmm2
+       movaps [esp + dxH2O], xmm3
+       movaps [esp + dyH2O], xmm4
+       movaps [esp + dzH2O], xmm5
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       mulps xmm3, xmm3
+       mulps xmm4, xmm4
+       mulps xmm5, xmm5
+       addps xmm0, xmm1
+       addps xmm4, xmm3
+       addps xmm0, xmm2        /* have rsqH1 in xmm0 */
+       addps xmm4, xmm5        /* have rsqH2 in xmm4 */
+
+       /* start with H1, save H2 data */
+       movaps [esp + rsqH2O], xmm4
+       
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinv H1 - j water */
+       mulps   xmm7, [esp + half] /* rinv H2 - j water */ 
+
+       /* start with H1, save H2 data */
+       movaps [esp + rinvH2O], xmm7
+
+       movaps xmm1, xmm3
+       mulps  xmm1, xmm0       /* xmm1=r */
+       movaps xmm0, xmm3       /* xmm0=rinv */
+       mulps  xmm1, [esp + tsc]
+       
+       movhlps xmm2, xmm1      
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd ebx, mm6
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd edx, mm7          /* table indices in ebx,ecx,edx */
+
+        movlps xmm5, [esi + ebx*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + ebx*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+
+       xorps  xmm3, xmm3
+       /* fetch charges to xmm3 (temporary) */
+       movss   xmm3, [esp + qqOH]
+       movhps  xmm3, [esp + qqHH]
+               
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5     
+
+        xorps  xmm1, xmm1
+
+        mulps xmm3, [esp + tsc]
+        mulps xmm3, xmm0
+        subps  xmm1, xmm3
+       
+       movaps  xmm0, xmm1
+       movaps  xmm2, xmm1
+       mulps   xmm0, [esp + dxH1O]
+       mulps   xmm1, [esp + dyH1O]
+       mulps   xmm2, [esp + dzH1O]
+       /* update forces H1 - j water */
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH1]
+       addps   xmm1, [esp + fiyH1]
+       addps   xmm2, [esp + fizH1]
+       movaps  [esp + fixH1], xmm0
+       movaps  [esp + fiyH1], xmm1
+       movaps  [esp + fizH1], xmm2
+       /* do table for H2 - j water interaction */
+       movaps xmm0, [esp + rinvH2O]
+       movaps xmm1, [esp + rsqH2O]
+       mulps  xmm1, xmm0       /* xmm0=rinv, xmm1=r */
+       mulps  xmm1, [esp + tsc]
+       
+       movhlps xmm2, xmm1      
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld   mm6, 2
+       pslld   mm7, 2
+
+        movd ebx, mm6
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd edx, mm7          /* table indices in ebx,ecx,edx */
+
+        movlps xmm5, [esi + ebx*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + ebx*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+
+       xorps  xmm3, xmm3
+       /* fetch charges to xmm3 (temporary) */
+       movss   xmm3, [esp + qqOH]
+       movhps  xmm3, [esp + qqHH]
+               
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5     
+
+        xorps  xmm1, xmm1
+
+        mulps xmm3, [esp + tsc]
+        mulps xmm3, xmm0
+        subps  xmm1, xmm3
+       
+       movaps  xmm0, xmm1
+       movaps  xmm2, xmm1
+       
+       mulps   xmm0, [esp + dxH2O]
+       mulps   xmm1, [esp + dyH2O]
+       mulps   xmm2, [esp + dzH2O]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       mov     esi, [ebp + faction]
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH2]
+       addps   xmm1, [esp + fiyH2]
+       addps   xmm2, [esp + fizH2]
+       movaps  [esp + fixH2], xmm0
+       movaps  [esp + fiyH2], xmm1
+       movaps  [esp + fizH2], xmm2
+
+       /* update j water forces from local variables */
+       movlps  xmm0, [esi + eax*4]
+       movlps  xmm1, [esi + eax*4 + 12]
+       movhps  xmm1, [esi + eax*4 + 24]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       movaps  xmm6, xmm5
+       movaps  xmm7, xmm5
+       shufps  xmm6, xmm6, 0b10
+       shufps  xmm7, xmm7, 0b11
+       addss   xmm5, [esi + eax*4 + 8]
+       addss   xmm6, [esi + eax*4 + 20]
+       addss   xmm7, [esi + eax*4 + 32]
+       movss   [esi + eax*4 + 8], xmm5
+       movss   [esi + eax*4 + 20], xmm6
+       movss   [esi + eax*4 + 32], xmm7
+       movaps   xmm5, xmm3
+       unpcklps xmm3, xmm4
+       unpckhps xmm5, xmm4
+       addps    xmm0, xmm3
+       addps    xmm1, xmm5
+       movlps  [esi + eax*4], xmm0 
+       movlps  [esi + eax*4 + 12], xmm1 
+       movhps  [esi + eax*4 + 24], xmm1 
+       
+       dec dword ptr [esp + innerk]
+       jz    .i3130_updateouterdata
+       jmp   .i3130_single_loop
+.i3130_updateouterdata:
+       /* Outer-loop epilogue for one i water.
+        * Reduces the four-lane SSE accumulators kept on the stack to
+        * scalars and adds them to memory:
+        *   fixO..fizH2  -> faction[] at index ii3 (O, H1, H2: 9 floats)
+        *   their sum    -> fshift[]  at index is3 (3 floats)
+        *   vctot        -> Vc[gid],   vnbtot -> Vnb[gid]
+        * Then decrements nri and either loops to .i3130_outer or falls
+        * into .i3130_end. */
+       mov   ecx, [esp + ii3]          /* ecx = ii3, element index into faction[] */
+       mov   edi, [ebp + faction]      /* edi = base of faction[] */
+       mov   esi, [ebp + fshift]       /* esi = base of fshift[] */
+       mov   edx, [esp + is3]          /* edx = is3, element index into fshift[] */
+
+       /* accumulate Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO]
+       movaps xmm2, [esp + fizO]
+
+       /* horizontal add, step 1: fold lanes 2,3 onto lanes 0,1 */
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* partial sums are now in lanes 0 and 1 of xmm0-xmm2 */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       /* step 2: bring lane 1 down to lane 0 and add */
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force (O: floats 0..2 of the water) */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift:
+        * xmm6 lanes 0,1 = (fx, fy), xmm7 lane 0 = fz */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000      /* select lanes 0 and 2 -> (fx, fy) */
+
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* partial sums are now in lanes 0 and 1 of xmm0-xmm2 */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force (H1: floats 3..5 of the water) */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      /* lanes 0,1 = (fx, fy) */
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* partial sums are now in lanes 0 and 1 of xmm0-xmm2 */
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force (H2: floats 6..8 of the water) */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      /* lanes 0,1 = (fx, fy) */
+       addps   xmm6, xmm0
+
+       /* increment fshift force: x,y as a pair, z as a scalar */
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* edx = current pointer into the gid list */
+       mov   edx, [edx]            /* edx = group index for this i particle */
+       /* explicit operand size: a memory/immediate add has no register
+        * from which the assembler could infer it */
+       add   dword ptr [ebp + gid], 4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6       /* lane 0 = sum of all four lanes */
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6       /* lane 0 = sum of all four lanes */
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4]
+       /* move back to mem */
+       movss [eax + edx*4], xmm7
+
+       /* finish if last: the remaining count lives in the nri slot */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3130_end
+       /* not last, iterate once more! */
+       mov [ebp + nri], ecx
+       jmp .i3130_outer
+.i3130_end:
+       /* Function epilogue: leave MMX state, release the aligned local
+        * frame, restore the saved registers and return. */
+       emms                            /* MMX registers were used for table indices; clear MMX state */
+       mov eax, [esp + salign]         /* eax = alignment padding stored in the salign slot */
+       add esp, eax                    /* undo the alignment adjustment */
+       add esp, 1540                   /* release locals; presumably matches the prologue's allocation (not visible here) */
+       pop edi                         /* NOTE(review): pop order presumably mirrors the prologue pushes - confirm */
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave                           /* esp = ebp, then pop ebp */
+       ret
+
+       
+
+
+/*
+ * inl3300_sse - entry point, argument offsets, local-frame layout and
+ * prologue.
+ *
+ * The loop sections that follow evaluate table-based Coulomb,
+ * dispersion and repulsion interactions between one i particle and
+ * its j neighbours, four j particles at a time where possible.
+ *
+ * Arguments are read from the caller's stack through ebp; the first
+ * group of .equ values (8..80) are their byte offsets from ebp.
+ * Locals live in a 16-byte aligned frame addressed through esp; the
+ * second group of .equ values are byte offsets from esp.
+ * eax, ebx, ecx, edx, esi and edi are saved on entry.
+ */
+.globl inl3300_sse
+       .type inl3300_sse,@function
+inl3300_sse:   
+       /* argument offsets relative to ebp */
+.equ           nri,            8       /* presumably the outer-loop count - not referenced in the sections reviewed here */
+.equ           iinr,           12      /* pointer into iinr[], advanced once per i particle */
+.equ           jindex,         16      /* pointer into jindex[], advanced once per i particle */
+.equ           jjnr,           20      /* base of jjnr[] (j particle numbers) */
+.equ           shift,          24      /* pointer into shift[], advanced once per i particle */
+.equ           shiftvec,       28      /* base of shiftvec[] */
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              /* base of pos[] (3 floats per particle) */
+.equ           faction,        44      /* base of faction[] (3 floats per particle) */
+.equ           charge,         48      /* base of charge[] */
+.equ           facel,          52      /* float, multiplied into the i charge */
+.equ           Vc,             56                      
+.equ           type,           60      /* base of type[] */
+.equ           ntype,          64
+.equ           nbfp,           68      /* base of nbfp[] (c6/c12 pairs) */
+.equ           Vnb,            72
+.equ           tabscale,       76      /* float, broadcast into tsc below */
+.equ           VFtab,          80      /* base of the interaction table */
+       /* stack offsets for local variables */ 
+       /* bottom of stack is 16-byte aligned for sse use (movaps) */
+.equ           ix,              0
+.equ           iy,              16
+.equ           iz,              32
+.equ           iq,              48
+.equ           dx,              64
+.equ           dy,              80
+.equ           dz,              96
+.equ           two,            112
+.equ           tsc,            128
+.equ           qq,             144     
+.equ           c6,             160
+.equ           c12,            176
+.equ           fscal,          192
+.equ           vctot,          208
+.equ           vnbtot,         224
+.equ           fix,            240
+.equ           fiy,            256
+.equ           fiz,            272
+.equ           half,           288
+.equ           three,          304
+       /* 4-byte integer slots follow the 16-byte vector slots */
+.equ           is3,            320
+.equ           ii3,            324
+.equ           ntia,           328     
+.equ           innerjjnr,      332
+.equ           innerk,         336
+.equ           salign,         340                                                             
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 344            /* local stack space */
+       /* round esp down to a multiple of 16 and remember by how much */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       /* copy the constants (symbols defined elsewhere; loaded unaligned)
+        * into aligned local slots and broadcast tabscale to all lanes */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movss xmm3, [ebp + tabscale]
+       movaps [esp + half],  xmm0
+       movaps [esp + two], xmm1
+       movaps [esp + three],  xmm2
+       shufps xmm3, xmm3, 0
+       movaps [esp + tsc], xmm3
+       /* assume we have at least one i particle - start directly */   
+.i3300_outer:
+       /* Per-i-particle setup: fetch the shift vector and the i particle,
+        * broadcast its shifted position and scaled charge, clear the
+        * accumulators and work out the j-list range for the inner loop.
+        * The shift, iinr and jindex argument slots are used as running
+        * pointers and advanced by one element here. */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift], 4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]       /* shift x */
+       movss xmm1, [eax + ebx*4 + 4]   /* shift y */
+       movss xmm2, [eax + ebx*4 + 8]   /* shift z */
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr], 4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]
+       mulss xmm3, [ebp + facel]       /* xmm3 = facel*charge[ii] */
+       shufps xmm3, xmm3, 0            /* broadcast to all four lanes */
+
+        mov   edx, [ebp + type]
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx         /* ntia = 2*ntype*type[ii] */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+
+       addss xmm0, [eax + ebx*4]       /* i position + shift */
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+
+       shufps xmm0, xmm0, 0            /* broadcast ix, iy, iz */
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+
+       /* clear vctot, vnbtot and i forces */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex], 4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* number of innerloop atoms, minus 4 */
+       /* the flags tested here are those of "sub edx, 4"; mov leaves
+        * them untouched. At least four j atoms -> unrolled loop. */
+       jge   .i3300_unroll_loop
+       jmp   .i3300_finish_inner
+.i3300_unroll_loop:
+       /* Four-way unrolled inner loop: one i particle against four j
+        * particles, one j particle per SSE lane.
+        * Each table point spans 12 floats: 4 Coulomb, 4 dispersion and
+        * 4 repulsion values, used below as Y, F, G, H.
+        * mm0-mm3 serve as scratch storage for eax-edx; mm6/mm7 hold the
+        * table indices. */
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]
+       mov   ebx, [edx + 4]
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       /* explicit operand size: a memory/immediate add has no register
+        * from which the assembler could infer it */
+       add   dword ptr [esp + innerjjnr], 16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0          /* xmm3 = q1 q1 q2 q2 */
+       shufps xmm4, xmm7, 0          /* xmm4 = q3 q3 q4 q4 */
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       mulps  xmm3, xmm2       /* qq = iq * charge[j] */
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       movaps [esp + qq], xmm3
+
+       /* look up the nbfp pair for each j: index = 2*type[j] + ntia */
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1
+       shl ebx, 1
+       shl ecx, 1
+       shl edx, 1
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       /* transpose the pairs: first elements -> xmm4, second -> xmm6 */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm6, xmm7, 0b11011101
+
+       movd  eax, mm0          /* restore jnr1-4 */
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]
+
+       /* move four coordinates to xmm0-xmm2 */
+
+       movlps xmm4, [esi + eax*4]      /* x,y of j1 */
+       movlps xmm5, [esi + ecx*4]      /* x,y of j3 */
+       movss xmm2, [esi + eax*4 + 8]   /* z of j1 */
+       movss xmm6, [esi + ecx*4 + 8]   /* z of j3 */
+
+       movhps xmm4, [esi + ebx*4]      /* x,y of j2 */
+       movhps xmm5, [esi + edx*4]      /* x,y of j4 */
+
+       movss xmm0, [esi + ebx*4 + 8]   /* z of j2 */
+       movss xmm1, [esi + edx*4 + 8]   /* z of j4 */
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = jz1-4 */
+
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = jx1-4 */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = jy1-4 */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rinv = half * lu * (three - rsq*lu*lu): one refinement step
+        * on the rsqrtps estimate lu */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three - rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* xmm4 = r*tabscale */
+
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5      /* xmm6 = truncated indices as floats */
+       subps xmm4, xmm6
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 2            /* index*4 ... */
+       pslld mm7, 2
+
+       movd mm0, eax           /* save the j3 indices while eax-edx */
+       movd mm1, ebx           /* are used for table offsets */
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6           /* lane 0 */
+       psrlq mm6, 32
+       movd ecx, mm7           /* lane 2 */
+       psrlq mm7, 32
+       movd ebx, mm6           /* lane 1 */
+       movd edx, mm7           /* lane 3 */
+
+       lea   eax, [eax + eax*2]        /* ... times 3 = 12 floats per point */
+       lea   ebx, [ebx + ebx*2]
+       lea   ecx, [ecx + ecx*2]
+       lea   edx, [edx + edx*2]
+
+       movlps xmm5, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm5, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* coulomb table ready, in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5
+
+       /* put scalar force on stack temporarily */
+       movaps [esp + fscal], xmm3
+
+       /* dispersion */
+       movlps xmm5, [esi + eax*4 + 16]
+       movlps xmm7, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + ebx*4 + 16]
+       movhps xmm7, [esi + edx*4 + 16] /* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 24]
+       movlps xmm3, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + ebx*4 + 24]
+       movhps xmm3, [esi + edx*4 + 24] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+       addps  xmm7, [esp + fscal] /* add to fscal */
+
+       /* put scalar force on stack. Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm5, [esi + eax*4 + 32]
+       movlps xmm7, [esi + ecx*4 + 32]
+       movhps xmm5, [esi + ebx*4 + 32]
+       movhps xmm7, [esi + edx*4 + 32] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 40]
+       movlps xmm3, [esi + ecx*4 + 40]
+       movhps xmm7, [esi + ebx*4 + 40]
+       movhps xmm3, [esi + edx*4 + 40] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* table ready, in xmm4-xmm7 */
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal]      /* xmm7 = fijC + fijD + fijR */
+
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       /* xmm4 = -(summed FF) * tabscale * rinv: factor applied to dr */
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movd eax, mm0           /* restore the j3 indices */
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101
+
+       /* now xmm3 = fjx and xmm4 = fjy for j1-4; z is handled below */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces: rotate each lane of tz into lane 0 in turn */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2               /* j1 */
+       shufps xmm2, xmm2, 0b11100101   /* lane 0 = lane 1 */
+       subss  xmm5, xmm2               /* j2 */
+       shufps xmm2, xmm2, 0b11101010   /* lane 0 = lane 2 */
+       subss  xmm6, xmm2               /* j3 */
+       shufps xmm2, xmm2, 0b11111111   /* lane 0 = lane 3 */
+       subss  xmm7, xmm2               /* j4 */
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+
+       /* should we do one more iteration? innerk stays >= 0 while at
+        * least four more j particles remain */
+       sub   dword ptr [esp + innerk], 4
+       jl    .i3300_finish_inner
+       jmp   .i3300_unroll_loop
+.i3300_finish_inner:
+       /* Remainder dispatch after the unrolled loop. innerk was biased
+        * by -4 for the loop test, so add 4 back to get the number of j
+        * particles left. */
+       /* check if at least two particles remain */
+       add   dword ptr [esp + innerk], 4
+       mov   edx, [esp + innerk]
+       and   edx, 2                    /* bit 1 set -> a pair is left */
+       jnz   .i3300_dopair
+       jmp   .i3300_checksingle
+.i3300_dopair: 
+       mov esi, [ebp + charge]
+
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   [esp + innerjjnr],  8     
+       xorps xmm7, xmm7
+       movss xmm3, [esi + eax*4]               
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0 
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       mulps  xmm3, [esp + iq]
+       movlhps xmm3, xmm7
+       movaps [esp + qq], xmm3
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]
+       movhps xmm6, [esi + edx*4]
+       mov edi, [ebp + pos]    
+       
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       
+       shufps xmm6, xmm6, 0b1101
+       movlhps xmm4, xmm7
+       movlhps xmm6, xmm7
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       movlhps xmm3, xmm7
+       
+       shufps xmm2, xmm0, 0
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000
+       
+       shufps xmm0, xmm0, 0b10001000
+       shufps xmm1, xmm1, 0b11011101
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 30-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6
+       psrlq mm6, 32
+       movd edx, mm6
+       lea   ecx, [ecx + ecx*2]
+       lea   edx, [edx + edx*2]
+
+       movlps xmm5, [esi + ecx*4]
+       movhps xmm5, [esi + edx*4] /* got half coulomb table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+       
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8]
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point mm5 contains vcoul and mm3 fijC */
+       /* increment vcoul - then we can get rid of mm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       /* put scalar force on stack temporarily */
+       movaps [esp + fscal], xmm3
+
+       /* dispersion */
+       movlps xmm5, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 16]/* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+       
+       movlps xmm7, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + edx*4 + 24] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */      
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+       addps  xmm7, [esp + fscal] /* add to fscal */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm5, [esi + ecx*4 + 32]
+       movhps xmm5, [esi + edx*4 + 32] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + ecx*4 + 40]
+       movhps xmm7, [esi + edx*4 + 40] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* table ready, in xmm4-xmm7 */ 
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal] 
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       shufps  xmm0, xmm0, 0b11100001
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i3300_checksingle:
+       /* bit 0 of the inner-loop counter set => one unpaired j particle is left */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jz     .i3300_updateouterdata   /* bit clear: no single particle to do */
+       /* bit set: fall through into .i3300_dosingle, which follows directly */
+.i3300_dosingle:
+       /* Handle one remaining j particle. Only element 0 of the xmm       */
+       /* registers carries data: qq, c6 and c12 are built below with zero */
+       /* upper elements, so the upper lanes of the products are zero.     */
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]                /* eax = jnr of the remaining particle */
+       xorps  xmm6, xmm6
+       movss xmm6, [esi + eax*4]       /* xmm6(0) has the charge */    
+       mulps  xmm6, [esp + iq]
+       movaps [esp + qq], xmm6         /* qq = charge times the stored iq, element 0 only */
+
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]          /* ecx = type[jnr] */
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]           /* ecx = 2*type[jnr] + ntia, index into nbfp */
+       movlps xmm6, [esi + ecx*4]      /* low two elements = the two nbfp values */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* xmm4 = (elem0, elem3, elem3, elem3) */
+       shufps xmm6, xmm6, 0b11111101   /* xmm6 = (elem1, elem3, elem3, elem3) */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]        /* eax = 3*jnr, index of x in pos/faction */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rinv = half * lu * (three - rsq*lu*lu): one Newton-Raphson step on the rsqrtps seed */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc] /* xmm4=r*tsc, the table coordinate */
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6      /* truncated index back to float */
+       subps xmm4, xmm6        /* fractional part */
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2            /* index*4 */
+
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6
+       
+       lea  ebx, [ebx + ebx*2] /* index*12: with the *4 scale below, 12 floats per table point */
+                                               
+       /* coulomb part: four floats at offset 0 of the table point */
+       movlps xmm4, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1    /* second float of the first pair into element 0 */
+       shufps xmm7, xmm7, 1    /* second float of the second pair into element 0 */
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       /* put scalar force on stack temporarily */
+       movaps [esp + fscal], xmm3
+
+       /* dispersion: four floats at byte offset 16 of the table point */
+       movlps xmm4, [esi + ebx*4 + 16]
+       movlps xmm6, [esi + ebx*4 + 24]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+       addps  xmm7, [esp + fscal] /* add to fscal */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion: four floats at byte offset 32 of the table point */
+       movlps xmm4, [esi + ebx*4 + 32]
+       movlps xmm6, [esi + ebx*4 + 40]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal] /* xmm7 = fijC + fijD + fijR */
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0        /* summed FF * tsc * rinv */
+       subps  xmm4, xmm7       /* xmm4 = negated scalar force factor */
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj: subtract tx-tz from faction[3*jnr .. 3*jnr+2] */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i3300_updateouterdata:
+       /* Reduce the four-lane i-force and energy accumulators to scalars, */
+       /* add them to faction, fshift, Vc and Vnb, then loop over nri.     */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in elements 0 and 1 of xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1    /* bring element 1 down to element 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       /* explicit operand size: a memory/immediate add has no register to size it */
+       add   dword ptr [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3300_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx    /* the nri argument slot doubles as the loop counter */
+       jmp .i3300_outer
+.i3300_end:
+       /* leave MMX state, drop the aligned local area, restore registers */
+       emms
+       mov eax, [esp + salign]         /* alignment padding saved on the stack */
+       lea esp, [esp + eax + 344]      /* release padding plus 344 bytes in one step */
+       pop edi
+       pop esi
+       pop edx
+       pop ecx
+       pop ebx
+       pop eax
+       leave
+       ret
+
+
+
+
+
+.globl inl3310_sse
+       .type inl3310_sse,@function
+inl3310_sse:   
+       /* Entry of inl3310_sse: argument and local-variable offsets, then  */
+       /* the prologue that builds an aligned stack frame and loads the    */
+       /* SSE constants.                                                   */
+       /* offsets of the stack arguments relative to ebp (first one at ebp+8) */
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+.equ           nsatoms,        84                      
+       /* stack offsets for local variables */ 
+       /* bottom of stack is cache-aligned for sse use */
+       /* offsets 0-304 are 16-byte slots accessed with movaps */
+.equ           ix,               0
+.equ           iy,              16
+.equ           iz,              32
+.equ           iq,              48
+.equ           dx,              64
+.equ           dy,              80
+.equ           dz,              96
+.equ           two,            112
+.equ           tsc,            128
+.equ           qq,             144     
+.equ           c6,             160
+.equ           c12,            176
+.equ           fscal,          192
+.equ           vctot,          208
+.equ           vnbtot,         224
+.equ           fix,            240
+.equ           fiy,            256
+.equ           fiz,            272
+.equ           half,           288
+.equ           three,          304
+       /* offsets from 320 on are 4-byte scalars */
+.equ           is3,            320
+.equ           ii3,            324
+.equ           shX,            328
+.equ           shY,            332
+.equ           shZ,            336
+.equ           ntia,           340     
+.equ           innerjjnr0,     344
+.equ           innerk0,        348     
+.equ           innerjjnr,      352
+.equ           innerk,         356
+.equ           salign,         360                                                     
+.equ           nsvdwc,         364
+.equ           nscoul,         368
+.equ           nsvdw,          372
+.equ           solnr,          376             
+       push ebp
+       mov ebp,esp     
+       push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 380            /* local stack space */
+       /* round esp down to a 16-byte boundary and remember the adjustment */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       /* copy the constants sse_half/sse_two/sse_three (defined outside this block) to aligned stack slots */
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movss xmm3, [ebp + tabscale]    /* tabscale is read directly from the argument slot */
+       movaps [esp + half],  xmm0
+       movaps [esp + two], xmm1
+       movaps [esp + three], xmm2
+       shufps xmm3, xmm3, 0            /* broadcast tabscale to all four elements */
+       movaps [esp + tsc], xmm3
+       /* assume we have at least one i particle - start directly */   
+.i3310_outer:
+       /* Per-iteration setup of the outer loop: fetch the shift vector,   */
+       /* the first atom index, the three nsatoms counts and the j-list    */
+       /* range. Each argument pointer is advanced in place in its stack   */
+       /* slot; the adds carry an explicit dword size because a            */
+       /* memory/immediate operand pair has no register to size it.        */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movlps xmm0, [eax + ebx*4]      /* x and y of the shift vector */
+       movss xmm1, [eax + ebx*4 + 8]   /* z of the shift vector */
+       movlps [esp + shX], xmm0        /* fills shX and shY */
+       movss [esp + shZ], xmm1
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   eax, [ebp + nsatoms]
+       add   dword ptr [ebp + nsatoms],  12   /* three ints are consumed per iteration */
+       mov   ecx, [eax]        
+       mov   edx, [eax + 4]
+       mov   eax, [eax + 8]    
+       sub   ecx, eax                  /* ecx = nsatoms[0] - nsatoms[2] */
+       sub   eax, edx                  /* eax = nsatoms[2] - nsatoms[1] */
+       
+       mov   [esp + nsvdwc], edx
+       mov   [esp + nscoul], eax
+       mov   [esp + nsvdw], ecx
+               
+       /* clear potential */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       mov   [esp + solnr],  ebx
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr0], eax     /* pointer to jjnr[nj0] */
+       mov   [esp + innerk0], edx        /* number of innerloop atoms */
+
+       mov   ecx, [esp + nsvdwc]
+       test  ecx, ecx                    /* nsvdwc == 0 ? */
+       jnz   .i3310_mno_vdwc
+       jmp   .i3310_testcoul
+.i3310_mno_vdwc:
+       /* Set up one i atom: take the next atom index from solnr, compute  */
+       /* its scaled charge, nbfp row offset and shifted coordinates,      */
+       /* clear its force accumulators, then enter the inner j loop.       */
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]   /* next pass uses the following atom index */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       mulss xmm3, [ebp + facel]       /* xmm3 = facel*charge[ii] */
+       shufps xmm3, xmm3, 0            /* broadcast to all four elements */
+
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx         /* ntia = 2*ntype*type[ii] */
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+       addss xmm0, [eax + ebx*4]       /* shifted i coordinates */
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+       /* store the broadcast i charge */
+       movaps [esp + iq], xmm3
+       
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear i forces */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+       
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* number of innerloop atoms, minus 4 */
+       jge   .i3310_unroll_vdwc_loop   /* flags still come from the sub; mov leaves them alone */
+       jmp   .i3310_finish_vdwc_inner
+.i3310_unroll_vdwc_loop:       
+       /* quad-unroll innerloop here */
+       /* Four j particles per pass, one per xmm element. Their indices    */
+       /* live in eax, ebx, ecx, edx (j1-j4) and are parked in mm0-mm3     */
+       /* whenever those registers are needed for something else.          */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       /* explicit dword size: memory/immediate add has no register to size it */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0 
+       shufps xmm4, xmm7, 0 
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */ 
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       mulps  xmm3, xmm2
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       movaps [esp + qq], xmm3
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi            /* eax-edx = 2*type[j] + ntia, indices into nbfp */
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000   /* even elements: first value of each pair */
+       shufps xmm6, xmm7, 0b11011101   /* odd elements: second value of each pair */
+       
+       movd  eax, mm0          /* restore jnr1-4 */
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = z of j1-j4 */
+       
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = x of j1-j4 */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = y of j1-j4 */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       /* rinv = half * lu * (three - rsq*lu*lu): one Newton-Raphson step on the rsqrtps seed */
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 2
+       pslld mm7, 2
+
+       movd mm0, eax           /* park j3 offsets while eax-edx hold table indices */
+       movd mm1, ebx
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6
+       psrlq mm6, 32
+       movd ecx, mm7
+       psrlq mm7, 32
+       movd ebx, mm6
+       movd edx, mm7
+
+       /* index*4*3: with the *4 scale below, 12 floats per table point */
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       lea   ecx, [ecx + ecx*2]
+       lea   edx, [edx + edx*2]
+               
+       movlps xmm5, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm5, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* coulomb table ready, in xmm4-xmm7 */         
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       /* put scalar force on stack temporarily */
+       movaps [esp + fscal], xmm3
+
+       /* dispersion */
+       movlps xmm5, [esi + eax*4 + 16]
+       movlps xmm7, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + ebx*4 + 16]
+       movhps xmm7, [esi + edx*4 + 16] /* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+       
+       movlps xmm7, [esi + eax*4 + 24]
+       movlps xmm3, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + ebx*4 + 24]
+       movhps xmm3, [esi + edx*4 + 24] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */      
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+       addps  xmm7, [esp + fscal] /* add to fscal */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm5, [esi + eax*4 + 32]
+       movlps xmm7, [esi + ecx*4 + 32]
+       movhps xmm5, [esi + ebx*4 + 32]
+       movhps xmm7, [esi + edx*4 + 32] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 40]
+       movlps xmm3, [esi + ecx*4 + 40]
+       movhps xmm7, [esi + ebx*4 + 40]
+       movhps xmm3, [esi + edx*4 + 40] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* table ready, in xmm4-xmm7 */ 
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal] /* xmm7 = fijC + fijD + fijR */
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0        /* summed FF * tsc * rinv */
+       subps  xmm4, xmm7       /* xmm4 = negated scalar force factor */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movd eax, mm0           /* restore the j3 offsets */
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3/xmm4 contain fjx/fjy; the z forces are handled separately below */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* tz of j2 into element 0 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* tz of j3 into element 0 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* tz of j4 into element 0 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i3310_finish_vdwc_inner
+       jmp   .i3310_unroll_vdwc_loop
+.i3310_finish_vdwc_inner:
+       /* check if at least two particles remain */
+       /* innerk went negative in the unrolled loop; adding 4 back gives the leftover count */
+       add   dword ptr [esp + innerk],  4      /* explicit size for the memory/immediate add */
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   .i3310_dopair_vdwc
+       jmp   .i3310_checksingle_vdwc
+.i3310_dopair_vdwc:            /* two leftover j-particles: coulomb + dispersion + repulsion from VFtab; only lanes 0-1 carry data */
+       mov esi, [ebp + charge]
+
+        mov   ecx, [esp + innerjjnr]
+
+       mov   eax, [ecx]                /* eax = first j index */
+       mov   ebx, [ecx + 4]            /* ebx = second j index */
+       add   dword ptr [esp + innerjjnr],  8   /* consume two 4-byte entries; explicit dword size, no register operand to imply it */
+       xorps xmm7, xmm7
+       movss xmm3, [esi + eax*4]
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       mulps  xmm3, [esp + iq]
+       movlhps xmm3, xmm7              /* zero lanes 2-3 so unused lanes add nothing */
+       movaps [esp + qq], xmm3
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       shl edx, 1
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi
+       movlps xmm6, [esi + ecx*4]      /* two consecutive nbfp floats for the first j */
+       movhps xmm6, [esi + edx*4]      /* and for the second j */
+       mov edi, [ebp + pos]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       /* lanes 0-1 = first float of each pair (stored as c6) */
+       shufps xmm6, xmm6, 0b1101       /* lanes 0-1 = second float of each pair (stored as c12) */
+       movlhps xmm4, xmm7
+       movlhps xmm6, xmm7
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       lea   eax, [eax + eax*2]        /* j index * 3 */
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]
+
+       movlhps xmm3, xmm7              /* no effect on results: xmm3 is reloaded from qq before its next use */
+
+       shufps xmm2, xmm0, 0
+
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* z of both j atoms in lanes 0-1 */
+
+       shufps xmm0, xmm0, 0b10001000   /* x of both j atoms in lanes 0-1 */
+       shufps xmm1, xmm1, 0b11011101   /* y of both j atoms in lanes 0-1 */
+
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv (one refinement step on the rsqrtps seed) */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6
+       psrlq mm6, 32
+       movd edx, mm6
+       lea   ecx, [ecx + ecx*2]        /* index*4*3 = 12 floats per table point */
+       lea   edx, [edx + edx*2]
+
+       movlps xmm5, [esi + ecx*4]
+       movhps xmm5, [esi + edx*4] /* got half coulomb table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8]
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5
+
+       /* put scalar force on stack temporarily */
+       movaps [esp + fscal], xmm3
+
+       /* dispersion */
+       movlps xmm5, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 16]/* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+
+       movlps xmm7, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + edx*4 + 24] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+       addps  xmm7, [esp + fscal] /* add to fscal */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm5, [esi + ecx*4 + 32]
+       movhps xmm5, [esi + edx*4 + 32] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+
+       movlps xmm7, [esi + ecx*4 + 40]
+       movhps xmm7, [esi + edx*4 + 40] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready, in xmm4-xmm7 */
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal]
+
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -(fscal*tsc*rinv) */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+
+       shufps  xmm0, xmm0, 0b11100001  /* swap lanes 0 and 1: second j's force into lane 0 */
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5
+
+.i3310_checksingle_vdwc:
+       /* bit 0 of the leftover count set: one more j-particle to do; clear: go to the outer update */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jz     .i3310_updateouterdata_vdwc     /* otherwise fall through into .i3310_dosingle_vdwc */
+.i3310_dosingle_vdwc:          /* one leftover j-particle: coulomb + dispersion + repulsion from VFtab; only lane 0 carries data */
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]                /* eax = j index */
+       xorps  xmm6, xmm6
+       movss xmm6, [esi + eax*4]       /* xmm6(0) has the charge */    
+       mulps  xmm6, [esp + iq]
+       movaps [esp + qq], xmm6
+
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]
+       movlps xmm6, [esi + ecx*4]      /* two consecutive nbfp floats into lanes 0-1; lanes 2-3 keep the zeros from qq */
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* lane 0 = first float, other lanes = lane 3 */
+       shufps xmm6, xmm6, 0b11111101   /* lane 0 = second float, other lanes = lane 3 */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2]        /* j index * 3 */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv (one refinement step on the rsqrtps seed) */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6
+       
+       lea  ebx, [ebx + ebx*2]         /* index*4*3 = 12 floats per table point */
+                                               
+       movlps xmm4, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1            /* bring lane 1 down to lane 0 */
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       /* put scalar force on stack temporarily */
+       movaps [esp + fscal], xmm3
+
+       /* dispersion */
+       movlps xmm4, [esi + ebx*4 + 16]
+       movlps xmm6, [esi + ebx*4 + 24]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+       addps  xmm7, [esp + fscal] /* add to fscal */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion */
+       movlps xmm4, [esi + ebx*4 + 32]
+       movlps xmm6, [esi + ebx*4 + 40]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal] 
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -(fscal*tsc*rinv) */
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i3310_updateouterdata_vdwc:   /* reduce the four-lane fix/fiy/fiz accumulators and add them to faction[ii3] and fshift[is3] */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0              /* lanes 2-3 copied down to lanes 0-1 */
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1            /* lane 1 down to lane 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+
+       /* loop back to mno */
+       dec  dword ptr [esp + nsvdwc]   /* one fewer pass left in nsvdwc */
+       jz  .i3310_testcoul             /* counter reached zero: continue at .i3310_testcoul */
+       jmp .i3310_mno_vdwc
+.i3310_testcoul:
+       /* run the coul loop only if the nscoul counter is non-zero */
+       mov  ecx, [esp + nscoul]
+       cmp  ecx,  0
+       jz   .i3310_testvdw             /* otherwise fall through into .i3310_mno_coul */
+.i3310_mno_coul:               /* set up one i-atom for the coul inner loop: iq, ix/iy/iz, cleared fix/fiy/fiz, reset j-list pointer and count */
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]   /* ebx keeps the current index; the stored solnr moves on by one */
+
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       mulss xmm3, [ebp + facel]
+       shufps xmm3, xmm3, 0            /* broadcast charge*facel to all four lanes */
+       
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+
+       movaps [esp + iq], xmm3
+       
+       shufps xmm0, xmm0, 0            /* broadcast shift + position to all four lanes */
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   [esp + ii3], ebx
+       
+       /* clear i forces */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3310_unroll_coul_loop    /* tests the flags from the sub above; the mov in between leaves them alone */
+       jmp   .i3310_finish_coul_inner
+
+.i3310_unroll_coul_loop:       /* coulomb-table interaction for four j-particles per pass, one per SSE lane */
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]
+       mov   ebx, [edx + 4]
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4); explicit dword size, no register operand to imply it */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       movaps xmm2, [esp + iq]
+       shufps xmm3, xmm6, 0
+       shufps xmm4, xmm7, 0
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */
+       mulps  xmm3, xmm2
+
+       movaps [esp + qq], xmm3
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]
+
+       /* move four coordinates to xmm0-xmm2 */
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000   /* xmm2 = z of j1..j4 */
+
+       shufps xmm0, xmm5, 0b10001000   /* xmm0 = x of j1..j4 */
+       shufps xmm1, xmm5, 0b11011101   /* xmm1 = y of j1..j4 */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv (one refinement step on the rsqrtps seed) */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5
+       subps xmm4, xmm6
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 2
+       pslld mm7, 2
+
+       movd mm0, eax           /* park the four j3 offsets in mm0-mm3 while eax-edx hold table offsets */
+       movd mm1, ebx
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6
+       psrlq mm6, 32
+       movd ecx, mm7
+       psrlq mm7, 32
+       movd ebx, mm6
+       movd edx, mm7
+
+       lea   eax, [eax + eax*2]        /* index*4*3 = 12 floats per table point */
+       lea   ebx, [ebx + ebx*2]
+       lea   ecx, [ecx + ecx*2]
+       lea   edx, [edx + edx*2]
+
+       movlps xmm5, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm5, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* coulomb table ready, in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5
+
+       xorps  xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3       /* xmm4 = -(fijC*tsc*rinv) */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movd eax, mm0           /* restore the four j3 offsets */
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101
+
+       /* now xmm3 = fjx and xmm4 = fjy for j1..j4; fjz is handled with scalars below */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101   /* lane 1 (j2) down to lane 0 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010   /* lane 2 (j3) down to lane 0 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111   /* lane 3 (j4) down to lane 0 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+
+       /* should we do one more iteration? (explicit dword size: no register operand to imply it) */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i3310_finish_coul_inner
+       jmp   .i3310_unroll_coul_loop
+.i3310_finish_coul_inner:      /* leftover handling after the 4-way unrolled coul loop */
+       /* add back the 4 taken off by the loop test, then check if at least two particles remain; the dword size is spelled out because a memory/immediate add has no register to imply it */
+       add   dword ptr [esp + innerk],  4
+       mov   edx, [esp + innerk]
+       and   edx, 2                    /* bit 1 set: a pair is left */
+       jnz   .i3310_dopair_coul
+       jmp   .i3310_checksingle_coul
+.i3310_dopair_coul:            /* two leftover j-particles, coulomb table only; only lanes 0-1 carry data */
+       mov esi, [ebp + charge]
+
+        mov   ecx, [esp + innerjjnr]
+
+       mov   eax, [ecx]                /* eax = first j index */
+       mov   ebx, [ecx + 4]            /* ebx = second j index */
+       add   dword ptr [esp + innerjjnr],  8   /* consume two 4-byte entries; explicit dword size, no register operand to imply it */
+       xorps xmm7, xmm7
+       movss xmm3, [esi + eax*4]
+       movss xmm6, [esi + ebx*4]
+       shufps xmm3, xmm6, 0
+       shufps xmm3, xmm3, 0b00001000 /* xmm3(0,1) has the charges */
+
+       mulps  xmm3, [esp + iq]
+       movlhps xmm3, xmm7              /* zero lanes 2-3 so unused lanes add nothing */
+       movaps [esp + qq], xmm3
+
+       mov edi, [ebp + pos]
+
+       lea   eax, [eax + eax*2]        /* j index * 3 */
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]
+
+       movlhps xmm3, xmm7              /* no effect on results: xmm3 is reloaded from qq before its next use */
+
+       shufps xmm2, xmm0, 0
+
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000   /* z of both j atoms in lanes 0-1 */
+
+       shufps xmm0, xmm0, 0b10001000   /* x of both j atoms in lanes 0-1 */
+       shufps xmm1, xmm1, 0b11011101   /* y of both j atoms in lanes 0-1 */
+
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2
+       mulps xmm0, xmm1        /* xmm0=rinv (one refinement step on the rsqrtps seed) */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6
+       psrlq mm6, 32
+       movd edx, mm6
+
+       lea   ecx, [ecx + ecx*2]        /* index*4*3 = 12 floats per table point */
+       lea   edx, [edx + edx*2]
+
+       movlps xmm5, [esi + ecx*4]
+       movhps xmm5, [esi + edx*4] /* got half coulomb table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm7, 0b11011101   /* lanes 0-1 from xmm5; lanes 2-3 from xmm7, which is still zero here */
+
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8]
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5
+
+       xorps  xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3       /* xmm4 = -(fijC*tsc*rinv) */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5
+
+       shufps  xmm0, xmm0, 0b11100001  /* swap lanes 0 and 1: second j's force into lane 0 */
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5
+
+.i3310_checksingle_coul:
+       /* bit 0 of the leftover count set: one more j-particle to do; clear: go to the outer update */
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jz     .i3310_updateouterdata_coul     /* otherwise fall through into .i3310_dosingle_coul */
+.i3310_dosingle_coul:          /* one leftover j-particle, coulomb table only; only lane 0 carries data */
+       mov esi, [ebp + charge]
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]                /* eax = j index */
+       xorps  xmm6, xmm6
+       movss xmm6, [esi + eax*4]       /* xmm6(0) has the charge */    
+       mulps  xmm6, [esp + iq]
+       movaps [esp + qq], xmm6
+               
+       lea   eax, [eax + eax*2]        /* j index * 3 */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv (one refinement step on the rsqrtps seed) */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6
+       
+       lea   ebx, [ebx + ebx*2]        /* index*4*3 = 12 floats per table point */
+
+       movlps xmm4, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1            /* bring lane 1 down to lane 0 */
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       movaps xmm3, [esp + qq]
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+       mulps  xmm3, xmm7 /* fijC=FF*qq */
+       /* at this point xmm5 contains vcoul and xmm3 fijC */
+       /* increment vcoul - then we can get rid of xmm5 */
+       /* update vctot */
+       addps  xmm5, [esp + vctot]
+       movaps [esp + vctot], xmm5 
+
+       xorps xmm4, xmm4
+
+       mulps xmm3, [esp + tsc]
+       mulps xmm3, xmm0
+       subps  xmm4, xmm3       /* xmm4 = -(fijC*tsc*rinv) */
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i3310_updateouterdata_coul: /* reduce the fix/fiy/fiz accumulators and add them to faction[] and fshift[] */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 now hold the two partial sums */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1    /* bring lane 1 down to lane 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force: faction[ii3 .. ii3+2] += (xmm0, xmm1, xmm2) */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force: fshift[is3 .. is3+2] += the same three sums */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+
+       /* loop back to mno: nscoul counts down; at zero continue with the nsvdw test */
+       dec  dword ptr [esp + nscoul]
+       jz  .i3310_testvdw
+       jmp .i3310_mno_coul
+.i3310_testvdw:        /* dispatch on the nsvdw counter */
+       mov  ecx, [esp + nsvdw]
+       test ecx, ecx           /* ZF is set exactly when nsvdw is zero */
+       jnz  .i3310_mno_vdw     /* nonzero: run the VdW-only section */
+       jmp  .i3310_last_mno    /* zero: go straight to the energy update */
+.i3310_mno_vdw:        /* set up one i particle (index solnr, post-incremented) for the VdW-only inner loop */
+       mov   ebx,  [esp + solnr]
+       inc   dword ptr [esp + solnr]
+
+        mov   edx, [ebp + type] 
+        mov   edx, [edx + ebx*4]
+        imul  edx, [ebp + ntype]
+        shl   edx, 1
+        mov   [esp + ntia], edx  /* ntia = 2*ntype*type[ii] */
+               
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx
+
+       movss xmm0, [esp + shX]
+       movss xmm1, [esp + shY]
+       movss xmm2, [esp + shZ]
+
+       addss xmm0, [eax + ebx*4]
+       addss xmm1, [eax + ebx*4 + 4]
+       addss xmm2, [eax + ebx*4 + 8]
+       /* xmm0-xmm2 lane 0 = shX/shY/shZ + pos[ii3 .. ii3+2] */
+       xorps xmm4, xmm4
+       movaps [esp + fix], xmm4
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm4
+       /* i force accumulators cleared; broadcast lane 0 to all four lanes */
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+
+       movaps [esp + ix], xmm0
+       movaps [esp + iy], xmm1
+       movaps [esp + iz], xmm2
+
+       mov   ecx, [esp + innerjjnr0]
+       mov   [esp + innerjjnr], ecx    /* restart the j list pointer from innerjjnr0 */
+       mov   edx, [esp + innerk0]
+        sub   edx,  4
+        mov   [esp + innerk], edx        /* number of innerloop atoms */
+       jge   .i3310_unroll_vdw_loop    /* flags still come from the sub (mov leaves them alone): at least 4 j left */
+       jmp   .i3310_finish_vdw_inner
+.i3310_unroll_vdw_loop:        /* VdW-only inner loop, four j particles per iteration */
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4]              
+       mov   ecx, [edx + 8]            
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4); explicit size, the operand is otherwise ambiguous */
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi            /* eax-edx = ntia + 2*type[j], index of the parameter pair in nbfp */
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000  /* xmm4 = first float of each pair (stored as c6) */
+       shufps xmm6, xmm7, 0b11011101  /* xmm6 = second float of each pair (stored as c12) */
+       
+       movd  eax, mm0          
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+       
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000  /* xmm2 = z of j1-j4 */
+       
+       shufps xmm0, xmm5, 0b10001000  /* xmm0 = x of j1-j4 */
+       shufps xmm1, xmm5, 0b11011101  /* xmm1 = y of j1-j4 */
+
+       /* move ix-iz to xmm4-xmm6 */
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       movhlps xmm5, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm5     /* mm6/mm7 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       cvtpi2ps xmm5, mm7
+       movlhps xmm6, xmm5
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+       pslld mm6, 2
+       pslld mm7, 2
+
+       movd mm0, eax           /* park j3 offsets in mm0-mm3 while eax-edx hold table indices */
+       movd mm1, ebx
+       movd mm2, ecx
+       movd mm3, edx
+
+       mov  esi, [ebp + VFtab]
+       movd eax, mm6
+       psrlq mm6, 32
+       movd ecx, mm7
+       psrlq mm7, 32
+       movd ebx, mm6
+       movd edx, mm7
+
+       lea   eax, [eax + eax*2] /* 12*index; with the *4 scale a table point spans 48 bytes */
+       lea   ebx, [ebx + ebx*2] 
+       lea   ecx, [ecx + ecx*2] 
+       lea   edx, [edx + edx*2] 
+
+       /* dispersion -- NOTE(review): read from +0/+8 of the 48-byte table point, the same offsets .i3310_dosingle_coul uses for its Coulomb lookup; confirm the VFtab layout */
+       movlps xmm5, [esi + eax*4 + 0]
+       movlps xmm7, [esi + ecx*4 + 0]
+       movhps xmm5, [esi + ebx*4 + 0]
+       movhps xmm7, [esi + edx*4 + 0] /* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+       
+       movlps xmm7, [esi + eax*4 + 8]
+       movlps xmm3, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + ebx*4 + 8]
+       movhps xmm3, [esi + edx*4 + 8] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */      
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion (read from +16/+24 of the same table point) */
+       movlps xmm5, [esi + eax*4 + 16]
+       movlps xmm7, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + ebx*4 + 16]
+       movhps xmm7, [esi + edx*4 + 16] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm5, xmm7, 0b11011101
+
+       movlps xmm7, [esi + eax*4 + 24]
+       movlps xmm3, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + ebx*4 + 24]
+       movhps xmm3, [esi + edx*4 + 24] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm3, 0b10001000
+       shufps xmm7, xmm3, 0b11011101
+       /* table ready, in xmm4-xmm7 */ 
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal] 
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -((fijD+fijR)*tsc*rinv), the factor applied to dr */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       movd eax, mm0           /* eax-edx = j3 offsets again */
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3
+
+       mov    edi, [ebp + faction]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* the fj's - start by accumulating x & y forces from memory */
+       movlps xmm4, [edi + eax*4]
+       movlps xmm6, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm6, [edi + edx*4]
+
+       movaps xmm3, xmm4
+       shufps xmm3, xmm6, 0b10001000
+       shufps xmm4, xmm6, 0b11011101                         
+
+       /* now xmm3 = fjx and xmm4 = fjy for j1-j4; the z components are handled separately below */
+       subps  xmm3, xmm0
+       subps  xmm4, xmm1
+       
+       /* unpack them back so we can store them - first x & y in xmm3/xmm4 */
+
+       movaps xmm6, xmm3
+       unpcklps xmm6, xmm4
+       unpckhps xmm3, xmm4     
+       /* xmm6(l)=x & y for j1, (h) for j2 */
+       /* xmm3(l)=x & y for j3, (h) for j4 */
+       movlps [edi + eax*4], xmm6
+       movlps [edi + ecx*4], xmm3
+       
+       movhps [edi + ebx*4], xmm6
+       movhps [edi + edx*4], xmm3
+
+       /* and the z forces */
+       movss  xmm4, [edi + eax*4 + 8]
+       movss  xmm5, [edi + ebx*4 + 8]
+       movss  xmm6, [edi + ecx*4 + 8]
+       movss  xmm7, [edi + edx*4 + 8]
+       subss  xmm4, xmm2
+       shufps xmm2, xmm2, 0b11100101  /* lane 1 (j2) -> lane 0 */
+       subss  xmm5, xmm2
+       shufps xmm2, xmm2, 0b11101010  /* lane 2 (j3) -> lane 0 */
+       subss  xmm6, xmm2
+       shufps xmm2, xmm2, 0b11111111  /* lane 3 (j4) -> lane 0 */
+       subss  xmm7, xmm2
+       movss  [edi + eax*4 + 8], xmm4
+       movss  [edi + ebx*4 + 8], xmm5
+       movss  [edi + ecx*4 + 8], xmm6
+       movss  [edi + edx*4 + 8], xmm7
+       
+       /* should we do one more iteration? */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i3310_finish_vdw_inner
+       jmp   .i3310_unroll_vdw_loop
+.i3310_finish_vdw_inner:       /* fewer than four j particles left in the VdW-only loop */
+       /* check if at least two particles remain */
+       add   dword ptr [esp + innerk],  4      /* undo the last sub of 4; explicit size, the operand is otherwise ambiguous */
+       mov   edx, [esp + innerk]
+       and   edx, 2
+       jnz   .i3310_dopair_vdw         /* bit 1 set: handle two j particles */
+       jmp   .i3310_checksingle_vdw
+.i3310_dopair_vdw:     /* VdW-only: two j particles in lanes 0-1; lanes 2-3 carry no parameters (c6/c12 are zero there) */
+        mov   ecx, [esp + innerjjnr]
+       
+       mov   eax, [ecx]        
+       mov   ebx, [ecx + 4]              
+       add   dword ptr [esp + innerjjnr],  8   /* advance past the two j indices; explicit size, the operand is otherwise ambiguous */
+       xorps xmm7, xmm7
+
+       mov esi, [ebp + type]
+       mov   ecx, eax
+       mov   edx, ebx
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add ecx, edi
+       add edx, edi            /* ecx/edx = ntia + 2*type[j] */
+       movlps xmm6, [esi + ecx*4]
+       movhps xmm6, [esi + edx*4]
+       mov edi, [ebp + pos]    
+       
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b1000       
+       shufps xmm6, xmm6, 0b1101
+       movlhps xmm4, xmm7      /* zero lanes 2-3 of c6 */
+       movlhps xmm6, xmm7      /* zero lanes 2-3 of c12 */
+       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+                       
+       lea   eax, [eax + eax*2]
+       lea   ebx, [ebx + ebx*2]
+       /* move coordinates to xmm0-xmm2 */
+       movlps xmm1, [edi + eax*4]
+       movss xmm2, [edi + eax*4 + 8]   
+       movhps xmm1, [edi + ebx*4]
+       movss xmm0, [edi + ebx*4 + 8]   
+
+       /* gather z into xmm2, then split x/y out of xmm1; lanes 2-3 duplicate lanes 0-1 */
+       
+       shufps xmm2, xmm0, 0
+       
+       movaps xmm0, xmm1
+
+       shufps xmm2, xmm2, 0b10001000
+       
+       shufps xmm0, xmm0, 0b10001000
+       shufps xmm1, xmm1, 0b11011101
+                       
+       mov    edi, [ebp + faction]
+       /* move ix-iz to xmm4-xmm6 */
+       xorps   xmm7, xmm7
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ecx, mm6
+       psrlq mm6, 32
+       movd edx, mm6
+
+       lea   ecx, [ecx + ecx*2] /* 12*index; with the *4 scale a table point spans 48 bytes */
+       lea   edx, [edx + edx*2] 
+
+       /* dispersion -- NOTE(review): read from +0/+8 of the 48-byte table point, the same offsets .i3310_dosingle_coul uses for its Coulomb lookup; confirm the VFtab layout */
+       movlps xmm5, [esi + ecx*4 + 0]
+       movhps xmm5, [esi + edx*4 + 0]/* got half dispersion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+       
+       movlps xmm7, [esi + ecx*4 + 8]
+       movhps xmm7, [esi + edx*4 + 8] /* other half of dispersion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* dispersion table ready, in xmm4-xmm7 */      
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion: self-shuffles like the dispersion lookup, so no leftover register contents enter lanes 2-3 */
+       movlps xmm5, [esi + ecx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 16] /* got half repulsion table */
+       movaps xmm4, xmm5
+       shufps xmm4, xmm4, 0b10001000
+       shufps xmm5, xmm5, 0b11011101
+
+       movlps xmm7, [esi + ecx*4 + 24]
+       movhps xmm7, [esi + edx*4 + 24] /* other half of repulsion table */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 0b10001000
+       shufps xmm7, xmm7, 0b11011101
+       /* table ready, in xmm4-xmm7 */ 
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal] 
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -((fijD+fijR)*tsc*rinv), the factor applied to dr */
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update the fj's: first j (lane 0) */
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+
+       shufps  xmm0, xmm0, 0b11100001  /* swap lanes 0 and 1: second j now in lane 0 */
+       shufps  xmm1, xmm1, 0b11100001
+       shufps  xmm2, xmm2, 0b11100001
+
+       movss   xmm3, [edi + ebx*4]
+       movss   xmm4, [edi + ebx*4 + 4]
+       movss   xmm5, [edi + ebx*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + ebx*4], xmm3
+       movss   [edi + ebx*4 + 4], xmm4
+       movss   [edi + ebx*4 + 8], xmm5 
+
+.i3310_checksingle_vdw:        /* branch on bit 0 of innerk */                        
+       mov   edx, [esp + innerk]
+       and   edx, 1
+       jnz    .i3310_dosingle_vdw      /* bit set: handle one j particle */
+       jmp    .i3310_updateouterdata_vdw
+.i3310_dosingle_vdw:   /* VdW-only: a single j particle, live data in lane 0 */
+       mov edi, [ebp + pos]
+       mov   ecx, [esp + innerjjnr]
+       mov   eax, [ecx]        /* eax = j index read through innerjjnr */
+       xorps  xmm6, xmm6
+
+       mov esi, [ebp + type]
+       mov ecx, eax
+       mov ecx, [esi + ecx*4]  
+       mov esi, [ebp + nbfp]
+       shl ecx, 1
+       add ecx, [esp + ntia]   /* ecx = ntia + 2*type[j] */
+       movlps xmm6, [esi + ecx*4]
+       movaps xmm4, xmm6
+       shufps xmm4, xmm4, 0b11111100   /* lane 0 = first float of the pair, other lanes zero */
+       shufps xmm6, xmm6, 0b11111101   /* lane 0 = second float of the pair, other lanes zero */
+                       
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6        
+               
+       lea   eax, [eax + eax*2] /* eax = 3*j */
+       
+       /* move coordinates to xmm0-xmm2 */
+       movss xmm0, [edi + eax*4]       
+       movss xmm1, [edi + eax*4 + 4]   
+       movss xmm2, [edi + eax*4 + 8]    
+       
+       movaps xmm4, [esp + ix]
+       movaps xmm5, [esp + iy]
+       movaps xmm6, [esp + iz]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dx], xmm4
+       movaps [esp + dy], xmm5
+       movaps [esp + dz], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* three-rsq*lu*lu */
+       mulps xmm1, xmm2        
+       mulps xmm0, xmm1        /* xmm0=rinv */
+
+       mulps xmm4, xmm0        /* xmm4=r */
+       mulps xmm4, [esp + tsc]
+
+       cvttps2pi mm6, xmm4     /* mm6 contain lu indices */
+       cvtpi2ps xmm6, mm6
+       subps xmm4, xmm6        
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1       
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+
+       pslld mm6, 2
+
+       mov  esi, [ebp + VFtab]
+       movd ebx, mm6
+
+       lea   ebx, [ebx + ebx*2]        /* 12*index; with the *4 scale a table point spans 48 bytes */
+
+       /* dispersion -- NOTE(review): read from +0/+8 of the 48-byte table point, the same offsets .i3310_dosingle_coul uses for its Coulomb lookup; confirm the VFtab layout */
+       movlps xmm4, [esi + ebx*4 + 0]
+       movlps xmm6, [esi + ebx*4 + 8]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+
+       movaps xmm4, [esp + c6]
+       mulps  xmm7, xmm4        /* fijD */
+       mulps  xmm5, xmm4        /* vnb6 */
+
+       /* put scalar force on stack Update vnbtot directly */
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + fscal], xmm7
+       movaps [esp + vnbtot], xmm5
+
+       /* repulsion (read from +16/+24 of the same table point) */
+       movlps xmm4, [esi + ebx*4 + 16]
+       movlps xmm6, [esi + ebx*4 + 24]
+       movaps xmm5, xmm4
+       movaps xmm7, xmm6
+       shufps xmm5, xmm5, 1
+       shufps xmm7, xmm7, 1
+       /* table ready in xmm4-xmm7 */
+       
+       mulps  xmm6, xmm1       /* xmm6=Geps */
+       mulps  xmm7, xmm2       /* xmm7=Heps2 */
+       addps  xmm5, xmm6
+       addps  xmm5, xmm7       /* xmm5=Fp */   
+       mulps  xmm7, [esp + two]        /* two*Heps2 */
+       addps  xmm7, xmm6
+       addps  xmm7, xmm5 /* xmm7=FF */
+       mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+       addps  xmm5, xmm4 /* xmm5=VV */
+       
+       movaps xmm4, [esp + c12]
+       mulps  xmm7, xmm4 /* fijR */
+       mulps  xmm5, xmm4 /* vnb12 */
+       addps  xmm7, [esp + fscal] 
+       
+       addps  xmm5, [esp + vnbtot]
+       movaps [esp + vnbtot], xmm5
+       xorps  xmm4, xmm4
+
+       mulps xmm7, [esp + tsc]
+       mulps xmm7, xmm0
+       subps  xmm4, xmm7       /* xmm4 = -((fijD+fijR)*tsc*rinv), the factor applied to dr */
+       mov    edi, [ebp + faction]
+
+       movaps xmm0, [esp + dx]
+       movaps xmm1, [esp + dy]
+       movaps xmm2, [esp + dz]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+       /* xmm0-xmm2 contains tx-tz (partial force) */
+       /* now update f_i */
+       movaps xmm3, [esp + fix]
+       movaps xmm4, [esp + fiy]
+       movaps xmm5, [esp + fiz]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm5, xmm2
+       movaps [esp + fix], xmm3
+       movaps [esp + fiy], xmm4
+       movaps [esp + fiz], xmm5
+       /* update fj: subtract lane 0 of tx-tz from faction[3*j .. 3*j+2] */
+       
+       movss   xmm3, [edi + eax*4]
+       movss   xmm4, [edi + eax*4 + 4]
+       movss   xmm5, [edi + eax*4 + 8]
+       subss   xmm3, xmm0
+       subss   xmm4, xmm1
+       subss   xmm5, xmm2      
+       movss   [edi + eax*4], xmm3
+       movss   [edi + eax*4 + 4], xmm4
+       movss   [edi + eax*4 + 8], xmm5 
+.i3310_updateouterdata_vdw: /* reduce the fix/fiy/fiz accumulators and add them to faction[] and fshift[] */
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fix]
+       movaps xmm1, [esp + fiy]
+       movaps xmm2, [esp + fiz]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 now hold the two partial sums */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1    /* bring lane 1 down to lane 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force: faction[ii3 .. ii3+2] += (xmm0, xmm1, xmm2) */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* increment fshift force: fshift[is3 .. is3+2] += the same three sums */ 
+       movss  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 4]
+       movss  xmm5, [esi + edx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esi + edx*4],     xmm3
+       movss  [esi + edx*4 + 4], xmm4
+       movss  [esi + edx*4 + 8], xmm5
+       
+       /* loop back to mno: nsvdw counts down; at zero continue with the energy update */
+       dec dword ptr [esp + nsvdw]
+       jz  .i3310_last_mno
+       jmp .i3310_mno_vdw
+.i3310_last_mno:       /* add the vctot and vnbtot lane sums to Vc[] and Vnb[], then continue or finish the outer loop */
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   dword ptr [ebp + gid],  4  /* advance pointer; explicit size, the operand is otherwise ambiguous */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6       /* lane 0 = sum of all four vctot lanes */
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6       /* lane 0 = sum of all four vnbtot lanes */
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last: the nri argument slot is used as the remaining-iterations counter */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3310_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i3310_outer
+.i3310_end:    /* epilogue: release the local frame and restore the saved registers */
+       emms                    /* leave MMX state, the loops above use mm0-mm7 as scratch */
+       mov eax, [esp + salign]
+       add esp, eax            /* undo the adjustment recorded in salign */
+       add esp, 380            /* presumably the size of the local area reserved in the prologue (not visible here) - confirm */
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
+
+
+.globl inl3320_sse
+       .type inl3320_sse,@function
+inl3320_sse:   
+.equ           nri,            8
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56                      
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72      
+.equ           tabscale,       76      
+.equ           VFtab,          80      
+       /* stack offsets for local variables */ 
+       /* bottom of stack is cache-aligned for sse use */
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           iqO,            144 
+.equ           iqH,            160 
+.equ           dxO,            176
+.equ           dyO,            192
+.equ           dzO,            208     
+.equ           dxH1,           224
+.equ           dyH1,           240
+.equ           dzH1,           256     
+.equ           dxH2,           272
+.equ           dyH2,           288
+.equ           dzH2,           304     
+.equ           qqO,            320
+.equ           qqH,            336
+.equ           rinvO,          352
+.equ           rinvH1,         368
+.equ           rinvH2,         384             
+.equ           rO,             400
+.equ           rH1,            416
+.equ           rH2,            432
+.equ           tsc,            448     
+.equ           two,            464
+.equ           c6,             480
+.equ           c12,            496
+.equ           vctot,          512
+.equ           vnbtot,         528
+.equ           fixO,           544
+.equ           fiyO,           560
+.equ           fizO,           576
+.equ           fixH1,          592
+.equ           fiyH1,          608
+.equ           fizH1,          624
+.equ           fixH2,          640
+.equ           fiyH2,          656
+.equ           fizH2,          672
+.equ           fjx,            688
+.equ           fjy,            704
+.equ           fjz,            720
+.equ           half,           736
+.equ           three,          752
+.equ           is3,            768
+.equ           ii3,            772
+.equ           ntia,           776     
+.equ           innerjjnr,      780
+.equ           innerk,         784
+.equ           salign,         788                                                             
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 792            /* local stack space */
+       mov  eax, esp
+       and  eax, 0xf
+       sub esp, eax
+       mov [esp + salign], eax
+
+       emms
+
+       movups xmm0, [sse_half]
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movss xmm3, [ebp +tabscale]
+       
+       movaps [esp + half],  xmm0
+       movaps [esp + two],  xmm1
+       movaps [esp + three],  xmm2
+       shufps xmm3, xmm3, 0
+       movaps [esp + tsc], xmm3
+       
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       
+       movss xmm4, [edx + ebx*4 + 4]   
+       movss xmm5, [ebp + facel]
+       mulss  xmm3, xmm5
+       mulss  xmm4, xmm5
+
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       movaps [esp + iqO], xmm3
+       movaps [esp + iqH], xmm4
+       
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]
+       shl   ecx, 1
+       imul  ecx, [ebp + ntype]      /* ecx = ntia = 2*ntype*type[ii0] */
+       mov   [esp + ntia], ecx         
+.i3320_outer:
+       /*
+        * Outer loop body: set up one i water (O, H1, H2) and its j list.
+        * The [ebp + shift], [ebp + iinr] and [ebp + jindex] slots are used as
+        * running pointers and are advanced in place.  The increments carry an
+        * explicit "dword ptr" so the assembler does not have to guess the
+        * operand size of a memory/immediate instruction (same form as the
+        * "dec dword ptr" used at the end of the odd loop).
+        */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       /* xmm0-xmm2 = the three components stored at shiftvec[is3 .. is3+2] */
+       movss xmm0, [eax + ebx*4]
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8]
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       /* keep a copy of the shift in xmm3-xmm5 for the O atom */
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */
+       mov   [esp + ii3], ebx
+
+       /* O: shift + pos[ii3 .. ii3+2], broadcast to all four lanes */
+       addss xmm3, [eax + ebx*4]
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       /* H1 uses the next three floats (byte offsets 12..20), H2 the three after (24..32) */
+       movss xmm3, xmm0                /* low lane only: second copy of the shift for H2 */
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]
+       addss xmm3, [eax + ebx*4 + 24]
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]
+
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+
+       /* clear vctot, vnbtot and the i force accumulators */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4  /* advance pointer */
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4
+       mov   [esp + innerk], edx        /* innerk = number of innerloop atoms - 4 */
+       jge   .i3320_unroll_loop         /* flags are still those of the sub; mov does not alter them */
+       jmp   .i3320_odd_inner
+.i3320_unroll_loop:
+       /* quad-unroll innerloop here: fetch four j indices from the jjnr list */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]
+       mov   ebx, [edx + 4]
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+
+       /* explicit dword ptr: a memory/immediate add has no register to imply the operand size */
+       add   dword ptr [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + charge]        /* base of charge[] */
+       
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + ecx*4]
+       movss xmm6, [esi + ebx*4]
+       movss xmm7, [esi + edx*4]
+
+       shufps xmm3, xmm6, 0 
+       shufps xmm4, xmm7, 0 
+       shufps xmm3, xmm4, 0b10001000 /* all charges in xmm3 */ 
+       movaps xmm4, xmm3            /* and in xmm4 */
+       mulps  xmm3, [esp + iqO]
+       mulps  xmm4, [esp + iqH]
+
+       movd  mm0, eax          /* use mmx registers as temp storage */
+       movd  mm1, ebx
+       movd  mm2, ecx
+       movd  mm3, edx
+
+       movaps  [esp + qqO], xmm3
+       movaps  [esp + qqH], xmm4
+       
+       mov esi, [ebp + type]
+       mov eax, [esi + eax*4]
+       mov ebx, [esi + ebx*4]
+       mov ecx, [esi + ecx*4]
+       mov edx, [esi + edx*4]
+       mov esi, [ebp + nbfp]
+       shl eax, 1      
+       shl ebx, 1      
+       shl ecx, 1      
+       shl edx, 1      
+       mov edi, [esp + ntia]
+       add eax, edi
+       add ebx, edi
+       add ecx, edi
+       add edx, edi
+
+       movlps xmm6, [esi + eax*4]
+       movlps xmm7, [esi + ecx*4]
+       movhps xmm6, [esi + ebx*4]
+       movhps xmm7, [esi + edx*4]
+
+       movaps xmm4, xmm6
+       shufps xmm4, xmm7, 0b10001000
+       shufps xmm6, xmm7, 0b11011101
+       
+       movd  eax, mm0          
+       movd  ebx, mm1
+       movd  ecx, mm2
+       movd  edx, mm3
+
+       movaps [esp + c6], xmm4
+       movaps [esp + c12], xmm6
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+
+       /* move four coordinates to xmm0-xmm2 */        
+       movlps xmm4, [esi + eax*4]
+       movlps xmm5, [esi + ecx*4]
+       movss xmm2, [esi + eax*4 + 8]
+       movss xmm6, [esi + ecx*4 + 8]
+
+       movhps xmm4, [esi + ebx*4]
+       movhps xmm5, [esi + edx*4]
+
+       movss xmm0, [esi + ebx*4 + 8]
+       movss xmm1, [esi + edx*4 + 8]
+
+       shufps xmm2, xmm0, 0
+       shufps xmm6, xmm1, 0
+       
+       movaps xmm0, xmm4
+       movaps xmm1, xmm4
+
+       shufps xmm2, xmm6, 0b10001000
+       
+       shufps xmm0, xmm5, 0b10001000
+       shufps xmm1, xmm5, 0b11011101           
+
+       /* move ixO-izO to xmm4-xmm6 */
+       movaps xmm4, [esp + ixO]
+       movaps xmm5, [esp + iyO]
+       movaps xmm6, [esp + izO]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxO], xmm4
+       movaps [esp + dyO], xmm5
+       movaps [esp + dzO], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm4, xmm5
+       addps xmm4, xmm6
+       movaps xmm7, xmm4
+       /* rsqO in xmm7 */
+
+       /* move ixH1-izH1 to xmm4-xmm6 */
+       movaps xmm4, [esp + ixH1]
+       movaps xmm5, [esp + iyH1]
+       movaps xmm6, [esp + izH1]
+
+       /* calc dr */
+       subps xmm4, xmm0
+       subps xmm5, xmm1
+       subps xmm6, xmm2
+
+       /* store dr */
+       movaps [esp + dxH1], xmm4
+       movaps [esp + dyH1], xmm5
+       movaps [esp + dzH1], xmm6
+       /* square it */
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       mulps xmm6,xmm6
+       addps xmm6, xmm5
+       addps xmm6, xmm4
+       /* rsqH1 in xmm6 */
+
+       /* move ixH2-izH2 to xmm3-xmm5 */ 
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+
+       /* calc dr */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+
+       /* store dr */
+       movaps [esp + dxH2], xmm3
+       movaps [esp + dyH2], xmm4
+       movaps [esp + dzH2], xmm5
+       /* square it */
+       mulps xmm3,xmm3
+       mulps xmm4,xmm4
+       mulps xmm5,xmm5
+       addps xmm5, xmm4
+       addps xmm5, xmm3
+       /* rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 */
+
+       /* start with rsqO - seed to xmm2 */    
+       /* Each of the three sequences below refines the rsqrtps seed lu as     */
+       /* half*lu*(three - rsq*lu*lu), stores it as rinv, and then stores      */
+       /* r = rsq*rinv (half/three are the stack slots filled in the prologue; */
+       /* presumably 0.5 and 3.0 as their names suggest).                      */
+       rsqrtps xmm2, xmm7
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm7      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  [esp + rinvO], xmm4     /* rinvO in xmm4 */
+       mulps   xmm7, xmm4      /* rO = rsqO*rinvO */
+       movaps  [esp + rO], xmm7        
+
+       /* rsqH1 - seed in xmm2 */
+       rsqrtps xmm2, xmm6
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm6      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  [esp + rinvH1], xmm4    /* rinvH1 in xmm4 */
+       mulps   xmm6, xmm4      /* rH1 = rsqH1*rinvH1 */
+       movaps  [esp + rH1], xmm6
+
+       /* rsqH2 - seed to xmm2 */
+       rsqrtps xmm2, xmm5
+       movaps  xmm3, xmm2
+       mulps   xmm2, xmm2
+       movaps  xmm4, [esp + three]
+       mulps   xmm2, xmm5      /* rsq*lu*lu */
+       subps   xmm4, xmm2      /* 3-rsq*lu*lu */
+       mulps   xmm4, xmm3      /* lu*(3-rsq*lu*lu) */
+       mulps   xmm4, [esp + half]
+       movaps  [esp + rinvH2], xmm4    /* rinvH2 in xmm4 */
+       mulps   xmm5, xmm4      /* rH2 = rsqH2*rinvH2 */
+       movaps  [esp + rH2], xmm5
+
+       /* do O interactions */
+       /* rO is still in xmm7 */
+       mulps   xmm7, [esp + tsc]
+       movhlps xmm4, xmm7
+       cvttps2pi mm6, xmm7
+       cvttps2pi mm7, xmm4    /* mm6/mm7 contain lu indices */
+       
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm4, mm7
+        movlhps xmm3, xmm4
+       
+        subps xmm7, xmm3
+
+       movaps xmm1, xmm7       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+               
+        movd mm0, eax   
+        movd mm1, ebx
+        movd mm2, ecx
+        movd mm3, edx
+
+        mov  esi, [ebp + VFtab]
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2]
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqO]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm0, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm0 fijC */
+        /* increment vcoul - then we can get rid of xmm5 */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 
+
+        /* dispersion */
+        movlps xmm5, [esi + eax*4 + 16]
+        movlps xmm7, [esi + ecx*4 + 16]
+        movhps xmm5, [esi + ebx*4 + 16]
+        movhps xmm7, [esi + edx*4 + 16] /* got half dispersion table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+        
+        movlps xmm7, [esi + eax*4 + 24]
+        movlps xmm3, [esi + ecx*4 + 24]
+        movhps xmm7, [esi + ebx*4 + 24]
+        movhps xmm3, [esi + edx*4 + 24] /* other half of dispersion table */
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* dispersion table ready, in xmm4-xmm7 */ 
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+
+        movaps xmm4, [esp + c6]
+        mulps  xmm7, xmm4        /* fijD */
+        mulps  xmm5, xmm4        /* vnb6 */
+        addps  xmm0, xmm7 /* add to fscal */
+
+        /* Update vnbtot directly */
+        addps  xmm5, [esp + vnbtot]
+        movaps [esp + vnbtot], xmm5
+
+        /* repulsion */
+        movlps xmm5, [esi + eax*4 + 32]
+        movlps xmm7, [esi + ecx*4 + 32]
+        movhps xmm5, [esi + ebx*4 + 32]
+        movhps xmm7, [esi + edx*4 + 32] /* got half repulsion table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 40]
+        movlps xmm3, [esi + ecx*4 + 40]
+        movhps xmm7, [esi + ebx*4 + 40]
+        movhps xmm3, [esi + edx*4 + 40] /* other half of repulsion table */
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* repulsion table ready, in xmm4-xmm7 */      
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+
+        movaps xmm4, [esp + c12]
+        mulps  xmm7, xmm4        /* fijD */
+        mulps  xmm5, xmm4        /* vnb12 */
+        addps  xmm7, xmm0 /* add to fscal */
+        addps  xmm5, [esp + vnbtot] /* total nonbonded potential in xmm5 */
+       xorps xmm4, xmm4
+       
+       mulps  xmm7, [esp + rinvO] /* total fscal now in xmm7 */
+
+       mulps  xmm7, [esp + tsc]
+        movaps [esp + vnbtot], xmm5
+       subps  xmm4, xmm7
+
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4       /* tx in xmm0-xmm2 */
+
+       /* update O forces */
+       movaps xmm3, [esp + fixO]
+       movaps xmm4, [esp + fiyO]
+       movaps xmm7, [esp + fizO]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixO], xmm3
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm7
+       /* update j forces with water O */
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* Done with O interactions - now H1! */
+       movaps xmm7, [esp + rH1]
+       mulps   xmm7, [esp + tsc]
+       movhlps xmm4, xmm7
+       cvttps2pi mm6, xmm7
+       cvttps2pi mm7, xmm4    /* mm6/mm7 contain lu indices */
+       
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm4, mm7
+        movlhps xmm3, xmm4
+       
+        subps xmm7, xmm3
+       movaps xmm1, xmm7       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+               
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2]
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm7, xmm0 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm7 fijC */
+        /* increment vcoul */
+       xorps  xmm4, xmm4
+        addps  xmm5, [esp + vctot]
+       mulps  xmm7, [esp + rinvH1]
+        movaps [esp + vctot], xmm5 
+       mulps  xmm7, [esp + tsc]
+       subps xmm4, xmm7
+
+       movaps xmm0, [esp + dxH1]
+       movaps xmm1, [esp + dyH1]
+       movaps xmm2, [esp + dzH1]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+       /* update H1 forces */
+       movaps xmm3, [esp + fixH1]
+       movaps xmm4, [esp + fiyH1]
+       movaps xmm7, [esp + fizH1]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH1], xmm3
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm7
+       /* update j forces with water H1 */
+       addps  xmm0, [esp + fjx]
+       addps  xmm1, [esp + fjy]
+       addps  xmm2, [esp + fjz]
+       movaps [esp + fjx], xmm0
+       movaps [esp + fjy], xmm1
+       movaps [esp + fjz], xmm2
+
+       /* Done with H1, finally we do H2 interactions */
+       movaps xmm7, [esp + rH2]
+       mulps   xmm7, [esp + tsc]
+       movhlps xmm4, xmm7
+       cvttps2pi mm6, xmm7
+       cvttps2pi mm7, xmm4    /* mm6/mm7 contain lu indices */
+       
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm4, mm7
+        movlhps xmm3, xmm4
+       
+        subps xmm7, xmm3
+       movaps xmm1, xmm7       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+               
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2]
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm7, xmm0 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm7 fijC */
+        /* increment vcoul */
+       xorps  xmm4, xmm4
+        addps  xmm5, [esp + vctot]
+       mulps  xmm7, [esp + rinvH2]
+        movaps [esp + vctot], xmm5 
+       mulps  xmm7, [esp + tsc]
+       subps  xmm4, xmm7
+
+       movaps xmm0, [esp + dxH2]
+       movaps xmm1, [esp + dyH2]
+       movaps xmm2, [esp + dzH2]
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4
+
+        movd eax, mm0   
+        movd ebx, mm1
+        movd ecx, mm2
+        movd edx, mm3
+       
+       /* update H2 forces */
+       movaps xmm3, [esp + fixH2]
+       movaps xmm4, [esp + fiyH2]
+       movaps xmm7, [esp + fizH2]
+       addps  xmm3, xmm0
+       addps  xmm4, xmm1
+       addps  xmm7, xmm2
+       movaps [esp + fixH2], xmm3
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm7
+
+       mov edi, [ebp +faction]
+       /* update j forces */
+       addps xmm0, [esp + fjx]
+       addps xmm1, [esp + fjy]
+       addps xmm2, [esp + fjz]
+
+       movlps xmm4, [edi + eax*4]
+       movlps xmm7, [edi + ecx*4]
+       movhps xmm4, [edi + ebx*4]
+       movhps xmm7, [edi + edx*4]
+       
+       movaps xmm3, xmm4
+       shufps xmm3, xmm7, 0b10001000
+       shufps xmm4, xmm7, 0b11011101                         
+       /* xmm3 has fjx, xmm4 has fjy */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       /* unpack them back for storing */
+       movaps xmm7, xmm3
+       unpcklps xmm7, xmm4
+       unpckhps xmm3, xmm4     
+       movlps [edi + eax*4], xmm7
+       movlps [edi + ecx*4], xmm3
+       movhps [edi + ebx*4], xmm7
+       movhps [edi + edx*4], xmm3
+       /* finally z forces */
+       movss  xmm0, [edi + eax*4 + 8]
+       movss  xmm1, [edi + ebx*4 + 8]
+       movss  xmm3, [edi + ecx*4 + 8]
+       movss  xmm4, [edi + edx*4 + 8]
+       subss  xmm0, xmm2
+       shufps xmm2, xmm2, 0b11100101
+       subss  xmm1, xmm2
+       shufps xmm2, xmm2, 0b11101010
+       subss  xmm3, xmm2
+       shufps xmm2, xmm2, 0b11111111
+       subss  xmm4, xmm2
+       movss  [edi + eax*4 + 8], xmm0
+       movss  [edi + ebx*4 + 8], xmm1
+       movss  [edi + ecx*4 + 8], xmm3
+       movss  [edi + edx*4 + 8], xmm4
+       
+       /* should we do one more iteration? */
+       /* innerk holds (remaining j atoms - 4); the counter updates need an */
+       /* explicit dword ptr because a memory/immediate operation carries   */
+       /* no register from which the operand size could be inferred.        */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i3320_odd_inner
+       jmp   .i3320_unroll_loop
+.i3320_odd_inner:
+       add   dword ptr [esp + innerk],  4   /* undo the bias: innerk = atoms left for the single-j loop */
+       jnz   .i3320_odd_loop
+       jmp   .i3320_updateouterdata
+.i3320_odd_loop:
+       /* single-j iteration: fetch one index from the jjnr list */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]                 /* eax = jnr */
+       add   dword ptr [esp + innerjjnr],  4   /* advance pointer by one entry (explicit 32-bit operand size) */
+
+       xorps xmm4, xmm4
+       movss xmm4, [esp + iqO]
+       mov esi, [ebp + charge] 
+       movhps xmm4, [esp + iqH]     
+       movss xmm3, [esi + eax*4]       /* charge in xmm3 */
+       shufps xmm3, xmm3, 0
+       mulps xmm3, xmm4
+       movaps [esp + qqO], xmm3        /* use oxygen qq for storage */
+
+       xorps xmm6, xmm6
+       mov esi, [ebp + type]
+       mov ebx, [esi + eax*4]
+       mov esi, [ebp + nbfp]
+       shl ebx, 1      
+       add ebx, [esp + ntia]
+       movlps xmm6, [esi + ebx*4]
+       movaps xmm7, xmm6
+       shufps xmm6, xmm6, 0b11111100
+       shufps xmm7, xmm7, 0b11111101
+       movaps [esp + c6], xmm6
+       movaps [esp + c12], xmm7
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]  
+       
+       /* move j coords to xmm0-xmm2 */
+       movss xmm0, [esi + eax*4]
+       movss xmm1, [esi + eax*4 + 4]
+       movss xmm2, [esi + eax*4 + 8]
+       shufps xmm0, xmm0, 0
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       
+       movss xmm3, [esp + ixO]
+       movss xmm4, [esp + iyO]
+       movss xmm5, [esp + izO]
+               
+       movlps xmm6, [esp + ixH1]
+       movlps xmm7, [esp + ixH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm3, xmm6
+       movlps xmm6, [esp + iyH1]
+       movlps xmm7, [esp + iyH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm4, xmm6
+       movlps xmm6, [esp + izH1]
+       movlps xmm7, [esp + izH2]
+       unpcklps xmm6, xmm7
+       movlhps xmm5, xmm6
+
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       
+       movaps [esp + dxO], xmm3
+       movaps [esp + dyO], xmm4
+       movaps [esp + dzO], xmm5
+
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       /* rsq in xmm4 */
+
+       rsqrtps xmm5, xmm4
+       /* lookup seed in xmm5 */
+       /* refine the seed lu as half*lu*(three - rsq*lu*lu), then r = rsq*rinv */
+       movaps xmm2, xmm5
+       mulps xmm5, xmm5
+       movaps xmm1, [esp + three]
+       mulps xmm5, xmm4        /* rsq*lu*lu */                 
+       movaps xmm0, [esp + half]
+       subps xmm1, xmm5        /* 3-rsq*lu*lu */
+       mulps xmm1, xmm2        /* lu*(3-rsq*lu*lu) */
+       mulps xmm0, xmm1        /* xmm0=rinv */
+       mulps xmm4, xmm0        /* xmm4=r */
+       movaps [esp + rinvO], xmm0
+       
+       mulps xmm4, [esp + tsc]
+       movhlps xmm7, xmm4
+       cvttps2pi mm6, xmm4
+       cvttps2pi mm7, xmm7    /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm7, mm7
+        movlhps xmm3, xmm7
+
+       subps   xmm4, xmm3      
+       movaps xmm1, xmm4       /* xmm1=eps */
+       movaps xmm2, xmm1
+       mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+       
+        movd mm0, eax   
+        movd mm1, ecx
+        movd mm2, edx
+
+        mov  esi, [ebp + VFtab]
+        movd eax, mm6
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd edx, mm7
+
+        lea   eax, [eax + eax*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2]
+       
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */     
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */       
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm0, [esp + qqO]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm0 /* vcoul=qq*VV */ 
+        mulps  xmm0, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm0 fijC */
+        /* increment vcoul - then we can get rid of xmm5 */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+       
+        /* dispersion */
+        movlps xmm5, [esi + eax*4 + 16]        /* half table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm4, 0b11111100
+        shufps xmm5, xmm5, 0b11111101
+        
+        movlps xmm7, [esi + eax*4 + 24] /* other half of dispersion table */
+        movaps xmm6, xmm7
+        shufps xmm6, xmm6, 0b11111100
+        shufps xmm7, xmm7, 0b11111101
+        /* dispersion table ready, in xmm4-xmm7 */ 
+        /* scalar (low-lane) interpolation: Fp = xmm5 + Geps + Heps2, */
+        /* FF = Fp + Geps + two*Heps2, VV = xmm4 + eps*Fp            */
+        mulss  xmm6, xmm1       /* xmm6=Geps */
+        mulss  xmm7, xmm2       /* xmm7=Heps2 */
+        addss  xmm5, xmm6      /* add Geps into xmm5 */
+        addss  xmm5, xmm7       /* xmm5=Fp */       
+        mulss  xmm7, [esp + two]       /* two*Heps2 */
+        addss  xmm7, xmm6
+        addss  xmm7, xmm5 /* xmm7=FF */
+        mulss  xmm5, xmm1 /* xmm5=eps*Fp */
+        addss  xmm5, xmm4 /* xmm5=VV */
+
+        movaps xmm4, [esp + c6]
+        mulps  xmm7, xmm4        /* fijD */
+        mulps  xmm5, xmm4        /* vnb6 */
+        addps  xmm0, xmm7 /* add to fscal */
+
+        /* Update vnbtot directly */
+        addps  xmm5, [esp + vnbtot]
+        movaps [esp + vnbtot], xmm5
+
+        /* repulsion */
+        movlps xmm5, [esi + eax*4 + 32] /* got half repulsion table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm4, 0b10001000
+        shufps xmm5, xmm5, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 40] /* other half of repulsion table */
+        movaps xmm6, xmm7
+        shufps xmm6, xmm6, 0b10001000
+        shufps xmm7, xmm7, 0b11011101
+        /* repulsion table ready, in xmm4-xmm7 */      
+        mulss  xmm6, xmm1       /* xmm6=Geps */
+        mulss  xmm7, xmm2       /* xmm7=Heps2 */
+        addss  xmm5, xmm6
+        addss  xmm5, xmm7       /* xmm5=Fp */       
+        mulss  xmm7, [esp + two]       /* two*Heps2 */
+        addss  xmm7, xmm6
+        addss  xmm7, xmm5 /* xmm7=FF */
+        mulss  xmm5, xmm1 /* xmm5=eps*Fp */
+        addss  xmm5, xmm4 /* xmm5=VV */
+
+        movaps xmm4, [esp + c12]
+        mulps  xmm7, xmm4        /* fijD */
+        mulps  xmm5, xmm4        /* vnb12 */
+        addps  xmm7, xmm0 /* add to fscal */
+        addps  xmm5, [esp + vnbtot] /* total nonbonded potential in xmm5 */
+
+       xorps  xmm4, xmm4
+        movd eax, mm0   
+        movd ecx, mm1
+        movd edx, mm2  
+               
+       mulps  xmm7, [esp + rinvO] /* total fscal now in xmm7 */
+        movaps [esp + vnbtot], xmm5
+       mulps  xmm7, [esp + tsc]
+       subps xmm4, xmm7
+
+       movaps xmm0, [esp + dxO]
+       movaps xmm1, [esp + dyO]
+       movaps xmm2, [esp + dzO]
+
+       mulps  xmm0, xmm4
+       mulps  xmm1, xmm4
+       mulps  xmm2, xmm4 /* xmm0-xmm2 now contains tx-tz (partial force) */
+       movss  xmm3, [esp + fixO]       
+       movss  xmm4, [esp + fiyO]       
+       movss  xmm5, [esp + fizO]       
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [esp + fixO], xmm3       
+       movss  [esp + fiyO], xmm4       
+       movss  [esp + fizO], xmm5       /* updated the O force now do the H's */
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       shufps xmm3, xmm3, 0b11100110   /* shift right */
+       shufps xmm4, xmm4, 0b11100110
+       shufps xmm5, xmm5, 0b11100110
+       addss  xmm3, [esp + fixH1]
+       addss  xmm4, [esp + fiyH1]
+       addss  xmm5, [esp + fizH1]
+       movss  [esp + fixH1], xmm3      
+       movss  [esp + fiyH1], xmm4      
+       movss  [esp + fizH1], xmm5      /* updated the H1 force */
+
+       mov edi, [ebp + faction]
+       shufps xmm3, xmm3, 0b11100111   /* shift right */
+       shufps xmm4, xmm4, 0b11100111
+       shufps xmm5, xmm5, 0b11100111
+       addss  xmm3, [esp + fixH2]
+       addss  xmm4, [esp + fiyH2]
+       addss  xmm5, [esp + fizH2]
+       movss  [esp + fixH2], xmm3      
+       movss  [esp + fiyH2], xmm4      
+       movss  [esp + fizH2], xmm5      /* updated the H2 force */
+
+       /* the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1 */
+       xorps  xmm5, xmm5
+       movaps xmm3, xmm0
+       movlps xmm6, [edi + eax*4]
+       movss  xmm7, [edi + eax*4 + 8]
+       unpcklps xmm3, xmm1
+       movlhps  xmm3, xmm5     
+       unpckhps xmm0, xmm1             
+       addps    xmm0, xmm3
+       movhlps  xmm3, xmm0     
+       addps    xmm0, xmm3     /* x,y sum in xmm0 */
+
+       movhlps  xmm1, xmm2
+       addss    xmm2, xmm1
+       shufps   xmm1, xmm1, 1 
+       addss    xmm2, xmm1    /* z sum in xmm2 */
+       subps    xmm6, xmm0
+       subss    xmm7, xmm2
+       
+       movlps [edi + eax*4],     xmm6
+       movss  [edi + eax*4 + 8], xmm7
+
+       dec dword ptr [esp + innerk]
+       jz    .i3320_updateouterdata
+       jmp   .i3320_odd_loop
+.i3320_updateouterdata:        /* inner loop done: write back i forces, shift force and energies */
+       mov   ecx, [esp + ii3]          /* ecx = ii3, indexes faction[] below */
+       mov   edi, [ebp + faction]      /* edi = base of faction[] */
+       mov   esi, [ebp + fshift]       /* esi = base of fshift[] */
+       mov   edx, [esp + is3]          /* edx = is3, indexes fshift[] below */
+
+       /* reduce the four lanes of fixO/fiyO/fizO to one value each in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO]
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0              /* upper two lanes -> lower two lanes of the temp */
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 hold the two partial sums */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1            /* lane 1 -> lane 0 */
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* add the O force to faction[ii3 .. ii3+2] */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* start the fshift sum: x,y packed in lanes 0,1 of xmm6, z in lane 0 of xmm7 */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1              /* y goes to lane 2 */
+       shufps  xmm6, xmm6, 0b1000      /* lane 0 = x, lane 1 = y */
+
+       /* reduce the four lanes of fixH1/fiyH1/fizH1 into xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 hold the two partial sums */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* add the H1 force to faction[ii3+3 .. ii3+5] */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* add the H1 force to the fshift sum in xmm6/xmm7 */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      /* lane 0 = x, lane 1 = y */
+       addps   xmm6, xmm0
+
+       /* reduce the four lanes of fixH2/fiyH2/fizH2 into xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* lanes 0 and 1 of xmm0-xmm2 hold the two partial sums */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* add the H2 force to faction[ii3+6 .. ii3+8] */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* add the H2 force to the fshift sum in xmm6/xmm7 */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      /* lane 0 = x, lane 1 = y */
+       addps   xmm6, xmm0
+
+       /* fshift[is3 .. is3+2] += summed O+H1+H2 force (x,y via movlps, z via movss) */
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       mov   edx, [ebp + gid]          /* edx = current pointer into gid[] */
+       mov   edx, [edx]                /* edx = *gid, index into Vc[] and Vnb[] */
+       add   dword ptr [ebp + gid],  4 /* advance gid pointer; explicit size, as gas cannot infer it from mem,imm */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* horizontal sum of the four vctot lanes into lane 0 of xmm7 */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* horizontal sum of the four vnbtot lanes into lane 0 of xmm7 */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+
+       /* finish if last: nri is used as a down-counter of remaining outer iterations */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3320_end                /* taken when the decremented count is zero */
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i3320_outer
+.i3320_end:                    /* common exit: undo the stack frame and return */
+       emms                            /* leave MMX state so x87 code can run after return */
+       mov eax, [esp + salign]         /* eax = alignment padding saved in the salign slot */
+       add esp, eax                    /* drop the alignment padding */
+       add esp, 792                    /* drop the locals; presumably matches this routine's prologue (not visible here) */
+       pop edi                         /* restore saved registers; order presumably mirrors the prologue pushes - confirm */
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave                           /* mov esp, ebp / pop ebp */
+       ret
+       
+
+       
+.globl inl3330_sse
+       .type inl3330_sse,@function
+inl3330_sse:   
+.equ           nri,            8       /* arguments: offsets from ebp after push ebp / mov ebp,esp */
+.equ           iinr,           12
+.equ           jindex,         16
+.equ           jjnr,           20
+.equ           shift,          24
+.equ           shiftvec,       28
+.equ           fshift,         32
+.equ           gid,            36
+.equ           pos,            40              
+.equ           faction,        44
+.equ           charge,         48
+.equ           facel,          52
+.equ           Vc,             56
+.equ           type,           60
+.equ           ntype,          64
+.equ           nbfp,           68      
+.equ           Vnb,            72
+.equ           tabscale,       76
+.equ           VFtab,          80
+       /* stack offsets for local variables, relative to esp */ 
+       /* esp is rounded down to a 16-byte boundary in the prologue so movaps can be used on the 16-byte slots */
+.equ           ixO,              0
+.equ           iyO,             16
+.equ           izO,             32
+.equ           ixH1,            48
+.equ           iyH1,            64
+.equ           izH1,            80
+.equ           ixH2,            96
+.equ           iyH2,           112
+.equ           izH2,           128
+.equ           jxO,            144
+.equ           jyO,            160
+.equ           jzO,            176
+.equ           jxH1,           192
+.equ           jyH1,           208
+.equ           jzH1,           224
+.equ           jxH2,           240
+.equ           jyH2,           256
+.equ           jzH2,           272
+.equ           dxOO,           288
+.equ           dyOO,           304
+.equ           dzOO,           320     
+.equ           dxOH1,          336
+.equ           dyOH1,          352
+.equ           dzOH1,          368     
+.equ           dxOH2,          384
+.equ           dyOH2,          400
+.equ           dzOH2,          416     
+.equ           dxH1O,          432
+.equ           dyH1O,          448
+.equ           dzH1O,          464     
+.equ           dxH1H1,         480
+.equ           dyH1H1,         496
+.equ           dzH1H1,         512     
+.equ           dxH1H2,         528
+.equ           dyH1H2,         544
+.equ           dzH1H2,         560     
+.equ           dxH2O,          576
+.equ           dyH2O,          592
+.equ           dzH2O,          608     
+.equ           dxH2H1,         624
+.equ           dyH2H1,         640
+.equ           dzH2H1,         656     
+.equ           dxH2H2,         672
+.equ           dyH2H2,         688
+.equ           dzH2H2,         704
+.equ           qqOO,           720
+.equ           qqOH,           736
+.equ           qqHH,           752
+.equ           two,            768
+.equ           tsc,            784
+.equ           c6,             800
+.equ           c12,            816              
+.equ           vctot,          832
+.equ           vnbtot,         848
+.equ           fixO,           864
+.equ           fiyO,           880
+.equ           fizO,           896
+.equ           fixH1,          912
+.equ           fiyH1,          928
+.equ           fizH1,          944
+.equ           fixH2,          960
+.equ           fiyH2,          976
+.equ           fizH2,          992
+.equ           fjxO,          1008
+.equ           fjyO,          1024
+.equ           fjzO,          1040
+.equ           fjxH1,         1056
+.equ           fjyH1,         1072
+.equ           fjzH1,         1088
+.equ           fjxH2,         1104
+.equ           fjyH2,         1120
+.equ           fjzH2,         1136
+.equ           half,          1152
+.equ           three,         1168
+.equ           rsqOO,         1184
+.equ           rsqOH1,        1200
+.equ           rsqOH2,        1216
+.equ           rsqH1O,        1232
+.equ           rsqH1H1,       1248
+.equ           rsqH1H2,       1264
+.equ           rsqH2O,        1280
+.equ           rsqH2H1,       1296
+.equ           rsqH2H2,       1312
+.equ           rinvOO,        1328
+.equ           rinvOH1,       1344
+.equ           rinvOH2,       1360
+.equ           rinvH1O,       1376
+.equ           rinvH1H1,      1392
+.equ           rinvH1H2,      1408
+.equ           rinvH2O,       1424
+.equ           rinvH2H1,      1440
+.equ           rinvH2H2,      1456
+.equ           fstmp,         1472     
+.equ           is3,           1488
+.equ           ii3,           1492
+.equ           innerjjnr,     1496
+.equ           innerk,        1500
+.equ           salign,        1504                                                     
+       push ebp
+       mov ebp,esp     
+        push eax
+        push ebx
+        push ecx
+        push edx
+       push esi
+       push edi
+       sub esp, 1508           /* local stack space: last slot is salign at 1504, 4 bytes */
+       mov  eax, esp
+       and  eax, 0xf           /* eax = esp mod 16 */
+       sub esp, eax            /* round esp down to a 16-byte boundary */
+       mov [esp + salign], eax /* remember the padding so the exit path can undo it */
+
+       emms
+
+       movups xmm0, [sse_half]         /* unaligned loads of constants defined outside this chunk */
+       movups xmm1, [sse_two]
+       movups xmm2, [sse_three]
+       movss xmm3, [ebp +tabscale]
+       movaps [esp + half],  xmm0
+       movaps [esp + two],  xmm1
+       movaps [esp + three], xmm2
+       shufps xmm3, xmm3, 0            /* broadcast tabscale to all four lanes */
+       movaps [esp + tsc],  xmm3
+
+       /* assume we have at least one i particle - start directly */
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       mov   edx, [ebp + charge]
+       movss xmm3, [edx + ebx*4]       /* xmm3 = charge[ii] */
+       movss xmm4, xmm3        
+       movss xmm5, [edx + ebx*4 + 4]   /* xmm5 = charge[ii+1] */
+       movss xmm6, [ebp + facel]
+       mulss  xmm3, xmm3               /* charge[ii]^2 */
+       mulss  xmm4, xmm5               /* charge[ii]*charge[ii+1] */
+       mulss  xmm5, xmm5               /* charge[ii+1]^2 */
+       mulss  xmm3, xmm6               /* scale all three products by facel */
+       mulss  xmm4, xmm6
+       mulss  xmm5, xmm6
+       shufps xmm3, xmm3, 0            /* broadcast each product to all four lanes */
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + qqOO], xmm3
+       movaps [esp + qqOH], xmm4
+       movaps [esp + qqHH], xmm5
+               
+       xorps xmm0, xmm0
+       mov   edx, [ebp + type]
+       mov   ecx, [edx + ebx*4]        /* ecx = type[ii] */
+       shl   ecx, 1
+       mov   edx, ecx                  /* edx = 2*type[ii] */
+       imul  ecx, [ebp + ntype]      /* ecx = ntia = 2*ntype*type[ii0] */
+       add   edx, ecx                  /* edx = 2*type[ii]*(ntype+1) */
+       mov   eax, [ebp + nbfp]
+       movlps xmm0, [eax + edx*4]      /* load two consecutive floats from nbfp[] */
+       movaps xmm1, xmm0
+       shufps xmm0, xmm0, 0            /* broadcast first float -> c6 */
+       shufps xmm1, xmm1, 0b01010101   /* broadcast second float -> c12 */
+       movaps [esp + c6], xmm0
+       movaps [esp + c12], xmm1
+
+.i3330_outer:                  /* per i-water setup: shifted coordinates, cleared accumulators, j-list bounds */
+       mov   eax, [ebp + shift]      /* eax = pointer into shift[] */
+       mov   ebx, [eax]                /* ebx=shift[n] */
+       add   dword ptr [ebp + shift],  4  /* advance pointer one step; explicit size, as gas cannot infer it from mem,imm */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx=3*is */
+       mov   [esp + is3],ebx           /* store is3 */
+
+       mov   eax, [ebp + shiftvec]   /* eax = base of shiftvec[] */
+
+       movss xmm0, [eax + ebx*4]       /* xmm0-xmm2 = shift vector x,y,z (lane 0 only) */
+       movss xmm1, [eax + ebx*4 + 4]
+       movss xmm2, [eax + ebx*4 + 8] 
+
+       mov   ecx, [ebp + iinr]       /* ecx = pointer into iinr[] */   
+       add   dword ptr [ebp + iinr],  4   /* advance pointer */
+       mov   ebx, [ecx]                /* ebx =ii */
+
+       lea   ebx, [ebx + ebx*2]        /* ebx = 3*ii=ii3 */
+       mov   eax, [ebp + pos]        /* eax = base of pos[] */ 
+       mov   [esp + ii3], ebx  
+
+       movaps xmm3, xmm0
+       movaps xmm4, xmm1
+       movaps xmm5, xmm2
+       addss xmm3, [eax + ebx*4]       /* shift + pos[ii3 .. ii3+2] (O) */
+       addss xmm4, [eax + ebx*4 + 4]
+       addss xmm5, [eax + ebx*4 + 8]           
+       shufps xmm3, xmm3, 0            /* broadcast to all four lanes */
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixO], xmm3
+       movaps [esp + iyO], xmm4
+       movaps [esp + izO], xmm5
+
+       movss xmm3, xmm0                /* second copy of the shift vector for H2 */
+       movss xmm4, xmm1
+       movss xmm5, xmm2
+       addss xmm0, [eax + ebx*4 + 12]  /* shift + pos[ii3+3 .. ii3+5] (H1) */
+       addss xmm1, [eax + ebx*4 + 16]
+       addss xmm2, [eax + ebx*4 + 20]          
+       addss xmm3, [eax + ebx*4 + 24]  /* shift + pos[ii3+6 .. ii3+8] (H2) */
+       addss xmm4, [eax + ebx*4 + 28]
+       addss xmm5, [eax + ebx*4 + 32]          
+
+       shufps xmm0, xmm0, 0            /* broadcast each coordinate to all four lanes */
+       shufps xmm1, xmm1, 0
+       shufps xmm2, xmm2, 0
+       shufps xmm3, xmm3, 0
+       shufps xmm4, xmm4, 0
+       shufps xmm5, xmm5, 0
+       movaps [esp + ixH1], xmm0
+       movaps [esp + iyH1], xmm1
+       movaps [esp + izH1], xmm2
+       movaps [esp + ixH2], xmm3
+       movaps [esp + iyH2], xmm4
+       movaps [esp + izH2], xmm5
+
+       /* clear vctot, vnbtot and the i force accumulators */
+       xorps xmm4, xmm4
+       movaps [esp + vctot], xmm4
+       movaps [esp + vnbtot], xmm4
+       movaps [esp + fixO], xmm4
+       movaps [esp + fiyO], xmm4
+       movaps [esp + fizO], xmm4
+       movaps [esp + fixH1], xmm4
+       movaps [esp + fiyH1], xmm4
+       movaps [esp + fizH1], xmm4
+       movaps [esp + fixH2], xmm4
+       movaps [esp + fiyH2], xmm4
+       movaps [esp + fizH2], xmm4
+
+       mov   eax, [ebp + jindex]
+       mov   ecx, [eax]                 /* jindex[n] */
+       mov   edx, [eax + 4]             /* jindex[n+1] */
+       add   dword ptr [ebp + jindex],  4 /* advance jindex pointer */
+       sub   edx, ecx                   /* number of innerloop atoms */
+
+       mov   esi, [ebp + pos]
+       mov   edi, [ebp + faction]      
+       mov   eax, [ebp + jjnr]
+       shl   ecx, 2                     /* byte offset of jjnr[nj0] */
+       add   eax, ecx
+       mov   [esp + innerjjnr], eax     /* pointer to jjnr[nj0] */
+       sub   edx,  4                    /* sets the flags tested by jge below; mov does not alter them */
+       mov   [esp + innerk], edx        /* number of innerloop atoms minus 4 */
+       jge   .i3330_unroll_loop         /* at least four j atoms: take the unrolled loop */
+       jmp   .i3330_single_check
+.i3330_unroll_loop:    
+       /* quad-unroll innerloop here */
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+
+       mov   eax, [edx]        
+       mov   ebx, [edx + 4] 
+       mov   ecx, [edx + 8]
+       mov   edx, [edx + 12]             /* eax-edx=jnr1-4 */
+       
+       add   [esp + innerjjnr],  16 /* advance pointer (unrolled 4) */
+
+       mov esi, [ebp + pos]       /* base of pos[] */
+
+       lea   eax, [eax + eax*2]         /* replace jnr with j3 */
+       lea   ebx, [ebx + ebx*2]        
+       lea   ecx, [ecx + ecx*2]         /* replace jnr with j3 */
+       lea   edx, [edx + edx*2]        
+       
+       /* move j coordinates to local temp variables */
+       movlps xmm2, [esi + eax*4]
+       movlps xmm3, [esi + eax*4 + 12]
+       movlps xmm4, [esi + eax*4 + 24]
+
+       movlps xmm5, [esi + ebx*4]
+       movlps xmm6, [esi + ebx*4 + 12]
+       movlps xmm7, [esi + ebx*4 + 24]
+
+       movhps xmm2, [esi + ecx*4]
+       movhps xmm3, [esi + ecx*4 + 12]
+       movhps xmm4, [esi + ecx*4 + 24]
+
+       movhps xmm5, [esi + edx*4]
+       movhps xmm6, [esi + edx*4 + 12]
+       movhps xmm7, [esi + edx*4 + 24]
+
+       /* current state: */    
+       /* xmm2= jxOa  jyOa  jxOc  jyOc */
+       /* xmm3= jxH1a jyH1a jxH1c jyH1c */
+       /* xmm4= jxH2a jyH2a jxH2c jyH2c */
+       /* xmm5= jxOb  jyOb  jxOd  jyOd */
+       /* xmm6= jxH1b jyH1b jxH1d jyH1d */
+       /* xmm7= jxH2b jyH2b jxH2d jyH2d */
+       
+       movaps xmm0, xmm2
+       movaps xmm1, xmm3
+       unpcklps xmm0, xmm5     /* xmm0= jxOa  jxOb  jyOa  jyOb */
+       unpcklps xmm1, xmm6     /* xmm1= jxH1a jxH1b jyH1a jyH1b */
+       unpckhps xmm2, xmm5     /* xmm2= jxOc  jxOd  jyOc  jyOd */
+       unpckhps xmm3, xmm6     /* xmm3= jxH1c jxH1d jyH1c jyH1d  */
+       movaps xmm5, xmm4
+       movaps   xmm6, xmm0
+       unpcklps xmm4, xmm7     /* xmm4= jxH2a jxH2b jyH2a jyH2b */             
+       unpckhps xmm5, xmm7     /* xmm5= jxH2c jxH2d jyH2c jyH2d */
+       movaps   xmm7, xmm1
+       movlhps  xmm0, xmm2     /* xmm0= jxOa  jxOb  jxOc  jxOd  */
+       movaps [esp + jxO], xmm0
+       movhlps  xmm2, xmm6     /* xmm2= jyOa  jyOb  jyOc  jyOd */
+       movaps [esp + jyO], xmm2
+       movlhps  xmm1, xmm3
+       movaps [esp + jxH1], xmm1
+       movhlps  xmm3, xmm7
+       movaps   xmm6, xmm4
+       movaps [esp + jyH1], xmm3
+       movlhps  xmm4, xmm5
+       movaps [esp + jxH2], xmm4
+       movhlps  xmm5, xmm6
+       movaps [esp + jyH2], xmm5
+
+       movss  xmm0, [esi + eax*4 + 8]
+       movss  xmm1, [esi + eax*4 + 20]
+       movss  xmm2, [esi + eax*4 + 32]
+
+       movss  xmm3, [esi + ecx*4 + 8]
+       movss  xmm4, [esi + ecx*4 + 20]
+       movss  xmm5, [esi + ecx*4 + 32]
+
+       movhps xmm0, [esi + ebx*4 + 4]
+       movhps xmm1, [esi + ebx*4 + 16]
+       movhps xmm2, [esi + ebx*4 + 28]
+       
+       movhps xmm3, [esi + edx*4 + 4]
+       movhps xmm4, [esi + edx*4 + 16]
+       movhps xmm5, [esi + edx*4 + 28]
+       
+       shufps xmm0, xmm3, 0b11001100
+       shufps xmm1, xmm4, 0b11001100
+       shufps xmm2, xmm5, 0b11001100
+       movaps [esp + jzO],  xmm0
+       movaps [esp + jzH1],  xmm1
+       movaps [esp + jzH2],  xmm2
+
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixO]
+       movaps xmm4, [esp + iyO]
+       movaps xmm5, [esp + izO]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxOH1], xmm3
+       movaps [esp + dyOH1], xmm4
+       movaps [esp + dzOH1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOO], xmm0
+       movaps [esp + rsqOH1], xmm3
+
+       movaps xmm0, [esp + ixO]
+       movaps xmm1, [esp + iyO]
+       movaps xmm2, [esp + izO]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       subps  xmm3, [esp + jxO]
+       subps  xmm4, [esp + jyO]
+       subps  xmm5, [esp + jzO]
+       movaps [esp + dxOH2], xmm0
+       movaps [esp + dyOH2], xmm1
+       movaps [esp + dzOH2], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1O], xmm3
+       movaps [esp + dyH1O], xmm4
+       movaps [esp + dzH1O], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqOH2], xmm0
+       movaps [esp + rsqH1O], xmm3
+
+       movaps xmm0, [esp + ixH1]
+       movaps xmm1, [esp + iyH1]
+       movaps xmm2, [esp + izH1]
+       movaps xmm3, [esp + ixH1]
+       movaps xmm4, [esp + iyH1]
+       movaps xmm5, [esp + izH1]
+       subps  xmm0, [esp + jxH1]
+       subps  xmm1, [esp + jyH1]
+       subps  xmm2, [esp + jzH1]
+       subps  xmm3, [esp + jxH2]
+       subps  xmm4, [esp + jyH2]
+       subps  xmm5, [esp + jzH2]
+       movaps [esp + dxH1H1], xmm0
+       movaps [esp + dyH1H1], xmm1
+       movaps [esp + dzH1H1], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH1H2], xmm3
+       movaps [esp + dyH1H2], xmm4
+       movaps [esp + dzH1H2], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm3, xmm4
+       addps  xmm3, xmm5
+       movaps [esp + rsqH1H1], xmm0
+       movaps [esp + rsqH1H2], xmm3
+
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       movaps xmm3, [esp + ixH2]
+       movaps xmm4, [esp + iyH2]
+       movaps xmm5, [esp + izH2]
+       subps  xmm0, [esp + jxO]
+       subps  xmm1, [esp + jyO]
+       subps  xmm2, [esp + jzO]
+       subps  xmm3, [esp + jxH1]
+       subps  xmm4, [esp + jyH1]
+       subps  xmm5, [esp + jzH1]
+       movaps [esp + dxH2O], xmm0
+       movaps [esp + dyH2O], xmm1
+       movaps [esp + dzH2O], xmm2
+       mulps  xmm0, xmm0
+       mulps  xmm1, xmm1
+       mulps  xmm2, xmm2
+       movaps [esp + dxH2H1], xmm3
+       movaps [esp + dyH2H1], xmm4
+       movaps [esp + dzH2H1], xmm5
+       mulps  xmm3, xmm3
+       mulps  xmm4, xmm4
+       mulps  xmm5, xmm5
+       addps  xmm0, xmm1
+       addps  xmm0, xmm2
+       addps  xmm4, xmm3
+       addps  xmm4, xmm5
+       movaps [esp + rsqH2O], xmm0
+       movaps [esp + rsqH2H1], xmm4
+
+       movaps xmm0, [esp + ixH2]
+       movaps xmm1, [esp + iyH2]
+       movaps xmm2, [esp + izH2]
+       subps  xmm0, [esp + jxH2]
+       subps  xmm1, [esp + jyH2]
+       subps  xmm2, [esp + jzH2]
+       movaps [esp + dxH2H2], xmm0
+       movaps [esp + dyH2H2], xmm1
+       movaps [esp + dzH2H2], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2
+       movaps [esp + rsqH2H2], xmm0
+               
+       /* start doing invsqrt use rsq values in xmm0, xmm4 */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinvH2H2 */
+       mulps   xmm7, [esp + half] /* rinvH2H1 */
+       movaps  [esp + rinvH2H2], xmm3
+       movaps  [esp + rinvH2H1], xmm7
+               
+       rsqrtps xmm1, [esp + rsqOO]
+       rsqrtps xmm5, [esp + rsqOH1]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOO]
+       mulps   xmm5, [esp + rsqOH1]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvOO], xmm3
+       movaps  [esp + rinvOH1], xmm7
+       
+       rsqrtps xmm1, [esp + rsqOH2]
+       rsqrtps xmm5, [esp + rsqH1O]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqOH2]
+       mulps   xmm5, [esp + rsqH1O]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvOH2], xmm3
+       movaps  [esp + rinvH1O], xmm7
+       
+       rsqrtps xmm1, [esp + rsqH1H1]
+       rsqrtps xmm5, [esp + rsqH1H2]
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, [esp + rsqH1H1]
+       mulps   xmm5, [esp + rsqH1H2]
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] 
+       mulps   xmm7, [esp + half]
+       movaps  [esp + rinvH1H1], xmm3
+       movaps  [esp + rinvH1H2], xmm7
+       
+       rsqrtps xmm1, [esp + rsqH2O]
+       movaps  xmm2, xmm1
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, [esp + rsqH2O]
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2
+       mulps   xmm3, [esp + half] 
+       movaps  [esp + rinvH2O], xmm3
+
+       /* start with OO interaction */
+       movaps xmm0, [esp + rinvOO]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqOO] /* xmm1=r */
+       mulps  xmm1, [esp + tsc]
+               
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+       
+        movd mm0, eax
+        movd mm1, ebx
+        movd mm2, ecx
+        movd mm3, edx
+
+        mov  esi, [ebp + VFtab]
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2]
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOO]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+        /* increment vcoul - then we can get rid of xmm5 */
+        /* update vctot */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 
+
+        /* put scalar force on stack temporarily */
+        movaps [esp + fstmp], xmm3
+
+        /* dispersion */
+        movlps xmm5, [esi + eax*4 + 16]
+        movlps xmm7, [esi + ecx*4 + 16]
+        movhps xmm5, [esi + ebx*4 + 16]
+        movhps xmm7, [esi + edx*4 + 16] /* got half dispersion table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 24]
+        movlps xmm3, [esi + ecx*4 + 24]
+        movhps xmm7, [esi + ebx*4 + 24]
+        movhps xmm3, [esi + edx*4 + 24] /* other half of dispersion table */
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* dispersion table ready, in xmm4-xmm7 */
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+
+        movaps xmm4, [esp + c6]
+        mulps  xmm7, xmm4        /* fijD */
+        mulps  xmm5, xmm4        /* vnb6 */
+        addps  xmm7, [esp + fstmp] /* add to fscal */
+
+        /* put scalar force on stack Update vnbtot directly */
+        addps  xmm5, [esp + vnbtot]
+        movaps [esp + fstmp], xmm7
+        movaps [esp + vnbtot], xmm5
+
+        /* repulsion */
+        movlps xmm5, [esi + eax*4 + 32]
+        movlps xmm7, [esi + ecx*4 + 32]
+        movhps xmm5, [esi + ebx*4 + 32]
+        movhps xmm7, [esi + edx*4 + 32] /* got half repulsion table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + eax*4 + 40]
+        movlps xmm3, [esi + ecx*4 + 40]
+        movhps xmm7, [esi + ebx*4 + 40]
+        movhps xmm3, [esi + edx*4 + 40] /* other half of repulsion table */
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* table ready, in xmm4-xmm7 */
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+ 
+        movaps xmm4, [esp + c12]
+        mulps  xmm7, xmm4 /* fijR */
+        mulps  xmm5, xmm4 /* vnb12 */
+        addps  xmm7, [esp + fstmp] 
+
+        addps  xmm5, [esp + vnbtot]
+        movaps [esp + vnbtot], xmm5
+        xorps  xmm1, xmm1
+
+        mulps xmm7, [esp + tsc]
+        mulps xmm7, xmm0
+        subps  xmm1, xmm7
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1               
+
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3
+       mulps xmm0, [esp + dxOO]
+       mulps xmm1, [esp + dyOO]
+       mulps xmm2, [esp + dzOO]
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO]
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H1 interaction: table-lookup coulomb with charge product qqOH; adds to vctot and fiO, overwrites fjH1 */
+       movaps xmm0, [esp + rinvOH1]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqOH1] /* xmm1=r (rinv*rsq) */
+       mulps  xmm1, [esp + tsc]        /* xmm1=r*tsc, used as table position */
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices (r*tsc truncated to int) */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2     /* xmm3=the four indices converted back to float */
+       subps    xmm1, xmm3     /* xmm1=eps (fractional part of r*tsc) */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2            /* all four indices times 4 */
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7           /* eax,ebx,ecx,edx=indices of lanes 0,1,2,3 */
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2] /* index times 12; the x4 address scale below gives 48 bytes per table point */
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table: floats 0,1 of each point (esi=table base, set outside this excerpt) */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000 /* xmm4=float 0 of the four points */
+        shufps xmm5, xmm7, 0b11011101 /* xmm5=float 1 of the four points */
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table: floats 2,3 of each point */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000 /* xmm6=float 2 (G) of the four points */
+        shufps xmm7, xmm3, 0b11011101 /* xmm7=float 3 (H) of the four points */
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF (Fp+Geps+two*Heps2) */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 /* vctot += vcoul */
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3       /* xmm1=-(fijC*tsc*rinv), the scalar force factor */
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1       /* xmm0,xmm1,xmm2 all hold the scalar force factor */
+       
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3       /* xmm3-xmm5=0, so fjH1 is overwritten rather than accumulated */
+       mulps xmm0, [esp + dxOH1]
+       mulps xmm1, [esp + dyOH1]
+       mulps xmm2, [esp + dzOH1] /* xmm0-xmm2=force vector x,y,z */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2        /* xmm3-xmm5=negated force vector */
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO] /* add the force vector to the running fiO sums */
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* O-H2 interaction: table-lookup coulomb with charge product qqOH; adds to vctot and fiO, overwrites fjH2 */
+       movaps xmm0, [esp + rinvOH2]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqOH2] /* xmm1=r (rinv*rsq) */
+       mulps  xmm1, [esp + tsc]        /* xmm1=r*tsc, used as table position */
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices (r*tsc truncated to int) */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2     /* xmm3=the four indices converted back to float */
+       subps    xmm1, xmm3     /* xmm1=eps (fractional part of r*tsc) */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2            /* all four indices times 4 */
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7           /* eax,ebx,ecx,edx=indices of lanes 0,1,2,3 */
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2] /* index times 12; the x4 address scale below gives 48 bytes per table point */
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table: floats 0,1 of each point (esi=table base, set outside this excerpt) */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000 /* xmm4=float 0 of the four points */
+        shufps xmm5, xmm7, 0b11011101 /* xmm5=float 1 of the four points */
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table: floats 2,3 of each point */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000 /* xmm6=float 2 (G) of the four points */
+        shufps xmm7, xmm3, 0b11011101 /* xmm7=float 3 (H) of the four points */
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF (Fp+Geps+two*Heps2) */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 /* vctot += vcoul */
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3       /* xmm1=-(fijC*tsc*rinv), the scalar force factor */
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1       /* xmm0,xmm1,xmm2 all hold the scalar force factor */
+       
+       xorps xmm3, xmm3
+       movaps xmm4, xmm3
+       movaps xmm5, xmm3       /* xmm3-xmm5=0, so fjH2 is overwritten rather than accumulated */
+       mulps xmm0, [esp + dxOH2]
+       mulps xmm1, [esp + dyOH2]
+       mulps xmm2, [esp + dzOH2] /* xmm0-xmm2=force vector x,y,z */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2        /* xmm3-xmm5=negated force vector */
+       addps xmm0, [esp + fixO]
+       addps xmm1, [esp + fiyO]
+       addps xmm2, [esp + fizO] /* add the force vector to the running fiO sums */
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixO], xmm0
+       movaps [esp + fiyO], xmm1
+       movaps [esp + fizO], xmm2
+
+       /* H1-O interaction: table-lookup coulomb with charge product qqOH; adds to vctot and fiH1, subtracts from fjO */
+       movaps xmm0, [esp + rinvH1O]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH1O] /* xmm1=r (rinv*rsq) */
+       mulps  xmm1, [esp + tsc]        /* xmm1=r*tsc, used as table position */
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices (r*tsc truncated to int) */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2     /* xmm3=the four indices converted back to float */
+       subps    xmm1, xmm3     /* xmm1=eps (fractional part of r*tsc) */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2            /* all four indices times 4 */
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7           /* eax,ebx,ecx,edx=indices of lanes 0,1,2,3 */
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2] /* index times 12; the x4 address scale below gives 48 bytes per table point */
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table: floats 0,1 of each point (esi=table base, set outside this excerpt) */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000 /* xmm4=float 0 of the four points */
+        shufps xmm5, xmm7, 0b11011101 /* xmm5=float 1 of the four points */
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table: floats 2,3 of each point */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000 /* xmm6=float 2 (G) of the four points */
+        shufps xmm7, xmm3, 0b11011101 /* xmm7=float 3 (H) of the four points */
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF (Fp+Geps+two*Heps2) */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 /* vctot += vcoul */
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3       /* xmm1=-(fijC*tsc*rinv), the scalar force factor */
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1       /* xmm0,xmm1,xmm2 all hold the scalar force factor */
+       
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO] /* xmm3-xmm5=fjO as stored so far */
+       mulps xmm0, [esp + dxH1O]
+       mulps xmm1, [esp + dyH1O]
+       mulps xmm2, [esp + dzH1O] /* xmm0-xmm2=force vector x,y,z */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2        /* subtract the force vector from fjO */
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1] /* add the force vector to the running fiH1 sums */
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H1 interaction: table-lookup coulomb with charge product qqHH; adds to vctot and fiH1, subtracts from fjH1 */
+       movaps xmm0, [esp + rinvH1H1]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH1H1] /* xmm1=r (rinv*rsq) */
+       mulps  xmm1, [esp + tsc]        /* xmm1=r*tsc, used as table position */
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices (r*tsc truncated to int) */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2     /* xmm3=the four indices converted back to float */
+       subps    xmm1, xmm3     /* xmm1=eps (fractional part of r*tsc) */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2            /* all four indices times 4 */
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7           /* eax,ebx,ecx,edx=indices of lanes 0,1,2,3 */
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2] /* index times 12; the x4 address scale below gives 48 bytes per table point */
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table: floats 0,1 of each point (esi=table base, set outside this excerpt) */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000 /* xmm4=float 0 of the four points */
+        shufps xmm5, xmm7, 0b11011101 /* xmm5=float 1 of the four points */
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table: floats 2,3 of each point */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000 /* xmm6=float 2 (G) of the four points */
+        shufps xmm7, xmm3, 0b11011101 /* xmm7=float 3 (H) of the four points */
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF (Fp+Geps+two*Heps2) */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 /* vctot += vcoul */
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3       /* xmm1=-(fijC*tsc*rinv), the scalar force factor */
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1       /* xmm0,xmm1,xmm2 all hold the scalar force factor */
+       
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1] /* xmm3-xmm5=fjH1 as stored so far */
+       mulps xmm0, [esp + dxH1H1]
+       mulps xmm1, [esp + dyH1H1]
+       mulps xmm2, [esp + dzH1H1] /* xmm0-xmm2=force vector x,y,z */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2        /* subtract the force vector from fjH1 */
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1] /* add the force vector to the running fiH1 sums */
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H1-H2 interaction: table-lookup coulomb with charge product qqHH; adds to vctot and fiH1, subtracts from fjH2 */
+       movaps xmm0, [esp + rinvH1H2]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH1H2] /* xmm1=r (rinv*rsq) */
+       mulps  xmm1, [esp + tsc] /* xmm1=r*tsc, used as table position */
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices (r*tsc truncated to int) */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2     /* xmm3=the four indices converted back to float */
+       subps    xmm1, xmm3     /* xmm1=eps (fractional part of r*tsc) */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2            /* all four indices times 4 */
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7           /* eax,ebx,ecx,edx=indices of lanes 0,1,2,3 */
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2] /* index times 12; the x4 address scale below gives 48 bytes per table point */
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table: floats 0,1 of each point (esi=table base, set outside this excerpt) */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000 /* xmm4=float 0 of the four points */
+        shufps xmm5, xmm7, 0b11011101 /* xmm5=float 1 of the four points */
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table: floats 2,3 of each point */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000 /* xmm6=float 2 (G) of the four points */
+        shufps xmm7, xmm3, 0b11011101 /* xmm7=float 3 (H) of the four points */
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF (Fp+Geps+two*Heps2) */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 /* vctot += vcoul */
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3       /* xmm1=-(fijC*tsc*rinv), the scalar force factor */
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1       /* xmm0,xmm1,xmm2 all hold the scalar force factor */
+       
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2] /* xmm3-xmm5=fjH2 as stored so far */
+       mulps xmm0, [esp + dxH1H2]
+       mulps xmm1, [esp + dyH1H2]
+       mulps xmm2, [esp + dzH1H2] /* xmm0-xmm2=force vector x,y,z */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2        /* subtract the force vector from fjH2 */
+       addps xmm0, [esp + fixH1]
+       addps xmm1, [esp + fiyH1]
+       addps xmm2, [esp + fizH1] /* add the force vector to the running fiH1 sums */
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH1], xmm0
+       movaps [esp + fiyH1], xmm1
+       movaps [esp + fizH1], xmm2
+
+       /* H2-O interaction: table-lookup coulomb with charge product qqOH; adds to vctot and fiH2, subtracts from fjO */
+       movaps xmm0, [esp + rinvH2O]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH2O] /* xmm1=r (rinv*rsq) */
+       mulps  xmm1, [esp + tsc]        /* xmm1=r*tsc, used as table position */
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices (r*tsc truncated to int) */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2     /* xmm3=the four indices converted back to float */
+       subps    xmm1, xmm3     /* xmm1=eps (fractional part of r*tsc) */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2            /* all four indices times 4 */
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7           /* eax,ebx,ecx,edx=indices of lanes 0,1,2,3 */
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2] /* index times 12; the x4 address scale below gives 48 bytes per table point */
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table: floats 0,1 of each point (esi=table base, set outside this excerpt) */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000 /* xmm4=float 0 of the four points */
+        shufps xmm5, xmm7, 0b11011101 /* xmm5=float 1 of the four points */
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table: floats 2,3 of each point */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000 /* xmm6=float 2 (G) of the four points */
+        shufps xmm7, xmm3, 0b11011101 /* xmm7=float 3 (H) of the four points */
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqOH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF (Fp+Geps+two*Heps2) */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 /* vctot += vcoul */
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3       /* xmm1=-(fijC*tsc*rinv), the scalar force factor */
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1       /* xmm0,xmm1,xmm2 all hold the scalar force factor */
+
+       movaps xmm3, [esp + fjxO]
+       movaps xmm4, [esp + fjyO]
+       movaps xmm5, [esp + fjzO] /* xmm3-xmm5=fjO as stored so far */
+       mulps xmm0, [esp + dxH2O]
+       mulps xmm1, [esp + dyH2O]
+       mulps xmm2, [esp + dzH2O] /* xmm0-xmm2=force vector x,y,z */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2        /* subtract the force vector from fjO */
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2] /* add the force vector to the running fiH2 sums */
+       movaps [esp + fjxO], xmm3
+       movaps [esp + fjyO], xmm4
+       movaps [esp + fjzO], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H1 interaction: table-lookup coulomb with charge product qqHH; adds to vctot and fiH2, subtracts from fjH1 */
+       movaps xmm0, [esp + rinvH2H1]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH2H1] /* xmm1=r (rinv*rsq) */
+       mulps  xmm1, [esp + tsc] /* xmm1=r*tsc, used as table position */
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices (r*tsc truncated to int) */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2     /* xmm3=the four indices converted back to float */
+       subps    xmm1, xmm3     /* xmm1=eps (fractional part of r*tsc) */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2            /* all four indices times 4 */
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7           /* eax,ebx,ecx,edx=indices of lanes 0,1,2,3 */
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2] /* index times 12; the x4 address scale below gives 48 bytes per table point */
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table: floats 0,1 of each point (esi=table base, set outside this excerpt) */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000 /* xmm4=float 0 of the four points */
+        shufps xmm5, xmm7, 0b11011101 /* xmm5=float 1 of the four points */
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table: floats 2,3 of each point */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000 /* xmm6=float 2 (G) of the four points */
+        shufps xmm7, xmm3, 0b11011101 /* xmm7=float 3 (H) of the four points */
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF (Fp+Geps+two*Heps2) */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 /* vctot += vcoul */
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3       /* xmm1=-(fijC*tsc*rinv), the scalar force factor */
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1       /* xmm0,xmm1,xmm2 all hold the scalar force factor */
+       
+       movaps xmm3, [esp + fjxH1]
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1] /* xmm3-xmm5=fjH1 as stored so far */
+       mulps xmm0, [esp + dxH2H1]
+       mulps xmm1, [esp + dyH2H1]
+       mulps xmm2, [esp + dzH2H1] /* xmm0-xmm2=force vector x,y,z */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2        /* subtract the force vector from fjH1 */
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2] /* add the force vector to the running fiH2 sums */
+       movaps [esp + fjxH1], xmm3
+       movaps [esp + fjyH1], xmm4
+       movaps [esp + fjzH1], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       /* H2-H2 interaction: table-lookup coulomb with charge product qqHH; adds to vctot and fiH2, subtracts from fjH2 */
+       movaps xmm0, [esp + rinvH2H2]
+       movaps xmm1, xmm0
+       mulps  xmm1, [esp + rsqH2H2] /* xmm1=r (rinv*rsq) */
+       mulps  xmm1, [esp + tsc]        /* xmm1=r*tsc, used as table position */
+       movhlps xmm2, xmm1
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices (r*tsc truncated to int) */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2     /* xmm3=the four indices converted back to float */
+       subps    xmm1, xmm3     /* xmm1=eps (fractional part of r*tsc) */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2            /* all four indices times 4 */
+
+        movd eax, mm6
+        psrlq mm6, 32
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd ebx, mm6
+        movd edx, mm7           /* eax,ebx,ecx,edx=indices of lanes 0,1,2,3 */
+
+        lea   eax, [eax + eax*2]
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2] /* index times 12; the x4 address scale below gives 48 bytes per table point */
+
+        movlps xmm5, [esi + eax*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm5, [esi + ebx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table: floats 0,1 of each point (esi=table base, set outside this excerpt) */
+
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000 /* xmm4=float 0 of the four points */
+        shufps xmm5, xmm7, 0b11011101 /* xmm5=float 1 of the four points */
+
+        movlps xmm7, [esi + eax*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm7, [esi + ebx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table: floats 2,3 of each point */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000 /* xmm6=float 2 (G) of the four points */
+        shufps xmm7, xmm3, 0b11011101 /* xmm7=float 3 (H) of the four points */
+        /* coulomb table ready, in xmm4-xmm7 */ 
+
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+        movaps xmm3, [esp + qqHH]
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF (Fp+Geps+two*Heps2) */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5 /* vctot += vcoul */
+       xorps  xmm1, xmm1
+       mulps  xmm3,  [esp + tsc]
+       mulps  xmm3, xmm0
+       subps  xmm1, xmm3       /* xmm1=-(fijC*tsc*rinv), the scalar force factor */
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1       /* xmm0,xmm1,xmm2 all hold the scalar force factor */
+       
+       movaps xmm3, [esp + fjxH2]
+       movaps xmm4, [esp + fjyH2]
+       movaps xmm5, [esp + fjzH2] /* xmm3-xmm5=fjH2 as stored so far */
+       mulps xmm0, [esp + dxH2H2]
+       mulps xmm1, [esp + dyH2H2]
+       mulps xmm2, [esp + dzH2H2] /* xmm0-xmm2=force vector x,y,z */
+       subps xmm3, xmm0
+       subps xmm4, xmm1
+       subps xmm5, xmm2        /* subtract the force vector from fjH2 */
+       addps xmm0, [esp + fixH2]
+       addps xmm1, [esp + fiyH2]
+       addps xmm2, [esp + fizH2] /* add the force vector to the running fiH2 sums */
+       movaps [esp + fjxH2], xmm3
+       movaps [esp + fjyH2], xmm4
+       movaps [esp + fjzH2], xmm5
+       movaps [esp + fixH2], xmm0
+       movaps [esp + fiyH2], xmm1
+       movaps [esp + fizH2], xmm2
+
+       mov edi, [ebp +faction] /* edi=faction pointer, base for all stores below */
+
+       movd eax, mm0
+       movd ebx, mm1
+       movd ecx, mm2
+       movd edx, mm3   /* eax-edx held table indices until now; reload from mm0-mm3 (presumably the j offsets saved earlier in the loop, not visible here) */
+       
+       /* Did all interactions - now update j forces */
+       /* 4 j waters with three atoms each - first do a & b j particles */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpcklps xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjxOb  fjyOb */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOb  fjyOb (low half; high half is replaced by the shufps below) */ 
+       unpcklps xmm1, xmm2        /* xmm1= fjzOa  fjxH1a fjzOb  fjxH1b */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpcklps xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjyH1b fjzH1b */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1b fjzH1b (low half) */
+       unpcklps xmm5, xmm6        /* xmm5= fjxH2a fjyH2a fjxH2b fjyH2b */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOa  fjyOa  fjzOa  fjxH1a */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOb  fjyOb  fjzOb  fjxH1b (keeps low half of xmm3, takes high half of xmm1) */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1a fjzH1a fjxH2a fjyH2a */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1b fjzH1b fjxH2b fjyH2b */
+       movups   xmm1, [edi + eax*4]       /* floats 0-3 of water a in faction */
+       movups   xmm2, [edi + eax*4 + 16]  /* floats 4-7 of water a */
+       movups   xmm5, [edi + ebx*4]       /* floats 0-3 of water b */
+       movups   xmm6, [edi + ebx*4 + 16]  /* floats 4-7 of water b */
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + eax*4 + 32]  /* float 8 of water a, updated with fjzH2a */
+       movss    xmm3, [edi + ebx*4 + 32]  /* float 8 of water b, updated with fjzH2b */
+       
+       movaps   xmm4, [esp + fjzH2]       /* lane 0 of xmm4 = fjzH2a */
+       movaps   xmm7, xmm4
+       shufps   xmm7, xmm7, 1             /* lane 0 of xmm7 = fjzH2b */
+       
+       movups   [edi + eax*4],     xmm1
+       movups   [edi + eax*4 + 16],xmm2
+       movups   [edi + ebx*4],     xmm5
+       movups   [edi + ebx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + eax*4 + 32], xmm0
+       movss    [edi + ebx*4 + 32], xmm3       
+
+       /* then do the second pair (c & d) */
+       movaps xmm0, [esp + fjxO] /* xmm0= fjxOa  fjxOb  fjxOc  fjxOd */
+       movaps xmm1, [esp + fjyO] /* xmm1= fjyOa  fjyOb  fjyOc  fjyOd */ 
+       unpckhps xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjxOd  fjyOd */
+       movaps xmm1, [esp + fjzO]
+       movaps xmm2, [esp + fjxH1]
+       movhlps  xmm3, xmm0        /* xmm3= fjxOd  fjyOd (low half) */ 
+       unpckhps xmm1, xmm2        /* xmm1= fjzOc  fjxH1c fjzOd  fjxH1d */
+       movaps xmm4, [esp + fjyH1]
+       movaps xmm5, [esp + fjzH1]
+       unpckhps xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjyH1d fjzH1d */
+       movaps xmm5, [esp + fjxH2]
+       movaps xmm6, [esp + fjyH2]
+       movhlps  xmm7, xmm4        /* xmm7= fjyH1d fjzH1d (low half) */     
+       unpckhps xmm5, xmm6        /* xmm5= fjxH2c fjyH2c fjxH2d fjyH2d */
+       movlhps  xmm0, xmm1        /* xmm0= fjxOc  fjyOc  fjzOc  fjxH1c */
+       shufps   xmm3, xmm1, 0b11100100
+                                   /* xmm3= fjxOd  fjyOd fjzOd  fjxH1d */
+       movlhps  xmm4, xmm5        /* xmm4= fjyH1c fjzH1c fjxH2c fjyH2c  */
+       shufps   xmm7, xmm5, 0b11100100
+                                   /* xmm7= fjyH1d fjzH1d fjxH2d fjyH2d */
+       movups   xmm1, [edi + ecx*4]       /* floats 0-3 of water c in faction */
+       movups   xmm2, [edi + ecx*4 + 16]  /* floats 4-7 of water c */
+       movups   xmm5, [edi + edx*4]       /* floats 0-3 of water d */
+       movups   xmm6, [edi + edx*4 + 16]  /* floats 4-7 of water d */
+       addps    xmm1, xmm0
+       addps    xmm2, xmm4
+       addps    xmm5, xmm3
+       addps    xmm6, xmm7
+       movss    xmm0, [edi + ecx*4 + 32]  /* float 8 of water c, updated with fjzH2c */
+       movss    xmm3, [edi + edx*4 + 32]  /* float 8 of water d, updated with fjzH2d */
+       
+       movaps   xmm4, [esp + fjzH2]
+       movaps   xmm7, xmm4
+       shufps   xmm4, xmm4, 0b10          /* lane 0 of xmm4 = fjzH2c */
+       shufps   xmm7, xmm7, 0b11          /* lane 0 of xmm7 = fjzH2d */
+       movups   [edi + ecx*4],     xmm1
+       movups   [edi + ecx*4 + 16],xmm2
+       movups   [edi + edx*4],     xmm5
+       movups   [edi + edx*4 + 16],xmm6        
+       addss    xmm0, xmm4
+       addss    xmm3, xmm7
+       movss    [edi + ecx*4 + 32], xmm0
+       movss    [edi + edx*4 + 32], xmm3       
+       
+       /* should we do one more iteration? */
+       /* innerk is decremented by 4 per unrolled pass; a negative result means */
+       /* fewer than four j waters were left, so the leftover count is restored */
+       /* and, if non-zero, handled one water at a time by the single loop. */
+       /* The operand size is spelled out because a memory destination with an */
+       /* immediate source has no register to infer it from; innerk is assumed */
+       /* to be a 32-bit stack slot - TODO confirm against its definition. */
+       sub   dword ptr [esp + innerk],  4
+       jl    .i3330_single_check
+       jmp   .i3330_unroll_loop
+.i3330_single_check:
+       add   dword ptr [esp + innerk],  4       /* undo the last subtraction */
+       jnz   .i3330_single_loop
+       jmp   .i3330_updateouterdata
+.i3330_single_loop:
+       mov   edx, [esp + innerjjnr]     /* pointer to jjnr[k] */
+       mov   eax, [edx]        
+       add   [esp + innerjjnr],  4     
+
+       mov esi, [ebp + pos]
+       lea   eax, [eax + eax*2]  
+
+       /* fetch j coordinates */
+       xorps xmm3, xmm3
+       xorps xmm4, xmm4
+       xorps xmm5, xmm5
+       movss xmm3, [esi + eax*4]
+       movss xmm4, [esi + eax*4 + 4]
+       movss xmm5, [esi + eax*4 + 8]
+
+       movlps xmm6, [esi + eax*4 + 12]
+       movhps xmm6, [esi + eax*4 + 24] /* xmm6=jxH1 jyH1 jxH2 jyH2 */
+       /* fetch both z coords in one go, to positions 0 and 3 in xmm7 */
+       movups xmm7, [esi + eax*4 + 20] /* xmm7=jzH1 jxH2 jyH2 jzH2 */
+       shufps xmm6, xmm6, 0b11011000    /* xmm6=jxH1 jxH2 jyH1 jyH2 */
+       movlhps xmm3, xmm6              /* xmm3= jxO   0  jxH1 jxH2 */
+       movaps  xmm0, [esp + ixO]     
+       movaps  xmm1, [esp + iyO]
+       movaps  xmm2, [esp + izO]       
+       shufps  xmm4, xmm6, 0b11100100 /* xmm4= jyO   0   jyH1 jyH2 */
+       shufps xmm5, xmm7, 0b11000100  /* xmm5= jzO   0   jzH1 jzH2 */
+       /* store all j coordinates in jO */ 
+       movaps [esp + jxO], xmm3
+       movaps [esp + jyO], xmm4
+       movaps [esp + jzO], xmm5
+       subps  xmm0, xmm3
+       subps  xmm1, xmm4
+       subps  xmm2, xmm5
+       movaps [esp + dxOO], xmm0
+       movaps [esp + dyOO], xmm1
+       movaps [esp + dzOO], xmm2
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       addps xmm0, xmm1
+       addps xmm0, xmm2        /* have rsq in xmm0 */
+       
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       movaps  xmm2, xmm1      
+       mulps   xmm1, xmm1
+       movaps  xmm3, [esp + three]
+       mulps   xmm1, xmm0
+       subps   xmm3, xmm1
+       mulps   xmm3, xmm2                                                      
+       mulps   xmm3, [esp + half] /* rinv iO - j water */
+
+       movaps  xmm1, xmm3
+       mulps   xmm1, xmm0      /* xmm1=r */
+       movaps  xmm0, xmm3      /* xmm0=rinv */
+       mulps  xmm1, [esp + tsc]
+       
+       movhlps xmm2, xmm1      
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+        movd ebx, mm6
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd edx, mm7          /* table indices in ebx,ecx,edx */
+
+       mov esi, [ebp + VFtab]
+       
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2]
+       
+        movlps xmm5, [esi + ebx*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + ebx*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+
+       xorps  xmm3, xmm3
+       /* fetch charges to xmm3 (temporary) */
+       movss   xmm3, [esp + qqOO]
+       movhps  xmm3, [esp + qqOH]
+               
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+       
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5
+        /* put scalar force on stack temporarily */
+        movaps [esp + fstmp], xmm3
+
+        /* dispersion */
+       movss  xmm4, [esi + ebx*4 + 16] 
+       movss  xmm5, [esi + ebx*4 + 20] 
+       movss  xmm6, [esi + ebx*4 + 24] 
+       movss  xmm7, [esi + ebx*4 + 28]
+        /* dispersion table ready, in xmm4-xmm7 */
+        mulss  xmm6, xmm1       /* xmm6=Geps */
+        mulss  xmm7, xmm2       /* xmm7=Heps2 */
+        addss  xmm5, xmm6
+        addss  xmm5, xmm7       /* xmm5=Fp */
+        mulss  xmm7, [esp + two]       /* two*Heps2 */
+        addss  xmm7, xmm6
+        addss  xmm7, xmm5 /* xmm7=FF */
+        mulss  xmm5, xmm1 /* xmm5=eps*Fp */
+        addss  xmm5, xmm4 /* xmm5=VV */
+       xorps  xmm4, xmm4
+        movss  xmm4, [esp + c6]
+        mulps  xmm7, xmm4        /* fijD */
+        mulps  xmm5, xmm4        /* vnb6 */
+        addps  xmm7, [esp + fstmp] /* add to fscal */
+
+        /* put scalar force on stack Update vnbtot directly */
+        addps  xmm5, [esp + vnbtot]
+        movaps [esp + fstmp], xmm7
+        movaps [esp + vnbtot], xmm5
+
+        /* repulsion */
+       movss  xmm4, [esi + ebx*4 + 32] 
+       movss  xmm5, [esi + ebx*4 + 36] 
+       movss  xmm6, [esi + ebx*4 + 40] 
+       movss  xmm7, [esi + ebx*4 + 44]
+        /* table ready, in xmm4-xmm7 */
+        mulss  xmm6, xmm1       /* xmm6=Geps */
+        mulss  xmm7, xmm2       /* xmm7=Heps2 */
+        addss  xmm5, xmm6
+        addss  xmm5, xmm7       /* xmm5=Fp */
+        mulss  xmm7, [esp + two]       /* two*Heps2 */
+        addss  xmm7, xmm6
+        addss  xmm7, xmm5 /* xmm7=FF */
+        mulss  xmm5, xmm1 /* xmm5=eps*Fp */
+        addss  xmm5, xmm4 /* xmm5=VV */
+
+       xorps  xmm4, xmm4
+        movss  xmm4, [esp + c12]
+        mulps  xmm7, xmm4 /* fijR */
+        mulps  xmm5, xmm4 /* vnb12 */
+        addps  xmm7, [esp + fstmp] 
+
+        addps  xmm5, [esp + vnbtot]
+        movaps [esp + vnbtot], xmm5
+        xorps  xmm1, xmm1
+
+        mulps xmm7, [esp + tsc]
+        mulps xmm7, xmm0
+        subps  xmm1, xmm7
+
+       movaps xmm0, xmm1
+       movaps xmm2, xmm1               
+
+       mulps   xmm0, [esp + dxOO]
+       mulps   xmm1, [esp + dyOO]
+       mulps   xmm2, [esp + dzOO]
+       /* initial update for j forces */
+       xorps   xmm3, xmm3
+       xorps   xmm4, xmm4
+       xorps   xmm5, xmm5
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixO]
+       addps   xmm1, [esp + fiyO]
+       addps   xmm2, [esp + fizO]
+       movaps  [esp + fixO], xmm0
+       movaps  [esp + fiyO], xmm1
+       movaps  [esp + fizO], xmm2
+
+       
+       /* done with i O Now do i H1 & H2 simultaneously first get i particle coords: */
+       movaps  xmm0, [esp + ixH1]
+       movaps  xmm1, [esp + iyH1]
+       movaps  xmm2, [esp + izH1]      
+       movaps  xmm3, [esp + ixH2] 
+       movaps  xmm4, [esp + iyH2] 
+       movaps  xmm5, [esp + izH2] 
+       subps   xmm0, [esp + jxO]
+       subps   xmm1, [esp + jyO]
+       subps   xmm2, [esp + jzO]
+       subps   xmm3, [esp + jxO]
+       subps   xmm4, [esp + jyO]
+       subps   xmm5, [esp + jzO]
+       movaps [esp + dxH1O], xmm0
+       movaps [esp + dyH1O], xmm1
+       movaps [esp + dzH1O], xmm2
+       movaps [esp + dxH2O], xmm3
+       movaps [esp + dyH2O], xmm4
+       movaps [esp + dzH2O], xmm5
+       mulps xmm0, xmm0
+       mulps xmm1, xmm1
+       mulps xmm2, xmm2
+       mulps xmm3, xmm3
+       mulps xmm4, xmm4
+       mulps xmm5, xmm5
+       addps xmm0, xmm1
+       addps xmm4, xmm3
+       addps xmm0, xmm2        /* have rsqH1 in xmm0 */
+       addps xmm4, xmm5        /* have rsqH2 in xmm4 */
+
+       /* start with H1, save H2 data */
+       movaps [esp + rsqH2O], xmm4
+       
+       /* do invsqrt */
+       rsqrtps xmm1, xmm0
+       rsqrtps xmm5, xmm4
+       movaps  xmm2, xmm1
+       movaps  xmm6, xmm5
+       mulps   xmm1, xmm1
+       mulps   xmm5, xmm5
+       movaps  xmm3, [esp + three]
+       movaps  xmm7, xmm3
+       mulps   xmm1, xmm0
+       mulps   xmm5, xmm4
+       subps   xmm3, xmm1
+       subps   xmm7, xmm5
+       mulps   xmm3, xmm2
+       mulps   xmm7, xmm6
+       mulps   xmm3, [esp + half] /* rinv H1 - j water */
+       mulps   xmm7, [esp + half] /* rinv H2 - j water */ 
+
+       /* start with H1, save H2 data */
+       movaps [esp + rinvH2O], xmm7
+
+       movaps xmm1, xmm3
+       mulps  xmm1, xmm0       /* xmm1=r */
+       movaps xmm0, xmm3       /* xmm0=rinv */
+       mulps  xmm1, [esp + tsc]
+       
+       movhlps xmm2, xmm1      
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+        movd ebx, mm6
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd edx, mm7          /* table indices in ebx,ecx,edx */
+
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2]
+       
+        movlps xmm5, [esi + ebx*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + ebx*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+
+       xorps  xmm3, xmm3
+       /* fetch charges to xmm3 (temporary) */
+       movss   xmm3, [esp + qqOH]
+       movhps  xmm3, [esp + qqHH]
+               
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5     
+
+        xorps  xmm1, xmm1
+
+        mulps xmm3, [esp + tsc]
+        mulps xmm3, xmm0
+        subps  xmm1, xmm3
+       
+       movaps  xmm0, xmm1
+       movaps  xmm2, xmm1
+       mulps   xmm0, [esp + dxH1O]
+       mulps   xmm1, [esp + dyH1O]
+       mulps   xmm2, [esp + dzH1O]
+       /* update forces H1 - j water */
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH1]
+       addps   xmm1, [esp + fiyH1]
+       addps   xmm2, [esp + fizH1]
+       movaps  [esp + fixH1], xmm0
+       movaps  [esp + fiyH1], xmm1
+       movaps  [esp + fizH1], xmm2
+       /* do table for H2 - j water interaction */
+       movaps xmm0, [esp + rinvH2O]
+       movaps xmm1, [esp + rsqH2O]
+       mulps  xmm1, xmm0       /* xmm0=rinv, xmm1=r */
+       mulps  xmm1, [esp + tsc]
+       
+       movhlps xmm2, xmm1      
+        cvttps2pi mm6, xmm1
+        cvttps2pi mm7, xmm2     /* mm6/mm7 contain lu indices */
+        cvtpi2ps xmm3, mm6
+        cvtpi2ps xmm2, mm7
+       movlhps  xmm3, xmm2
+       subps    xmm1, xmm3     /* xmm1=eps */
+        movaps xmm2, xmm1
+        mulps  xmm2, xmm2       /* xmm2=eps2 */
+        pslld mm6, 2
+        pslld mm7, 2
+        movd ebx, mm6
+        movd ecx, mm7
+        psrlq mm7, 32
+        movd edx, mm7          /* table indices in ebx,ecx,edx */
+
+        lea   ebx, [ebx + ebx*2]
+        lea   ecx, [ecx + ecx*2]
+        lea   edx, [edx + edx*2]
+       
+        movlps xmm5, [esi + ebx*4]
+        movlps xmm7, [esi + ecx*4]
+        movhps xmm7, [esi + edx*4] /* got half coulomb table */
+        movaps xmm4, xmm5
+        shufps xmm4, xmm7, 0b10001000
+        shufps xmm5, xmm7, 0b11011101
+
+        movlps xmm7, [esi + ebx*4 + 8]
+        movlps xmm3, [esi + ecx*4 + 8]
+        movhps xmm3, [esi + edx*4 + 8] /* other half of coulomb table */ 
+        movaps xmm6, xmm7
+        shufps xmm6, xmm3, 0b10001000
+        shufps xmm7, xmm3, 0b11011101
+        /* coulomb table ready, in xmm4-xmm7 */ 
+        mulps  xmm6, xmm1       /* xmm6=Geps */
+        mulps  xmm7, xmm2       /* xmm7=Heps2 */
+        addps  xmm5, xmm6
+        addps  xmm5, xmm7       /* xmm5=Fp */
+        mulps  xmm7, [esp + two]       /* two*Heps2 */
+
+       xorps  xmm3, xmm3
+       /* fetch charges to xmm3 (temporary) */
+       movss   xmm3, [esp + qqOH]
+       movhps  xmm3, [esp + qqHH]
+               
+        addps  xmm7, xmm6
+        addps  xmm7, xmm5 /* xmm7=FF */
+        mulps  xmm5, xmm1 /* xmm5=eps*Fp */
+        addps  xmm5, xmm4 /* xmm5=VV */
+        mulps  xmm5, xmm3 /* vcoul=qq*VV */ 
+        mulps  xmm3, xmm7 /* fijC=FF*qq */
+        /* at this point xmm5 contains vcoul and xmm3 fijC */
+        addps  xmm5, [esp + vctot]
+        movaps [esp + vctot], xmm5     
+
+        xorps  xmm1, xmm1
+
+        mulps xmm3, [esp + tsc]
+        mulps xmm3, xmm0
+        subps  xmm1, xmm3
+       
+       movaps  xmm0, xmm1
+       movaps  xmm2, xmm1
+       
+       mulps   xmm0, [esp + dxH2O]
+       mulps   xmm1, [esp + dyH2O]
+       mulps   xmm2, [esp + dzH2O]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       subps   xmm3, xmm0
+       subps   xmm4, xmm1
+       subps   xmm5, xmm2
+       mov     esi, [ebp + faction]
+       movaps  [esp + fjxO], xmm3
+       movaps  [esp + fjyO], xmm4
+       movaps  [esp + fjzO], xmm5
+       addps   xmm0, [esp + fixH2]
+       addps   xmm1, [esp + fiyH2]
+       addps   xmm2, [esp + fizH2]
+       movaps  [esp + fixH2], xmm0
+       movaps  [esp + fiyH2], xmm1
+       movaps  [esp + fizH2], xmm2
+
+       /* update j water forces from local variables */
+       movlps  xmm0, [esi + eax*4]
+       movlps  xmm1, [esi + eax*4 + 12]
+       movhps  xmm1, [esi + eax*4 + 24]
+       movaps  xmm3, [esp + fjxO]
+       movaps  xmm4, [esp + fjyO]
+       movaps  xmm5, [esp + fjzO]
+       movaps  xmm6, xmm5
+       movaps  xmm7, xmm5
+       shufps  xmm6, xmm6, 0b10
+       shufps  xmm7, xmm7, 0b11
+       addss   xmm5, [esi + eax*4 + 8]
+       addss   xmm6, [esi + eax*4 + 20]
+       addss   xmm7, [esi + eax*4 + 32]
+       movss   [esi + eax*4 + 8], xmm5
+       movss   [esi + eax*4 + 20], xmm6
+       movss   [esi + eax*4 + 32], xmm7
+       movaps   xmm5, xmm3
+       unpcklps xmm3, xmm4
+       unpckhps xmm5, xmm4
+       addps    xmm0, xmm3
+       addps    xmm1, xmm5
+       movlps  [esi + eax*4], xmm0 
+       movlps  [esi + eax*4 + 12], xmm1 
+       movhps  [esi + eax*4 + 24], xmm1 
+       
+       dec dword ptr [esp + innerk]
+       jz    .i3330_updateouterdata
+       jmp   .i3330_single_loop
+.i3330_updateouterdata:
+       mov   ecx, [esp + ii3]
+       mov   edi, [ebp + faction]
+       mov   esi, [ebp + fshift]
+       mov   edx, [esp + is3]
+
+       /* accumulate  Oi forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixO]
+       movaps xmm1, [esp + fiyO] 
+       movaps xmm2, [esp + fizO]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4]
+       movss  xmm4, [edi + ecx*4 + 4]
+       movss  xmm5, [edi + ecx*4 + 8]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4],     xmm3
+       movss  [edi + ecx*4 + 4], xmm4
+       movss  [edi + ecx*4 + 8], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       movaps xmm6, xmm0
+       movss xmm7, xmm2
+       movlhps xmm6, xmm1
+       shufps  xmm6, xmm6, 0b1000      
+
+       /* accumulate H1i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH1]
+       movaps xmm1, [esp + fiyH1]
+       movaps xmm2, [esp + fizH1]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 in xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 12]
+       movss  xmm4, [edi + ecx*4 + 16]
+       movss  xmm5, [edi + ecx*4 + 20]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 12], xmm3
+       movss  [edi + ecx*4 + 16], xmm4
+       movss  [edi + ecx*4 + 20], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* accumulate H2i forces in xmm0, xmm1, xmm2 */
+       movaps xmm0, [esp + fixH2]
+       movaps xmm1, [esp + fiyH2]
+       movaps xmm2, [esp + fizH2]
+
+       movhlps xmm3, xmm0
+       movhlps xmm4, xmm1
+       movhlps xmm5, xmm2
+       addps  xmm0, xmm3
+       addps  xmm1, xmm4
+       addps  xmm2, xmm5 /* sum is in 1/2 i xmm0-xmm2 */
+
+       movaps xmm3, xmm0       
+       movaps xmm4, xmm1       
+       movaps xmm5, xmm2       
+
+       shufps xmm3, xmm3, 1
+       shufps xmm4, xmm4, 1
+       shufps xmm5, xmm5, 1
+       addss  xmm0, xmm3
+       addss  xmm1, xmm4
+       addss  xmm2, xmm5       /* xmm0-xmm2 has single force in pos0 */
+
+       /* increment i force */
+       movss  xmm3, [edi + ecx*4 + 24]
+       movss  xmm4, [edi + ecx*4 + 28]
+       movss  xmm5, [edi + ecx*4 + 32]
+       addss  xmm3, xmm0
+       addss  xmm4, xmm1
+       addss  xmm5, xmm2
+       movss  [edi + ecx*4 + 24], xmm3
+       movss  [edi + ecx*4 + 28], xmm4
+       movss  [edi + ecx*4 + 32], xmm5
+
+       /* accumulate force in xmm6/xmm7 for fshift */
+       addss xmm7, xmm2
+       movlhps xmm0, xmm1
+       shufps  xmm0, xmm0, 0b1000      
+       addps   xmm6, xmm0
+
+       /* increment fshift force */ 
+       movlps  xmm3, [esi + edx*4]
+       movss  xmm4, [esi + edx*4 + 8]
+       addps  xmm3, xmm6
+       addss  xmm4, xmm7
+       movlps  [esi + edx*4],    xmm3
+       movss  [esi + edx*4 + 8], xmm4
+
+       /* get group index for i particle */
+       mov   edx, [ebp + gid]      /* get group index for this i particle */
+       mov   edx, [edx]
+       add   [ebp + gid],  4  /* advance pointer */
+
+       /* accumulate total potential energy and update it */
+       movaps xmm7, [esp + vctot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vc]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* accumulate total lj energy and update it */
+       movaps xmm7, [esp + vnbtot]
+       /* accumulate */
+       movhlps xmm6, xmm7
+       addps  xmm7, xmm6       /* pos 0-1 in xmm7 have the sum now */
+       movaps xmm6, xmm7
+       shufps xmm6, xmm6, 1
+       addss  xmm7, xmm6               
+
+       /* add earlier value from mem */
+       mov   eax, [ebp + Vnb]
+       addss xmm7, [eax + edx*4] 
+       /* move back to mem */
+       movss [eax + edx*4], xmm7 
+       
+       /* finish if last */
+       mov   ecx, [ebp + nri]
+       dec ecx
+       jecxz .i3330_end
+       /* not last, iterate once more! */ 
+       mov [ebp + nri], ecx
+       jmp .i3330_outer
+.i3330_end:
+       emms
+       mov eax, [esp + salign]
+       add esp, eax
+       add esp, 1508
+       pop edi
+       pop esi
+        pop edx
+        pop ecx
+        pop ebx
+        pop eax
+       leave
+       ret
+
diff --git a/src/gmxlib/x86_sse.asm b/src/gmxlib/x86_sse.asm

deleted file mode 100644 (file)

index 11d1080..0000000
--- a/src/gmxlib/x86_sse.asm
+++ /dev/null
@@ -1,37643 +0,0 @@
-;;
-;;                This source code is part of
-;;
-;;                 G   R   O   M   A   C   S
-;;
-;;          GROningen MAchine for Chemical Simulations
-;;
-;;                        VERSION 3.0
-;;
-;; Copyright (c) 1991-2001
-;; BIOSON Research Institute, Dept. of Biophysical Chemistry
-;; University of Groningen, The Netherlands
-;;
-;; This program is free software; you can redistribute it and/or
-;; modify it under the terms of the GNU General Public License
-;; as published by the Free Software Foundation; either version 2
-;; of the License, or (at your option) any later version.
-;;
-;; If you want to redistribute modifications, please consider that
-;; scientific software is very special. Version control is crucial -
-;; bugs must be traceable. We will be happy to consider code for
-;; inclusion in the official distribution, but derived work must not
-;; be called official GROMACS. Details are found in the README & COPYING
-;; files - if they are missing, get the official version at www.gromacs.org.
-;;
-;; To help us fund GROMACS development, we humbly ask that you cite
-;; the papers on the package - you can find them in the top README file.
-;;
-;; Do check out http://www.gromacs.org , or mail us at gromacs@gromacs.org .
-;;
-;; And Hey:
-;; GROup of MAchos and Cynical Suckers
-
-; NASM macro set to make interfacing to 32-bit programs easier -*- nasm -*-
-;
-; proc NAME
-;   Opens a procedure: exports NAME, emits the label and the standard
-;   ebp frame setup, and opens a preprocessor context called "proc" in
-;   which the context-local symbols %$arg and %$procname live.
-%imacro proc 1                  ; begin a procedure definition
-; open the "proc" context; it is closed again by the %pop in endproc
-%push proc
-          global %1
-%1:       push ebp
-          mov ebp,esp
-; running offset of the next argument relative to ebp: the first one is at
-; ebp+8, just above the saved ebp and the return address (4 bytes each)
-%assign %$arg 8
-; remember the procedure name so endproc can emit the __end_NAME label
-%define %$procname %1
-%endmacro
-
-
-
-; LABEL arg [SIZE]
-;   Declares one stack argument of the enclosing proc. The label written
-;   in front of the macro call (%00) becomes a constant holding the
-;   argument's offset from ebp, so the argument is read as [ebp + LABEL].
-;   SIZE is the argument size in bytes and defaults to 4.
-%imacro arg 0-1 4               ; used with the argument name as a label
-; LABEL = offset of this argument from ebp
-%00       equ %$arg
-; advance the running offset past this argument
-%assign %$arg %1+%$arg
-%endmacro
-
-
-
-; endproc
-;   Closes a procedure opened with proc: emits the leave/ret epilogue and
-;   an __end_NAME label, then closes the "proc" preprocessor context.
-;   Raises an assembly-time error when used outside a proc context.
-%imacro endproc 0
-; %ifnctx is true when the innermost context is NOT named "proc"
-%ifnctx proc
-%error Mismatched `endproc'/`proc'
-
-%else
-          leave
-          ret
-__end_%$procname:               ; useful for calculating function size
-
-; drop the context together with its %$arg / %$procname symbols
-%pop
-%endif
-%endmacro
-
-
-segment .data
-
-;; Packed single-precision constants: every label names four identical
-;; floats, i.e. one full 128-bit SSE operand. No alignment is requested
-;; for this segment, so the constants have to be fetched with unaligned
-;; loads (movups).
-sse_minushalf:  dd -0.5, -0.5, -0.5, -0.5
-sse_half:       dd  0.5,  0.5,  0.5,  0.5
-sse_two:        dd  2.0,  2.0,  2.0,  2.0
-sse_three:      dd  3.0,  3.0,  3.0,  3.0
-sse_six:        dd  6.0,  6.0,  6.0,  6.0
-sse_twelve:     dd 12.0, 12.0, 12.0, 12.0
-
-
-segment .text
-
-       global  checksse
-;; checksse - executes one trivial SSE instruction (xorps) and returns.
-;; Takes no arguments and returns no value; only xmm0 and the MMX/x87 tag
-;; state are touched.
-;; NOTE(review): presumably the caller catches the illegal-instruction
-;; fault on hardware/OS without SSE support - confirm against the caller.
-checksse:
-       emms                            ; leave MMX state before using SSE
-       xorps   xmm0, xmm0              ; the probe instruction itself
-       emms
-       ret
-
-align 16
-       global vecinvsqrt_sse
-;;---------------------------------------------------------------------
-;; vecinvsqrt_sse - element-wise 1/sqrt(x) over an array of floats
-;;
-;; Stack arguments, read relative to ebp after the frame is set up:
-;;   [ebp +  8]  input pointer  (read with movaps: must be 16-byte aligned)
-;;   [ebp + 12]  output pointer (written with movaps: must be 16-byte aligned)
-;;   [ebp + 16]  number of elements n
-;;
-;; Every element gets one Newton-Raphson refinement of the rsqrtps
-;; estimate lu:
-;;       y = 0.5 * lu * (3 - x*lu*lu)
-;; evaluated here as  (x*lu*lu - 3) * lu * (-0.5).  The final factor
-;; therefore has to be MINUS one half; multiplying by +0.5 would return
-;; -1/sqrt(x).
-;;
-;; Register roles:
-;;   eax = input cursor      ebx = output cursor
-;;   ecx = scratch/counter   edx = n (kept for the tail tests)
-;;   xmm6 = 3.0 (packed)     xmm7 = -0.5 (packed)
-;; eax, ebx, ecx and edx are saved on entry and restored on exit; the
-;; xmm registers used are not preserved.
-;;---------------------------------------------------------------------
-vecinvsqrt_sse:
-       push    ebp
-       mov     ebp, esp
-       push    eax
-       push    ebx
-       push    ecx
-       push    edx
-
-       mov     eax, [ebp + 8]          ; eax = input pointer
-       mov     ebx, [ebp + 12]         ; ebx = output pointer
-       mov     ecx, [ebp + 16]         ; ecx = n
-       mov     edx, ecx                ; edx = n
-       movups  xmm6, [sse_three]       ; constants are not 16-byte aligned
-       movups  xmm7, [sse_minushalf]   ; -0.5, see the formula above
-       shr     ecx, 3                  ; ecx = number of 8-element rounds
-       jz      .iter4
-       emms
-.loop8:
-       movaps  xmm0, [eax]             ; x[0..3]
-       add     eax, byte 16
-       rsqrtps xmm1, xmm0              ; lu ~ 1/sqrt(x)
-       movaps  xmm2, [eax]             ; x[4..7]
-       add     eax, byte 16
-       rsqrtps xmm3, xmm2
-       mulps   xmm0, xmm1
-       mulps   xmm2, xmm3
-       mulps   xmm0, xmm1              ; x*lu*lu
-       mulps   xmm2, xmm3
-       subps   xmm0, xmm6              ; x*lu*lu - 3
-       subps   xmm2, xmm6
-       mulps   xmm0, xmm1              ; lu*(x*lu*lu - 3)
-       mulps   xmm2, xmm3
-       mulps   xmm0, xmm7              ; * (-0.5)  =>  refined 1/sqrt(x)
-       mulps   xmm2, xmm7
-       movaps  [ebx], xmm0
-       add     ebx, byte 16
-       movaps  [ebx], xmm2
-       add     ebx, byte 16
-       dec     ecx
-       jnz     .loop8
-.iter4:
-       ;; four remaining elements?
-       mov     ecx, edx
-       and     ecx, 4
-       jz      .iter2
-       movaps  xmm0, [eax]
-       add     eax, byte 16
-       rsqrtps xmm1, xmm0
-       mulps   xmm0, xmm1
-       mulps   xmm0, xmm1
-       subps   xmm0, xmm6
-       mulps   xmm0, xmm1
-       mulps   xmm0, xmm7
-       movaps  [ebx], xmm0
-       add     ebx, byte 16
-.iter2:
-       ;; two remaining elements? only the low half of the register is
-       ;; loaded and stored; the upper lanes hold leftovers and are ignored
-       mov     ecx, edx
-       and     ecx, 2
-       jz      .iter1
-       movlps  xmm0, [eax]
-       add     eax, byte 8
-       rsqrtps xmm1, xmm0
-       mulps   xmm0, xmm1
-       mulps   xmm0, xmm1
-       subps   xmm0, xmm6
-       mulps   xmm0, xmm1
-       mulps   xmm0, xmm7
-       movlps  [ebx], xmm0
-       add     ebx, byte 8
-.iter1:
-       ;; one last element? same step with the scalar instructions
-       mov     ecx, edx
-       and     ecx, 1
-       jz      .end
-       movss   xmm0, [eax]
-       rsqrtss xmm1, xmm0
-       mulss   xmm0, xmm1
-       mulss   xmm0, xmm1
-       subss   xmm0, xmm6
-       mulss   xmm0, xmm1
-       mulss   xmm0, xmm7
-       movss   [ebx], xmm0
-.end:
-       emms
-       pop     edx
-       pop     ecx
-       pop     ebx
-       pop     eax
-       leave
-       ret
-       
-       
-       global  vecrecip_sse
-;;---------------------------------------------------------------------
-;; vecrecip_sse - element-wise 1/x over an array of floats
-;;
-;; Stack arguments, read relative to ebp after the frame is set up:
-;;   [ebp +  8]  input pointer  (movaps loads: must be 16-byte aligned)
-;;   [ebp + 12]  output pointer (movaps stores: must be 16-byte aligned)
-;;   [ebp + 16]  number of elements n
-;;
-;; Each result is the rcpps estimate lu improved by one Newton-Raphson
-;; step:  y = lu * (2 - x*lu).
-;;
-;; eax/ebx walk the input/output arrays, edx keeps n, ecx is scratch and
-;; xmm6 holds the packed constant 2.0. The four integer registers are
-;; saved and restored; the xmm registers used are not preserved.
-;;---------------------------------------------------------------------
-vecrecip_sse:
-       push    ebp
-       mov     ebp, esp
-       push    eax
-       push    ebx
-       push    ecx
-       push    edx
-
-       mov     eax, [ebp + 8]          ; source cursor
-       mov     ebx, [ebp + 12]         ; destination cursor
-       mov     edx, [ebp + 16]         ; edx = n
-       mov     ecx, edx
-       movups  xmm6, [sse_two]
-       shr     ecx, 3                  ; rounds of eight elements
-       jz      .quad
-       emms
-.octet:
-       ;; two independent groups of four per round
-       movaps  xmm0, [eax]
-       movaps  xmm3, [eax + 16]
-       add     eax, 32
-       rcpps   xmm1, xmm0              ; lu for the first group
-       rcpps   xmm4, xmm3              ; lu for the second group
-       mulps   xmm0, xmm1              ; x*lu
-       mulps   xmm3, xmm4
-       movaps  xmm2, xmm6
-       movaps  xmm5, xmm6
-       subps   xmm2, xmm0              ; 2 - x*lu
-       subps   xmm5, xmm3
-       mulps   xmm2, xmm1              ; lu*(2 - x*lu)
-       mulps   xmm5, xmm4
-       movaps  [ebx], xmm2
-       movaps  [ebx + 16], xmm5
-       add     ebx, 32
-       dec     ecx
-       jnz     .octet
-.quad:
-       ;; bit 2 of n set: one more group of four
-       mov     ecx, edx
-       and     ecx, 4
-       jz      .pair
-       movaps  xmm0, [eax]
-       rcpps   xmm1, xmm0
-       add     eax, 16
-       mulps   xmm0, xmm1
-       movaps  xmm2, xmm6
-       subps   xmm2, xmm0
-       mulps   xmm2, xmm1
-       movaps  [ebx], xmm2
-       add     ebx, 16
-.pair:
-       ;; bit 1 of n set: two elements through the low register half
-       mov     ecx, edx
-       and     ecx, 2
-       jz      .single
-       movlps  xmm0, [eax]
-       rcpps   xmm1, xmm0
-       add     eax, 8
-       mulps   xmm0, xmm1
-       movaps  xmm2, xmm6
-       subps   xmm2, xmm0
-       mulps   xmm2, xmm1
-       movlps  [ebx], xmm2
-       add     ebx, 8
-.single:
-       ;; bit 0 of n set: final element with the scalar instructions
-       mov     ecx, edx
-       and     ecx, 1
-       jz      .done
-       movss   xmm0, [eax]
-       rcpss   xmm1, xmm0
-       mulss   xmm0, xmm1
-       movss   xmm2, xmm6
-       subss   xmm2, xmm0
-       mulss   xmm2, xmm1
-       movss   [ebx], xmm2
-.done:
-       emms
-       pop     edx
-       pop     ecx
-       pop     ebx
-       pop     eax
-       leave
-       ret
-       
-       
-proc inl0100_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg             
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg     
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.dx          equ    48
-.dy          equ    64
-.dz          equ    80
-.two         equ    96         
-.c6          equ   112
-.c12         equ   128
-.six         equ   144
-.twelve      equ   160          
-.vnbtot      equ   176
-.fix         equ   192
-.fiy         equ   208
-.fiz         equ   224
-.half        equ   240
-.three       equ   256
-.is3         equ   272
-.ii3         equ   276
-.ntia       equ   280  
-.innerjjnr   equ   284
-.innerk      equ   288
-.salign             equ   292                                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 296            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_six]
-       movups xmm3, [sse_twelve]
-       movaps [esp + .two], xmm1
-       movaps [esp + .six],  xmm2
-       movaps [esp + .twelve], xmm3
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear vnbtot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .finish_inner
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       mulps xmm3, xmm2
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5
-       subps xmm0, xmm4
-       mulps xmm0, xmm5        ;  xmm0=rinvsq
-       movaps xmm4, xmm0
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner:
-       ;;  unrolled loop is done: check if at least two particles remain
-       add   [esp + .innerk], dword 4   ; add back the 4 subtracted by the loop test just before the jl that lands here
-       mov   edx, [esp + .innerk]       ; presumably the leftover j count (0-3) - the initial bias is set outside this view
-       and   edx, 10b                   ; bit 1 set means a pair is still pending
-       jnz   .dopair
-       jmp   .checksingle
-.dopair:       ; process two j particles at once; their data lives in lanes 0-1
-
-        mov   ecx, [esp + .innerjjnr]   ; ecx = current position in the j index list
-       
-       mov   eax, [ecx]                 ; eax = first j index
-       mov   ebx, [ecx + 4]             ; ebx = second j index
-       add   [esp + .innerjjnr], dword 8 ; consume the two indices
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]           ; ecx = type[j1]
-       mov edx, [esi + edx*4]           ; edx = type[j2]
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1                       ; two floats per nbfp entry
-       shl edx, 1      
-       mov edi, [esp + .ntia]           ; offset stored in the .ntia slot (computed outside this block)
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]       ; xmm6 lanes 0-1 = the two nbfp floats for j1
-       movhps xmm6, [esi + edx*4]       ; xmm6 lanes 2-3 = the two nbfp floats for j2
-       mov edi, [ebp + %$pos]  
-       xorps  xmm7,xmm7
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b         ; lanes 0-1 = first float of each pair (lanes 0 and 2)
-       shufps xmm6, xmm6, 1101b         ; lanes 0-1 = second float of each pair (lanes 1 and 3)
-       movlhps xmm4, xmm7               ; zero lanes 2-3 (xmm7 is all zero)
-       movlhps xmm6, xmm7               ; zero lanes 2-3
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]         ; eax = 3*j1 (three floats per particle)
-       lea   ebx, [ebx + ebx*2]         ; ebx = 3*j2
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]       ; lanes 0-1 = x,y of j1
-       movss xmm2, [edi + eax*4 + 8]    ; lane 0 = z of j1
-       movhps xmm1, [edi + ebx*4]       ; lanes 2-3 = x,y of j2
-       movss xmm0, [edi + ebx*4 + 8]    ; lane 0 = z of j2
-
-
-       movlhps xmm3, xmm7               ; NOTE(review): xmm3 is reloaded from .fix below before any read, so this looks dead
-       
-       shufps xmm2, xmm0, 0b            ; xmm2 = z1,z1,z2,z2
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b     ; xmm2 = z1,z2,z1,z2
-       
-       shufps xmm0, xmm0, 10001000b     ; xmm0 = x1,x2,x1,x2
-       shufps xmm1, xmm1, 11011101b     ; xmm1 = y1,y2,y1,y2
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5                 ; rsq * seed
-       subps xmm0, xmm4                 ; .two - rsq*seed
-       mulps xmm0, xmm5        ;  xmm0=rinvsq (seed refined by one Newton-Raphson step, taking .two as 2.0)
-       movaps xmm4, xmm0                ; keep rinvsq for the final force scaling
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]         ; lanes 2-3 of .c6/.c12 were zeroed above
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1                ; .twelve*vnb12 - .six*vnb6
-       mulps  xmm4, xmm2       ; xmm4=total fscal (the line above times rinvsq)
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's: first j1 from lane 0 (edi = faction base, eax = 3*j1)
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b    ; swap lanes 0 and 1 so the j2 value is in lane 0
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]      ; same update for j2 (ebx = 3*j2)
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle:                          ; reached by fall-through from .dopair or by jump from .finish_inner
-       mov   edx, [esp + .innerk]
-       and   edx, 1b                   ; bit 0 set means one last j particle is pending
-       jnz    .dosingle
-       jmp    .updateouterdata
-.dosingle:                     ; process one j particle; its data lives in lane 0
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]                ; eax = j index (the list pointer is not advanced here)
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]          ; ecx = type[j]
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1                      ; two floats per nbfp entry
-       add ecx, [esp + .ntia]
-       xorps  xmm6, xmm6
-       movlps xmm6, [esi + ecx*4]      ; lanes 0-1 = the two nbfp floats, lanes 2-3 stay zero
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    ; lane 0 = first float, lanes 1-3 = 0
-       shufps xmm6, xmm6, 11111101b    ; lane 0 = second float, lanes 1-3 = 0
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]        ; eax = 3*j
-       
-       ; move coordinates to xmm0-xmm2 (movss from memory fills lane 0 and zeroes lanes 1-3)
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]   
-       
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5                ; rsq * seed
-       subps xmm0, xmm4                ; .two - rsq*seed
-       mulps xmm0, xmm5        ;  xmm0=rinvsq (seed refined by one Newton-Raphson step, taking .two as 2.0)
-       movaps xmm4, xmm0               ; keep rinvsq for the final force scaling
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]        ; only lane 0 of .c6/.c12 is non-zero here
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1               ; .twelve*vnb12 - .six*vnb6
-       mulps  xmm4, xmm2       ; xmm4=total fscal (the line above times rinvsq)
-       
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj: subtract lane 0 of tx-tz from the three force floats of j
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata:              ; fold the per-lane accumulators and write back results for this i particle
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0              ; low half of xmm3 = lanes 2-3 of xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums (lane0+lane2, lane1+lane3) now sit in lanes 0-1 of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b           ; bring lane 1 down to lane 0
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force (three floats at faction + 4*ii3)
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force (three floats at fshift + 4*is3) by the same amount
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; edx = pointer into the gid list (kept in the argument slot)
-       mov   edx, [edx]              ; edx = group index for this i particle
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; horizontal sum of the four lanes, same scheme as for the forces above
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4]       ; Vnb entry selected by the group index
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx                         ; the nri argument slot doubles as the outer loop counter
-       jecxz .end                      ; counter reached zero
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:                          ; epilogue: drop the local frame and restore the saved registers
-       emms                            ; empty the MMX state before returning
-       mov eax, [esp + .salign]        ; presumably the alignment padding stored by the prologue (not visible in this view)
-       add esp, eax
-       add esp, 296                    ; NOTE(review): must equal the prologue's local-frame size, which is outside this view
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-       
-proc inl0110_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg             
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg     
-%$nsatoms       arg                    
-       ;; stack offsets for local variables (relative to esp after the prologue below)
-       ;; esp is rounded down to a 16-byte boundary below, so the 16-byte slots can be accessed with movaps
-.ix         equ     0          ; 16-byte slots: i-particle x/y/z, each broadcast to four lanes
-.iy         equ    16
-.iz          equ    32
-.dx          equ    48         ; i minus j distance components, four lanes each
-.dy          equ    64
-.dz          equ    80
-.two         equ    96         ; copy of the sse_two constant
-.c6          equ   112         ; per-lane parameters fetched from nbfp
-.c12         equ   128
-.six         equ   144         ; copy of the sse_six constant
-.twelve      equ   160         ; copy of the sse_twelve constant
-.vnbtot      equ   176         ; per-lane energy accumulator
-.fix         equ   192         ; per-lane i-force accumulators
-.fiy         equ   208
-.fiz         equ   224
-.half        equ   240         ; NOTE(review): not referenced in the visible part of this routine
-.three       equ   256         ; NOTE(review): not referenced in the visible part of this routine
-.is3         equ   272         ; dword slots from here on: 3*shift index
-.ii3         equ   276         ; 3*i index
-.shX        equ   280          ; shift vector x, y, z (three consecutive floats)
-.shY         equ   284
-.shZ         equ   288
-.ntia       equ   292          ; 2*ntype*type[i]
-.innerjjnr0  equ   296         ; start of the j list for this outer iteration
-.innerjjnr   equ   300         ; running pointer into the j list
-.innerk0     equ   304         ; number of j particles for this outer iteration
-.innerk      equ   308         ; running j counter
-.salign             equ   312                                          ; bytes subtracted from esp to align it
-.nsvdwc      equ   316         ; counts derived from the three nsatoms ints in .outer
-.nscoul      equ   320
-.nsvdw       equ   324
-.solnr      equ   328          ; index of the i atom currently being processed
-       
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 332            ; local stack space (highest slot .solnr = 328, plus 4 bytes)
-       mov  eax, esp
-       and  eax, 0xf                   ; eax = esp modulo 16
-       sub esp, eax                    ; round esp down to a 16-byte boundary
-       mov [esp + .salign], eax        ; remember the padding so it can be undone later
-
-       emms
-
-       movups xmm1, [sse_two]          ; constants are defined outside this view; unaligned loads
-       movups xmm2, [sse_six]
-       movups xmm3, [sse_twelve]
-       movaps [esp + .two], xmm1       ; keep aligned copies in the local frame
-       movaps [esp + .six],  xmm2
-       movaps [esp + .twelve], xmm3
-       ;; assume we have at least one i particle - start directly      
-.outer:                        ; per-iteration setup; list pointers are advanced in place in the argument slots
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movlps xmm0, [eax + ebx*4]      ; getting the shiftvector: x and y
-       movss xmm1, [eax + ebx*4 + 8]   ; and z
-       movlps [esp + .shX], xmm0       ; 8-byte store fills both .shX and .shY
-       movss [esp + .shZ], xmm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]    ; eax = pointer to three consecutive ints
-       add   [ebp + %$nsatoms], dword 12 ; advance past them
-       mov   ecx, [eax]                ; ecx = first int
-       mov   edx, [eax + 4]            ; edx = second int
-       mov   eax, [eax + 8]            ; eax = third int
-       sub   ecx, eax                  ; ecx = first - third
-       sub   eax, edx                  ; eax = third - second
-       
-       mov   [esp + .nsvdwc], edx      ; second int
-       mov   [esp + .nscoul], eax      ; third - second
-       mov   [esp + .nsvdw], ecx       ; first - third
-
-       ; clear vnbtot 
-       xorps xmm4, xmm4
-       movaps [esp + .vnbtot], xmm4
-       mov   [esp + .solnr],  ebx      ; start at atom ii
-               
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2                     ;  byte offset of jjnr[jindex[n]]
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-
-       mov   ecx, [esp + .nsvdwc]
-       cmp   ecx, dword 0
-       jnz   .mno_vdwc                  ; at least one atom to run through the .mno_vdwc loop
-       jmp   .testvdw
-.mno_vdwc:                     ; set up one i atom, then run the whole j list against it
-       mov   ebx,  [esp + .solnr]      ; ebx = index of this i atom
-       inc   dword [esp + .solnr]      ; the next pass takes the following atom
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]       ; edx = type[ii]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx       ; ntia = 2*ntype*type[ii]
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       addss xmm0, [eax + ebx*4]       ; shift vector + position of the i atom
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-       
-       ; clear  i forces
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       shufps xmm0, xmm0, 0b           ; broadcast lane 0 to all four lanes
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx   ; restart at the beginning of the j list
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms minus 4
-       jge   .unroll_vdwc_loop         ; flags are still those of the sub (mov leaves them alone): at least 4 j atoms
-       jmp   .finish_vdwc_inner
-.unroll_vdwc_loop:     
-       ; quad-unroll innerloop here: four j particles per pass, one per SSE lane
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]          ; eax-edx = type[jnr1-4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1                      ; two floats per nbfp entry
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]      ; xmm6 = nbfp float pairs for j1 (low) and j2 (high)
-       movlps xmm7, [esi + ecx*4]      ; xmm7 = nbfp float pairs for j3 (low) and j4 (high)
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b    ; xmm4 = first float of each pair, j1-j4
-       shufps xmm6, xmm7, 11011101b    ; xmm6 = second float of each pair, j1-j4
-       
-       movd  eax, mm0                  ; get the j indices back
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       mulps xmm3, xmm2                 ; NOTE(review): xmm2 and xmm3 are both overwritten below before being read - this product looks unused
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]      ; x,y of j1
-       movlps xmm5, [esi + ecx*4]      ; x,y of j3
-       movss xmm2, [esi + eax*4 + 8]   ; z of j1
-       movss xmm6, [esi + ecx*4 + 8]   ; z of j3
-
-       movhps xmm4, [esi + ebx*4]      ; x,y of j2 into the high half
-       movhps xmm5, [esi + edx*4]      ; x,y of j4 into the high half
-
-       movss xmm0, [esi + ebx*4 + 8]   ; z of j2
-       movss xmm1, [esi + edx*4 + 8]   ; z of j4
-
-       shufps xmm2, xmm0, 0b           ; xmm2 = z1,z1,z2,z2
-       shufps xmm6, xmm1, 0b           ; xmm6 = z3,z3,z4,z4
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b    ; xmm2 = z1,z2,z3,z4
-       
-       shufps xmm0, xmm5, 10001000b    ; xmm0 = x1,x2,x3,x4
-       shufps xmm1, xmm5, 11011101b    ; xmm1 = y1,y2,y3,y4
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5                ; rsq * seed
-       subps xmm0, xmm4                ; .two - rsq*seed
-       mulps xmm0, xmm5        ;  xmm0=rinvsq (seed refined by one Newton-Raphson step, taking .two as 2.0)
-       movaps xmm4, xmm0               ; keep rinvsq for the final force scaling
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1               ; .twelve*vnb12 - .six*vnb6
-       mulps  xmm4, xmm2       ; xmm4=total fscal (the line above times rinvsq)
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]      ; fx,fy of j1
-       movlps xmm6, [edi + ecx*4]      ; fx,fy of j3
-       movhps xmm4, [edi + ebx*4]      ; fx,fy of j2 into the high half
-       movhps xmm6, [edi + edx*4]      ; fx,fy of j4 into the high half
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3/xmm4 contain fjx and fjy for j1-j4 (fjz is handled with scalar ops further down)
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2                ; lane 0 of tz belongs to j1
-       shufps xmm2, xmm2, 11100101b     ; lane 1 (j2) down to lane 0
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b     ; lane 2 (j3) down to lane 0
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b     ; lane 3 (j4) down to lane 0
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_vdwc_inner         ; counter went negative: fewer than four j particles remain
-       jmp   .unroll_vdwc_loop
-.finish_vdwc_inner:
-       ;;  fewer than four j particles are left: check if at least two particles remain
-       add   [esp + .innerk], dword 4   ; undo the 4 subtracted before the branch here; .innerk is now the leftover count
-       mov   edx, [esp + .innerk]
-       and   edx, 10b                   ; bit 1 set means a pair is still pending
-       jnz   .dopair_vdwc
-       jmp   .checksingle_vdwc
-.dopair_vdwc:  ; process two j particles at once; their data lives in lanes 0-1
-
-        mov   ecx, [esp + .innerjjnr]   ; ecx = current position in the j index list
-       
-       mov   eax, [ecx]                 ; eax = first j index
-       mov   ebx, [ecx + 4]             ; ebx = second j index
-       add   [esp + .innerjjnr], dword 8 ; consume the two indices
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]           ; ecx = type[j1]
-       mov edx, [esi + edx*4]           ; edx = type[j2]
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1                       ; two floats per nbfp entry
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]       ; xmm6 lanes 0-1 = the two nbfp floats for j1
-       movhps xmm6, [esi + edx*4]       ; xmm6 lanes 2-3 = the two nbfp floats for j2
-       mov edi, [ebp + %$pos]  
-       xorps  xmm7,xmm7
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b         ; lanes 0-1 = first float of each pair (lanes 0 and 2)
-       shufps xmm6, xmm6, 1101b         ; lanes 0-1 = second float of each pair (lanes 1 and 3)
-       movlhps xmm4, xmm7               ; zero lanes 2-3 (xmm7 is all zero)
-       movlhps xmm6, xmm7               ; zero lanes 2-3
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]         ; eax = 3*j1 (three floats per particle)
-       lea   ebx, [ebx + ebx*2]         ; ebx = 3*j2
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]       ; lanes 0-1 = x,y of j1
-       movss xmm2, [edi + eax*4 + 8]    ; lane 0 = z of j1
-       movhps xmm1, [edi + ebx*4]       ; lanes 2-3 = x,y of j2
-       movss xmm0, [edi + ebx*4 + 8]    ; lane 0 = z of j2
-
-
-       movlhps xmm3, xmm7               ; NOTE(review): xmm3 is reloaded from .fix below before any read, so this looks dead
-       
-       shufps xmm2, xmm0, 0b            ; xmm2 = z1,z1,z2,z2
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b     ; xmm2 = z1,z2,z1,z2
-       
-       shufps xmm0, xmm0, 10001000b     ; xmm0 = x1,x2,x1,x2
-       shufps xmm1, xmm1, 11011101b     ; xmm1 = y1,y2,y1,y2
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5                 ; rsq * seed
-       subps xmm0, xmm4                 ; .two - rsq*seed
-       mulps xmm0, xmm5        ;  xmm0=rinvsq (seed refined by one Newton-Raphson step, taking .two as 2.0)
-       movaps xmm4, xmm0                ; keep rinvsq for the final force scaling
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]         ; lanes 2-3 of .c6/.c12 were zeroed above
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1                ; .twelve*vnb12 - .six*vnb6
-       mulps  xmm4, xmm2       ; xmm4=total fscal (the line above times rinvsq)
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's: first j1 from lane 0 (edi = faction base, eax = 3*j1)
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b    ; swap lanes 0 and 1 so the j2 value is in lane 0
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]      ; same update for j2 (ebx = 3*j2)
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle_vdwc:                             ; reached by fall-through from .dopair_vdwc or by jump from .finish_vdwc_inner
-       mov   edx, [esp + .innerk]
-       and   edx, 1b                   ; bit 0 set means one last j particle is pending
-       jnz    .dosingle_vdwc
-       jmp    .updateouterdata_vdwc
-.dosingle_vdwc:                        ; process one j particle; its data lives in lane 0
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]                ; eax = j index (the list pointer is not advanced here)
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]          ; ecx = type[j]
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1                      ; two floats per nbfp entry
-       add ecx, [esp + .ntia]
-       xorps  xmm6, xmm6
-       movlps xmm6, [esi + ecx*4]      ; lanes 0-1 = the two nbfp floats, lanes 2-3 stay zero
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    ; lane 0 = first float, lanes 1-3 = 0
-       shufps xmm6, xmm6, 11111101b    ; lane 0 = second float, lanes 1-3 = 0
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]        ; eax = 3*j
-       
-       ; move coordinates to xmm0-xmm2 (movss from memory fills lane 0 and zeroes lanes 1-3)
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]   
-       
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5                ; rsq * seed
-       subps xmm0, xmm4                ; .two - rsq*seed
-       mulps xmm0, xmm5        ;  xmm0=rinvsq (seed refined by one Newton-Raphson step, taking .two as 2.0)
-       movaps xmm4, xmm0               ; keep rinvsq for the final force scaling
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]        ; only lane 0 of .c6/.c12 is non-zero here
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1               ; .twelve*vnb12 - .six*vnb6
-       mulps  xmm4, xmm2       ; xmm4=total fscal (the line above times rinvsq)
-       
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj: subtract lane 0 of tx-tz from the three force floats of j
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_vdwc:         ; fold the per-lane force accumulators and write them back for this i atom
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0              ; low half of xmm3 = lanes 2-3 of xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums (lane0+lane2, lane1+lane3) now sit in lanes 0-1 of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b           ; bring lane 1 down to lane 0
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force (three floats at faction + 4*ii3)
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force (three floats at fshift + 4*is3) by the same amount
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ;;  loop back to mno. (.vnbtot is not written out here; it keeps accumulating across atoms)
-       dec dword [esp + .nsvdwc]       ; one fewer atom left for the .mno_vdwc loop
-       jz  .testvdw
-       jmp .mno_vdwc
-.testvdw       ; NOTE(review): unlike the other labels here this one has no trailing colon
-       mov  ebx,  [esp + .nscoul]
-       add  [esp + .solnr], dword ebx  ; skip .nscoul atoms: solnr += nscoul (the dword qualifier on a register looks redundant)
-
-       mov  ecx, [esp + .nsvdw]
-       cmp  ecx, byte 0
-       jnz  .mno_vdw                   ; at least one atom to run through the .mno_vdw loop
-       jmp  .last_mno                  ; target is outside this view
-.mno_vdw:                      ; set up one i atom, then run the whole j list against it
-       mov   ebx,  [esp + .solnr]      ; ebx = index of this i atom
-       inc   dword [esp + .solnr]      ; the next pass takes the following atom
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]       ; edx = type[ii]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx       ; ntia = 2*ntype*type[ii]
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       addss xmm0, [eax + ebx*4]       ; shift vector + position of the i atom
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-       
-       xorps xmm4, xmm4                ; clear the i force accumulators
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       shufps xmm0, xmm0, 0b           ; broadcast lane 0 to all four lanes
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx   ; restart at the beginning of the j list
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms minus 4
-       jge   .unroll_vdw_loop          ; flags are still those of the sub (mov leaves them alone): at least 4 j atoms
-       jmp   .finish_vdw_inner
-.unroll_vdw_loop:      
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       mulps xmm3, xmm2
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5
-       subps xmm0, xmm4
-       mulps xmm0, xmm5        ;  xmm0=rinvsq
-       movaps xmm4, xmm0
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_vdw_inner
-       jmp   .unroll_vdw_loop
-.finish_vdw_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair_vdw
-       jmp   .checksingle_vdw
-.dopair_vdw:   
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       xorps  xmm7,xmm7
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5
-       subps xmm0, xmm4
-       mulps xmm0, xmm5        ;  xmm0=rinvsq
-       movaps xmm4, xmm0
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle_vdw:                              
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_vdw
-       jmp    .updateouterdata_vdw
-.dosingle_vdw:                 
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]                
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       xorps  xmm6, xmm6
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]   
-       
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5
-       subps xmm0, xmm4
-       mulps xmm0, xmm5        ;  xmm0=rinvsq
-       movaps xmm4, xmm0
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_vdw:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-       
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdw]
-       jz  .last_mno
-       jmp .mno_vdw
-       
-.last_mno:     
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 332
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-proc inl0300_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-%$tabscale      arg
-%$VFtab         arg
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.dx          equ    48
-.dy          equ    64
-.dz          equ    80
-.two        equ    96
-.tabscale    equ   112
-.c6          equ   128
-.c12         equ   144
-.fs          equ   160
-.vnbtot      equ   176
-.fix         equ   192
-.fiy         equ   208
-.fiz         equ   224
-.half        equ   240
-.three       equ   256
-.is3         equ   272
-.ii3         equ   276
-.ntia       equ   280  
-.innerjjnr   equ   284
-.innerk      equ   288
-.salign             equ   292                                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 296            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movss xmm3, [ebp + %$tabscale]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two], xmm1
-       movaps [esp + .three],  xmm2
-       shufps xmm3, xmm3, 0b
-       movaps [esp + .tabscale], xmm3
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear tot potential and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .finish_inner
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       movhlps xmm5, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 3
-       pslld mm7, 3
-
-       movd mm0, eax   
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6
-       psrlq mm6, 32
-       movd ecx, mm7
-       psrlq mm7, 32
-       movd ebx, mm6
-       movd edx, mm7
-
-       ; dispersion
-       movlps xmm5, [esi + eax*4 + 0]
-       movlps xmm7, [esi + ecx*4 + 0]
-       movhps xmm5, [esi + ebx*4 + 0]
-       movhps xmm7, [esi + edx*4 + 0] ; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-       
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + eax*4 + 16]
-       movlps xmm7, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + ebx*4 + 16]
-       movhps xmm7, [esi + edx*4 + 16] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 24]
-       movlps xmm3, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + ebx*4 + 24]
-       movhps xmm3, [esi + edx*4 + 24] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0   
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair
-       jmp   .checksingle
-.dopair:       
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       
-       xorps xmm7, xmm7
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 3
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6
-       psrlq mm6, 32
-       movd edx, mm6
-
-       ; dispersion
-       movlps xmm5, [esi + ecx*4 + 0]
-       movhps xmm5, [esi + edx*4 + 0]; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm5, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 16] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + edx*4 + 24] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle:                          
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle
-       jmp    .updateouterdata
-.dosingle:
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 3
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6
-       
-       ; dispersion
-       movlps xmm4, [esi + ebx*4 + 0]
-       movlps xmm6, [esi + ebx*4 + 8]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm4, [esi + ebx*4 + 16]
-       movlps xmm6, [esi + ebx*4 + 24]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 296
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-       
-proc inl0310_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg             
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg     
-%$tabscale      arg
-%$VFtab         arg
-%$nsatoms       arg
-       ;; stack offsets for local variables
-        ;; bottom of stack is cache-aligned for sse use
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.dx          equ    48
-.dy          equ    64
-.dz          equ    80
-.two         equ    96   
-.tabscale    equ   112
-.c6          equ   128
-.c12         equ   144
-.fs          equ   160
-.vnbtot      equ   176
-.fix         equ   192
-.fiy         equ   208
-.fiz         equ   224
-.half        equ   240
-.three       equ   256
-.is3         equ   272
-.ii3         equ   276
-.shX        equ   280
-.shY         equ   284
-.shZ         equ   288
-.ntia       equ   292  
-.innerjjnr0  equ   296
-.innerjjnr   equ   300
-.innerk0     equ   304
-.innerk      equ   308
-.salign             equ   312                                          
-.nsvdwc      equ   316
-.nscoul      equ   320
-.nsvdw       equ   324
-.solnr      equ   328
-        push eax      
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 332            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movss xmm3, [ebp + %$tabscale]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two], xmm1
-       movaps [esp + .three], xmm2
-       shufps xmm3, xmm3, 0b
-       movaps [esp + .tabscale], xmm3
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movlps xmm0, [eax + ebx*4]      ; getting the shiftvector
-       movss xmm1, [eax + ebx*4 + 8] 
-       movlps [esp + .shX], xmm0
-       movss [esp + .shZ], xmm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       add   [ebp + %$nsatoms], dword 12
-       mov   ecx, [eax]        
-       mov   edx, [eax + 4]
-       mov   eax, [eax + 8]    
-       sub   ecx, eax
-       sub   eax, edx
-       
-       mov   [esp + .nsvdwc], edx
-       mov   [esp + .nscoul], eax
-       mov   [esp + .nsvdw], ecx
-
-       ; clear vnbtot 
-       xorps xmm4, xmm4
-       movaps [esp + .vnbtot], xmm4
-       mov   [esp + .solnr],  ebx
-               
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-
-       mov   ecx, [esp + .nsvdwc]
-       cmp   ecx, dword 0
-       jnz   .mno_vdwc
-       jmp   .testvdw
-.mno_vdwc:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-       
-       ; clear  i forces
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdwc_loop
-       jmp   .finish_vdwc_inner
-.unroll_vdwc_loop:     
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       movhlps xmm5, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 3
-       pslld mm7, 3
-
-       movd mm0, eax   
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6
-       psrlq mm6, 32
-       movd ecx, mm7
-       psrlq mm7, 32
-       movd ebx, mm6
-       movd edx, mm7
-
-       ; dispersion
-       movlps xmm5, [esi + eax*4 + 0]
-       movlps xmm7, [esi + ecx*4 + 0]
-       movhps xmm5, [esi + ebx*4 + 0]
-       movhps xmm7, [esi + edx*4 + 0] ; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-       
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + eax*4 + 16]
-       movlps xmm7, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + ebx*4 + 16]
-       movhps xmm7, [esi + edx*4 + 16] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 24]
-       movlps xmm3, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + ebx*4 + 24]
-       movhps xmm3, [esi + edx*4 + 24] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0   
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_vdwc_inner
-       jmp   .unroll_vdwc_loop
-.finish_vdwc_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair_vdwc
-       jmp   .checksingle_vdwc
-.dopair_vdwc:  
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       
-       xorps xmm7, xmm7
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 3
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6
-       psrlq mm6, 32
-       movd edx, mm6
-
-       ; dispersion
-       movlps xmm5, [esi + ecx*4 + 0]
-       movhps xmm5, [esi + edx*4 + 0]; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm5, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 16] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm5, 11011101b 
-
-       movlps xmm7, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + edx*4 + 24] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle_vdwc:                             
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_vdwc
-       jmp    .updateouterdata_vdwc
-.dosingle_vdwc:
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 3
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6
-       
-       ; dispersion
-       movlps xmm4, [esi + ebx*4 + 0]
-       movlps xmm6, [esi + ebx*4 + 8]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm4, [esi + ebx*4 + 16]
-       movlps xmm6, [esi + ebx*4 + 24]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_vdwc:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdwc]
-       jz  .testvdw
-       jmp .mno_vdwc
-.testvdw
-       mov  ebx,  [esp + .nscoul]
-       add  [esp + .solnr], dword ebx
-
-       mov  ecx, [esp + .nsvdw]
-       cmp  ecx, byte 0
-       jnz  .mno_vdw
-       jmp  .last_mno
-.mno_vdw:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-       
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdw_loop
-       jmp   .finish_vdw_inner
-.unroll_vdw_loop:
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       movhlps xmm5, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 3
-       pslld mm7, 3
-
-       movd mm0, eax   
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6
-       psrlq mm6, 32
-       movd ecx, mm7
-       psrlq mm7, 32
-       movd ebx, mm6
-       movd edx, mm7
-
-       ; dispersion
-       movlps xmm5, [esi + eax*4 + 0]
-       movlps xmm7, [esi + ecx*4 + 0]
-       movhps xmm5, [esi + ebx*4 + 0]
-       movhps xmm7, [esi + edx*4 + 0] ; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-       
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + eax*4 + 16]
-       movlps xmm7, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + ebx*4 + 16]
-       movhps xmm7, [esi + edx*4 + 16] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 24]
-       movlps xmm3, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + ebx*4 + 24]
-       movhps xmm3, [esi + edx*4 + 24] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0   
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_vdw_inner
-       jmp   .unroll_vdw_loop
-.finish_vdw_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair_vdw
-       jmp   .checksingle_vdw
-.dopair_vdw:   
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       
-       xorps xmm7, xmm7
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 3
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6
-       psrlq mm6, 32
-       movd edx, mm6
-
-       ; dispersion
-       movlps xmm5, [esi + ecx*4 + 0]
-       movhps xmm5, [esi + edx*4 + 0]; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm5, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 16] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm5, 11011101b 
-
-       movlps xmm7, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + edx*4 + 24] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle_vdw:                              
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_vdw
-       jmp    .updateouterdata_vdw
-.dosingle_vdw:
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 3
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6
-       
-       ; dispersion
-       movlps xmm4, [esi + ebx*4 + 0]
-       movlps xmm6, [esi + ebx*4 + 8]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm4, [esi + ebx*4 + 16]
-       movlps xmm6, [esi + ebx*4 + 24]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5         
-.updateouterdata_vdw:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-       
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdw]
-       jz  .last_mno
-       jmp .mno_vdw    
-.last_mno:     
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 332
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-proc inl1000_sse               ; Coulomb-only pair kernel using SSE: j loop unrolled 4x, then a 2-atom and a 1-atom tail
-%$nri          arg             ; number of outer (i particle) iterations; counted down in place
-%$iinr         arg             ; pointer into iinr[] (i particle indices); advanced 4 bytes per outer pass
-%$jindex       arg             ; pointer into jindex[]; entries [n] and [n+1] bound the j range
-%$jjnr         arg             ; base of jjnr[] (j particle indices)
-%$shift                arg     ; pointer into shift[] (one shift index per i particle)
-%$shiftvec     arg             ; base of shiftvec[] (x,y,z float triplets)
-%$fshift       arg             ; base of fshift[] (x,y,z float triplets, accumulated into)
-%$gid          arg             ; pointer into gid[] (one group index per i particle)
-%$pos          arg             ; base of pos[] (x,y,z float triplets)
-%$faction      arg             ; base of faction[] (x,y,z float triplets, updated in place)
-%$charge       arg             ; base of charge[] (one float per particle)
-%$facel                arg     ; scalar float, read directly from its argument slot
-%$Vc           arg             ; base of Vc[] (floats, indexed by group index)
-       ;; stack offsets for local variables (relative to esp after the realignment below)
-       ;; esp is rounded down to a 16-byte boundary, so the 16-byte slots can be accessed with movaps
-.ix         equ     0          ; i particle x (shift added), same value in all 4 lanes
-.iy         equ    16          ; i particle y, likewise
-.iz          equ    32         ; i particle z, likewise
-.iq          equ    48         ; facel*charge[ii] in all 4 lanes
-.dx          equ    64         ; scratch: ix - jx for the current j atoms
-.dy          equ    80         ; scratch: iy - jy
-.dz          equ    96         ; scratch: iz - jz
-.vctot       equ   112         ; per-lane running sum of vcoul for this i particle
-.fix         equ   128         ; per-lane running i force, x
-.fiy         equ   144         ; per-lane running i force, y
-.fiz         equ   160         ; per-lane running i force, z
-.half        equ   176         ; aligned copy of sse_half
-.three       equ   192         ; aligned copy of sse_three
-.is3         equ   208         ; 3*shift index (dword)
-.ii3         equ   212         ; 3*i particle index (dword)
-.innerjjnr   equ   216         ; pointer to the next unread jjnr[] entry (dword)
-.innerk      equ   220         ; remaining-j counter (dword)
-.salign             equ   224  ; bytes subtracted from esp to align it, undone at .end (dword)
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 228            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf           ; eax = esp mod 16
-       sub esp, eax            ; esp is now 16-byte aligned
-       mov [esp + .salign], eax ; remember the adjustment so .end can undo it
-
-       emms
-
-       movups xmm0, [sse_half]         ; sse_half/sse_three are defined outside this block;
-       movups xmm1, [sse_three]        ; used below as the 0.5 and 3.0 factors of the rsqrt refinement
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]       ; xmm0 = shift x
-       movss xmm1, [eax + ebx*4 + 4]   ; xmm1 = shift y
-       movss xmm2, [eax + ebx*4 + 8]   ; xmm2 = shift z
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]     ; edx = base of charge[]
-       movss xmm3, [edx + ebx*4]       ; xmm3 = charge[ii]
-       mulss xmm3, [ebp + %$facel]     ; xmm3 = facel*charge[ii]
-       shufps xmm3, xmm3, 0b           ; copy lane 0 to all four lanes
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]       ; add the i position to the shift vector
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b           ; copy lane 0 to all four lanes
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]        ; esi and edi are loaded again inside every section below
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2                    ; byte offset of jjnr[nj0]
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  innerk = number of innerloop atoms - 4
-       jge   .unroll_loop                ; flags still come from the sub (mov leaves them alone): at least 4 atoms
-       jmp   .finish_inner
-.unroll_loop:  
-       ;; quad-unrolled innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]       ; q1
-       movss xmm4, [esi + ecx*4]       ; q3
-       movss xmm6, [esi + ebx*4]       ; q2
-       movss xmm7, [esi + edx*4]       ; q4
-
-       movaps xmm5, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b    ; q1 q1 q2 q2
-       shufps xmm4, xmm7, 00000000b    ; q3 q3 q4 q4
-       shufps xmm3, xmm4, 10001000b    ; q1 q2 q3 q4
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       mulps xmm3, xmm5                 ; xmm3 = iq*jq for the four pairs
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]      ;x1 y1 - -
-       movlps xmm5, [esi + ecx*4]      ;x3 y3 - -
-       movss xmm2, [esi + eax*4 + 8]   ;z1 -  - -
-       movss xmm6, [esi + ecx*4 + 8]   ;z3 -  - -
-
-       movhps xmm4, [esi + ebx*4]      ;x1 y1 x2 y2
-       movhps xmm5, [esi + edx*4]      ;x3 y3 x4 y4
-
-       movss xmm0, [esi + ebx*4 + 8]   ;z2 - - -
-       movss xmm1, [esi + edx*4 + 8]   ;z4 - - -
-
-       shufps xmm2, xmm0, 0b           ;z1 z1 z2 z2
-       shufps xmm6, xmm1, 0b           ;z3 z3 z4 z4
-       
-       movaps xmm0, xmm4               ;x1 y1 x2 y2    
-       movaps xmm1, xmm4               ;x1 y1 x2 y2
-
-       shufps xmm2, xmm6, 10001000b    ;z1 z2 z3 z4
-       
-       shufps xmm0, xmm5, 10001000b    ;x1 x2 x3 x4
-       shufps xmm1, xmm5, 11011101b    ;y1 y2 y3 y4            
-
-       mov    edi, [ebp + %$faction]   ; edi = base of faction[] for the fj update below
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5; refined below as rinv = half*lu*(three - rsq*lu*lu)
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-
-       movaps xmm5, [esp + .vctot]
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm4, xmm3       ; xmm4=fscal
-       addps  xmm5, xmm3       ; vctot += vcoul (per lane)
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]      ; fx1 fy1 - -
-       movlps xmm6, [edi + ecx*4]      ; fx3 fy3 - -
-       movhps xmm4, [edi + ebx*4]      ; fx1 fy1 fx2 fy2
-       movhps xmm6, [edi + edx*4]      ; fx3 fy3 fx4 fy4
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b    ; fx1 fx2 fx3 fx4
-       shufps xmm4, xmm6, 11011101b    ; fy1 fy2 fy3 fy4
-
-       ; now xmm3/xmm4 contain fjx and fjy (the z forces are handled separately below)
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2               ; fz1 -= tz1 (lane 0)
-       shufps xmm2, xmm2, 11100101b    ; bring tz2 into lane 0
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b    ; bring tz3 into lane 0
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b    ; bring tz4 into lane 0
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4    ; undo the -4 bias: innerk = atoms left (0..3)
-       mov   edx, [esp + .innerk]
-       and   edx, 10b                    ; bit 1 set means 2 or 3 atoms are left
-       jnz   .dopair
-       jmp   .checksingle
-.dopair:       
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8 ; two j indices consumed
-
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]      ; x1 y1 - -
-       movss xmm2, [edi + eax*4 + 8]   ; z1
-       movhps xmm1, [edi + ebx*4]      ; x1 y1 x2 y2
-       movss xmm0, [edi + ebx*4 + 8]   ; z2
-
-       mulps  xmm3, [esp + .iq]
-       xorps  xmm7,xmm7
-       movlhps xmm3, xmm7              ; zero lanes 2-3 of iq*jq so the unused lanes add nothing
-       
-       shufps xmm2, xmm0, 0b           ; z1 z1 z2 z2
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b    ; z1 z2 z1 z2
-       
-       shufps xmm0, xmm0, 10001000b    ; x1 x2 x1 x2
-       shufps xmm1, xmm1, 11011101b    ; y1 y2 y1 y2
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7              ; xmm7 is not read again in this section
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5; refined below as rinv = half*lu*(three - rsq*lu*lu)
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-
-       movaps xmm5, [esp + .vctot]
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm4, xmm3       ; xmm4=fscal
-       addps  xmm5, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's - first j atom uses lane 0 of tx-tz
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5
-
-       shufps  xmm0, xmm0, 11100001b   ; swap lanes 0 and 1: second j atom's force into lane 0
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-       
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-.checksingle:                          
-       mov   edx, [esp + .innerk]
-       and   edx, 1b                   ; bit 0 set means one more atom is left
-       jnz    .dosingle
-       jmp    .updateouterdata
-.dosingle:                     
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       movss xmm3, [esi + eax*4]       ; xmm3(0) has the charge        
-       
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]   
- 
-       mulps  xmm3, [esp + .iq]
-       ; NOTE(review): lanes 1-3 hold zero charge and zero j coords here, so they run with dr = (ix,iy,iz); confirm rsq cannot be 0 there
-       xorps   xmm7, xmm7              ; xmm7 is not read again in this section
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5; refined below as rinv = half*lu*(three - rsq*lu*lu)
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       mov    edi, [ebp + %$faction]
-       movaps xmm5, [esp + .vctot]
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm4, xmm3       ; xmm4=fscal
-       addps  xmm5, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj     
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0      ; low half of xmm3 = lanes 2-3 of xmm0 (high half of xmm3 is left as it was)
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; the sum is now spread over lanes 0 and 1 of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b   ; bring lane 1 into lane 0
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate: horizontal sum of the four vctot lanes into lane 0
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec dword ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]        ; undo the alignment adjustment made in the prologue
-       add esp, eax
-       add esp, 228                    ; release the local stack space
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-proc inl1010_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg             
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg
-%$nsatoms       arg                    
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.iq          equ    48
-.dx          equ    64
-.dy          equ    80
-.dz          equ    96
-.vctot       equ   112
-.fix         equ   128
-.fiy         equ   144
-.fiz         equ   160
-.half        equ   176
-.three       equ   192
-.is3         equ   208
-.ii3         equ   212
-.shX        equ   216
-.shY         equ   220
-.shZ         equ   224
-.ntia       equ   228  
-.innerjjnr0  equ   232
-.innerk0     equ   236
-.innerjjnr   equ   240
-.innerk      equ   244         
-.salign             equ   248                                          
-.nscoul      equ   252
-.solnr      equ   256          
-       
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 260            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_three]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-       add   [ebp + %$nsatoms], dword 8
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-       movss [esp + .shX], xmm0
-       movss [esp + .shY], xmm1
-       movss [esp + .shZ], xmm2
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       mov   ecx, [eax]
-       add   [ebp + %$nsatoms], dword 12
-       mov   [esp + .nscoul], ecx      
-
-       ; clear vctot 
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       mov   [esp + .solnr], ebx
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-
-       mov   ecx, [esp + .nscoul]
-       cmp   ecx, dword 0
-       jnz   .mno_coul
-       jmp   .last_mno
-.mno_coul:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       mulss xmm3, [ebp + %$facel]
-       shufps xmm3, xmm3, 0b
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear  i forces
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_coul_loop
-       jmp   .finish_coul_inner
-
-.unroll_coul_loop:     
-       ;; quad-unrolled innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm5, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b          
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       mulps xmm3, xmm5
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       mov    edi, [ebp + %$faction]
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-
-       movaps xmm5, [esp + .vctot]
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm4, xmm3       ; xmm4=fscal
-       addps  xmm5, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm5
-
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_coul_inner
-       jmp   .unroll_coul_loop
-.finish_coul_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair_coul
-       jmp   .checksingle_coul
-.dopair_coul:  
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8
-
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       mulps  xmm3, [esp + .iq]
-       xorps  xmm7,xmm7
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-
-       movaps xmm5, [esp + .vctot]
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm4, xmm3       ; xmm4=fscal
-       addps  xmm5, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's 
-       movss xmm3, [edi + eax*4]
-       movss xmm4, [edi + eax*4 + 4]
-       movss xmm5, [edi + eax*4 + 8]
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       subps  xmm5, xmm2
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-       
-       movss xmm3, [edi + ebx*4]
-       movss xmm4, [edi + ebx*4 + 4]
-       movss xmm5, [edi + ebx*4 + 8]
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       subps  xmm5, xmm2
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5
-
-.checksingle_coul:                             
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_coul
-       jmp    .updateouterdata_coul
-.dosingle_coul:                        
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       movss xmm3, [esi + eax*4]       ; xmm3(0) has the charge        
-       
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]   
- 
-       mulps  xmm3, [esp + .iq]
-       
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       mov    edi, [ebp + %$faction]
-       movaps xmm5, [esp + .vctot]
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm4, xmm3       ; xmm4=fscal
-       addps  xmm5, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj     
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-        subss   xmm5, xmm2
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_coul:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ;;  loop back to mno.
-       dec dword [esp + .nscoul]
-       jz  .last_mno
-       jmp .mno_coul
-       
-.last_mno:     
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 260
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-proc inl1020_sse               ; Coulomb-only SSE loop: 3-site (O,H1,H2) i molecule against single-site j particles
-%$nri          arg             ; outer-loop count; decremented and stored back once per i molecule
-%$iinr         arg             ; pointer into iinr[]; advanced by 4 bytes per outer iteration
-%$jindex       arg             ; pointer into jindex[]; jindex[n+1]-jindex[n] = number of j particles
-%$jjnr         arg             ; base of jjnr[] (j particle numbers)
-%$shift                arg     ; pointer into shift[]; advanced by 4 bytes per outer iteration
-%$shiftvec     arg             ; base of shiftvec[], read as 3 floats per shift index
-%$fshift       arg             ; base of fshift[]; the summed i forces are added at index 3*shift
-%$gid          arg             ; pointer to the group index; advanced by 4 bytes per outer iteration
-%$pos          arg             ; base of pos[], 3 floats per particle
-%$faction      arg             ; base of the force array, 3 floats per particle
-%$charge       arg             ; base of charge[], one float per particle
-%$facel                arg     ; scalar float multiplied into both i charges
-%$Vc           arg             ; base of the potential array, indexed by group index
-       ;; Stack offsets for local variables (relative to esp after the alignment below).
-       ;; esp is rounded down to a 16-byte boundary so movaps can be used on every 16-byte slot.
-.ixO        equ     0          ; i coordinates (pos + shift vector), one value broadcast to all 4 lanes
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.iqO         equ   144         ; facel*charge[ii], broadcast
-.iqH         equ   160         ; facel*charge[ii+1], broadcast
-.dxO         equ   176         ; distance vectors i-j, kept for the force calculation
-.dyO         equ   192
-.dzO         equ   208 
-.dxH1        equ   224
-.dyH1        equ   240
-.dzH1        equ   256 
-.dxH2        equ   272
-.dyH2        equ   288
-.dzH2        equ   304 
-.qqO         equ   320         ; iqO * charge[j] per lane
-.qqH         equ   336         ; iqH * charge[j] per lane
-.vctot       equ   352         ; per-lane Coulomb energy accumulator
-.fixO        equ   368         ; per-lane i force accumulators
-.fiyO        equ   384
-.fizO        equ   400
-.fixH1       equ   416
-.fiyH1       equ   432
-.fizH1       equ   448
-.fixH2       equ   464
-.fiyH2       equ   480
-.fizH2       equ   496
-.fjx        equ   512          ; j force accumulated over the O, H1 and H2 interactions
-.fjy         equ   528
-.fjz         equ   544
-.half        equ   560         ; aligned copy of sse_half
-.three       equ   576         ; aligned copy of sse_three
-.is3         equ   592         ; 3*shift index (dword)
-.ii3         equ   596         ; 3*i particle index (dword)
-.innerjjnr   equ   600         ; running pointer into jjnr[] (dword)
-.innerk      equ   604         ; remaining inner-loop count (dword)
-.salign             equ   608  ; number of bytes subtracted from esp for alignment (dword)
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 612            ; local stack space (offsets 0..611 above)
-       mov  eax, esp
-       and  eax, 0xf           ; eax = esp mod 16
-       sub esp, eax            ; round esp down to a 16-byte boundary
-       mov [esp + .salign], eax        ; remember the adjustment so .end can undo it
-
-       emms                    ; NOTE(review): no MMX instruction is visible in this routine - presumably defensive
-
-       movups xmm0, [sse_half]         ; sse_half/sse_three are defined outside this chunk;
-       movups xmm1, [sse_three]        ; presumably 0.5 and 3.0 in all lanes - confirm
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       ; charge[ii]   (used for the O site)
-       movss xmm4, [edx + ebx*4 + 4]   ; charge[ii+1] (used for both H sites)
-       movss xmm5, [ebp + %$facel]
-       mulss  xmm3, xmm5
-       mulss  xmm4, xmm5
-
-       shufps xmm3, xmm3, 0b           ; broadcast lane 0 to all lanes
-       shufps xmm4, xmm4, 0b
-       movaps [esp + .iqO], xmm3       ; computed once, before .outer: assumes every i molecule
-       movaps [esp + .iqH], xmm4       ; has the charges of the first one - TODO confirm with caller
-       
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]       ; xmm0-xmm2 = shift vector x,y,z
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       addss xmm3, [eax + ebx*4]       ; O position + shift
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0                ; reload the shift vector into lane 0 of xmm3-xmm5
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]  ; H1 position + shift
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]  ; H2 position + shift
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2                     ;  byte offset of jjnr[nj0]
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4                ;  flags from this sub drive the jge below (mov leaves flags alone)
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms, minus 4
-       jge   .unroll_loop                ;  at least 4 j particles: take the unrolled loop
-       jmp   .odd_inner
-.unroll_loop:
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]        ; q(j1)
-       movss xmm4, [esi + ecx*4]        ; q(j3)
-       movss xmm6, [esi + ebx*4]        ; q(j2)
-       movss xmm7, [esi + edx*4]        ; q(j4)
-
-       shufps xmm3, xmm6, 00000000b     ; xmm3 = q1,q1,q2,q2
-       shufps xmm4, xmm7, 00000000b     ; xmm4 = q3,q3,q4,q4
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movaps xmm4, xmm3            ;  and in xmm4
-       mulps  xmm3, [esp + .iqO]
-       mulps  xmm4, [esp + .iqH]
-
-       movaps  [esp + .qqO], xmm3
-       movaps  [esp + .qqH], xmm4      
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-       movlps xmm4, [esi + eax*4]       ; x1,y1 in the low half
-       movlps xmm5, [esi + ecx*4]       ; x3,y3 in the low half
-       movss xmm2, [esi + eax*4 + 8]    ; z1
-       movss xmm6, [esi + ecx*4 + 8]    ; z3
-
-       movhps xmm4, [esi + ebx*4]       ; xmm4 = x1,y1,x2,y2
-       movhps xmm5, [esi + edx*4]       ; xmm5 = x3,y3,x4,y4
-
-       movss xmm0, [esi + ebx*4 + 8]    ; z2
-       movss xmm1, [esi + edx*4 + 8]    ; z4
-
-       shufps xmm2, xmm0, 0b            ; xmm2 = z1,z1,z2,z2
-       shufps xmm6, xmm1, 0b            ; xmm6 = z3,z3,z4,z4
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b     ; xmm2 = z1..z4
-       
-       shufps xmm0, xmm5, 10001000b     ; xmm0 = x1..x4
-       shufps xmm1, xmm5, 11011101b     ; xmm1 = y1..y4
-
-       ; move ixO-izO to xmm4-xmm6
-       movaps xmm4, [esp + .ixO]
-       movaps xmm5, [esp + .iyO]
-       movaps xmm6, [esp + .izO]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxO], xmm4
-       movaps [esp + .dyO], xmm5
-       movaps [esp + .dzO], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       movaps xmm7, xmm4
-       ; rsqO in xmm7
-
-       ; move ixH1-izH1 to xmm4-xmm6
-       movaps xmm4, [esp + .ixH1]
-       movaps xmm5, [esp + .iyH1]
-       movaps xmm6, [esp + .izH1]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxH1], xmm4
-       movaps [esp + .dyH1], xmm5
-       movaps [esp + .dzH1], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm6, xmm5
-       addps xmm6, xmm4
-       ; rsqH1 in xmm6
-
-       ; move ixH2-izH2 to xmm3-xmm5
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-
-       ; calc dr
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-
-       ; store dr
-       movaps [esp + .dxH2], xmm3
-       movaps [esp + .dyH2], xmm4
-       movaps [esp + .dzH2], xmm5
-       ; square it
-       mulps xmm3,xmm3
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       addps xmm5, xmm4
-       addps xmm5, xmm3
-       ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
-       ; start with rsqO - seed in xmm2 (refined as half*lu*(three-rsq*lu*lu))
-       rsqrtps xmm2, xmm7
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm7      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm7, xmm4      ; rinvO in xmm7
-       ; rsqH1 - seed in xmm2  
-       rsqrtps xmm2, xmm6
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm6      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm6, xmm4      ; rinvH1 in xmm6
-       ; rsqH2 - seed in xmm2  
-       rsqrtps xmm2, xmm5
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm5      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm5, xmm4      ; rinvH2 in xmm5
-
-       ; do O interactions
-       movaps  xmm4, xmm7      
-       mulps   xmm4, xmm4      ; xmm7=rinv, xmm4=rinvsq
-       mulps  xmm7, [esp + .qqO]       ;xmm7=vcoul
-       
-       mulps  xmm4, xmm7       ; total fsO in xmm4
-
-       addps  xmm7, [esp + .vctot]
-       
-       movaps [esp + .vctot], xmm7
-
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update O forces
-       movaps xmm3, [esp + .fixO]
-       movaps xmm4, [esp + .fiyO]
-       movaps xmm7, [esp + .fizO]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixO], xmm3
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm7
-       ; start the j force accumulators with the water O contribution
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ; H1 interactions
-       movaps  xmm4, xmm6      
-       mulps   xmm4, xmm4      ; xmm6=rinv, xmm4=rinvsq
-       mulps  xmm6, [esp + .qqH]       ;xmm6=vcoul
-       mulps  xmm4, xmm6               ; total fsH1 in xmm4
-       
-       addps  xmm6, [esp + .vctot]
-
-       movaps xmm0, [esp + .dxH1]
-       movaps xmm1, [esp + .dyH1]
-       movaps xmm2, [esp + .dzH1]
-       movaps [esp + .vctot], xmm6
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update H1 forces
-       movaps xmm3, [esp + .fixH1]
-       movaps xmm4, [esp + .fiyH1]
-       movaps xmm7, [esp + .fizH1]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH1], xmm3
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm7
-       ; update j forces with water H1
-       addps  xmm0, [esp + .fjx]
-       addps  xmm1, [esp + .fjy]
-       addps  xmm2, [esp + .fjz]
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ; H2 interactions
-       movaps  xmm4, xmm5      
-       mulps   xmm4, xmm4      ; xmm5=rinv, xmm4=rinvsq
-       mulps  xmm5, [esp + .qqH]       ;xmm5=vcoul
-       mulps  xmm4, xmm5               ; total fsH2 in xmm4
-       
-       addps  xmm5, [esp + .vctot]
-
-       movaps xmm0, [esp + .dxH2]
-       movaps xmm1, [esp + .dyH2]
-       movaps xmm2, [esp + .dzH2]
-       movaps [esp + .vctot], xmm5
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update H2 forces
-       movaps xmm3, [esp + .fixH2]
-       movaps xmm4, [esp + .fiyH2]
-       movaps xmm7, [esp + .fizH2]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH2], xmm3
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm7
-
-       mov edi, [ebp +%$faction]
-       ; update j forces: xmm0-xmm2 = total force from O, H1 and H2 on each of the 4 j particles
-       addps xmm0, [esp + .fjx]
-       addps xmm1, [esp + .fjy]
-       addps xmm2, [esp + .fjz]
-
-       movlps xmm4, [edi + eax*4]       ; load x,y force of j1..j4 (same packing as the coordinates)
-       movlps xmm7, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm7, [edi + edx*4]
-       
-       movaps xmm3, xmm4
-       shufps xmm3, xmm7, 10001000b
-       shufps xmm4, xmm7, 11011101b                          
-       ; xmm3 has fjx, xmm4 has fjy.
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       ; unpack them back into x,y pairs for storing.
-       movaps xmm7, xmm3
-       unpcklps xmm7, xmm4              ; xmm7 = fx1,fy1,fx2,fy2
-       unpckhps xmm3, xmm4              ; xmm3 = fx3,fy3,fx4,fy4
-       movlps [edi + eax*4], xmm7
-       movlps [edi + ecx*4], xmm3
-       movhps [edi + ebx*4], xmm7
-       movhps [edi + edx*4], xmm3
-       ; finally z forces 
-       movss  xmm0, [edi + eax*4 + 8]
-       movss  xmm1, [edi + ebx*4 + 8]
-       movss  xmm3, [edi + ecx*4 + 8]
-       movss  xmm4, [edi + edx*4 + 8]
-       subss  xmm0, xmm2
-       shufps xmm2, xmm2, 11100101b     ; lane 1 (j2) -> lane 0
-       subss  xmm1, xmm2
-       shufps xmm2, xmm2, 11101010b     ; lane 2 (j3) -> lane 0
-       subss  xmm3, xmm2
-       shufps xmm2, xmm2, 11111111b     ; lane 3 (j4) -> lane 0
-       subss  xmm4, xmm2
-       movss  [edi + eax*4 + 8], xmm0
-       movss  [edi + ebx*4 + 8], xmm1
-       movss  [edi + ecx*4 + 8], xmm3
-       movss  [edi + edx*4 + 8], xmm4
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .odd_inner
-       jmp   .unroll_loop
-.odd_inner:    
-       add   [esp + .innerk], dword 4    ; undo the -4 bias: innerk = leftover j particles (0..3)
-       jnz   .odd_loop
-       jmp   .updateouterdata
-.odd_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       xorps xmm4, xmm4
-       movss xmm4, [esp + .iqO]        ; xmm4 = iqO,0,0,0
-       mov esi, [ebp + %$charge] 
-       movhps xmm4, [esp + .iqH]       ; xmm4 = iqO,0,iqH,iqH
-       movss xmm3, [esi + eax*4]       ; charge in xmm3
-       shufps xmm3, xmm3, 0b
-       mulps xmm3, xmm4                ; xmm3 = qqO,0,qqH,qqH (lane 1 is an unused dummy)
-       movaps [esp + .qqO], xmm3       ; use oxygen qq for storage.
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-       
-       ; move j coords to xmm0-xmm2
-       movss xmm0, [esi + eax*4]
-       movss xmm1, [esi + eax*4 + 4]
-       movss xmm2, [esi + eax*4 + 8]
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       
-       movss xmm3, [esp + .ixO]        ; xmm3 = ixO,0,0,0
-       movss xmm4, [esp + .iyO]
-       movss xmm5, [esp + .izO]
-               
-       movlps xmm6, [esp + .ixH1]
-       movlps xmm7, [esp + .ixH2]
-       unpcklps xmm6, xmm7             ; low half of xmm6 = ixH1,ixH2
-       movlhps xmm3, xmm6              ; xmm3 = ixO,0,ixH1,ixH2
-       movlps xmm6, [esp + .iyH1]
-       movlps xmm7, [esp + .iyH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm4, xmm6              ; xmm4 = iyO,0,iyH1,iyH2
-       movlps xmm6, [esp + .izH1]
-       movlps xmm7, [esp + .izH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm5, xmm6              ; xmm5 = izO,0,izH1,izH2
-
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       
-       movaps [esp + .dxO], xmm3       ; the O slots hold all three sites here (lanes 0,2,3)
-       movaps [esp + .dyO], xmm4
-       movaps [esp + .dzO], xmm5
-
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       ; rsq in xmm4 (dummy lane 1 holds the squared length of the j position itself).
-
-       rsqrtps xmm5, xmm4      ; NOTE(review): if the j particle is exactly at the origin, lane 1 is rsqrt(0) and 0*inf may put NaN into vctot - confirm
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm3, [esp + .qqO]
-
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm4, xmm3       ; xmm4=total fscal
-       addps  xmm3, [esp + .vctot]
-
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-
-       movaps [esp + .vctot], xmm3
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force), lanes O,dummy,H1,H2
-       movss  xmm3, [esp + .fixO]      
-       movss  xmm4, [esp + .fiyO]      
-       movss  xmm5, [esp + .fizO]      
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esp + .fixO], xmm3      
-       movss  [esp + .fiyO], xmm4      
-       movss  [esp + .fizO], xmm5      ; updated the O force. now do the H's
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       shufps xmm3, xmm3, 11100110b    ; move lane 2 (H1) down to lane 0
-       shufps xmm4, xmm4, 11100110b
-       shufps xmm5, xmm5, 11100110b
-       addss  xmm3, [esp + .fixH1]
-       addss  xmm4, [esp + .fiyH1]
-       addss  xmm5, [esp + .fizH1]
-       movss  [esp + .fixH1], xmm3     
-       movss  [esp + .fiyH1], xmm4     
-       movss  [esp + .fizH1], xmm5     ; updated the H1 force. 
-
-       mov edi, [ebp + %$faction]
-       shufps xmm3, xmm3, 11100111b    ; move lane 3 (H2) down to lane 0
-       shufps xmm4, xmm4, 11100111b
-       shufps xmm5, xmm5, 11100111b
-       addss  xmm3, [esp + .fixH2]
-       addss  xmm4, [esp + .fiyH2]
-       addss  xmm5, [esp + .fizH2]
-       movss  [esp + .fixH2], xmm3     
-       movss  [esp + .fiyH2], xmm4     
-       movss  [esp + .fizH2], xmm5     ; updated the H2 force. 
-
-       ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
-       xorps  xmm5, xmm5
-       movaps xmm3, xmm0
-       movlps xmm6, [edi + eax*4]      ; current j force x,y
-       movss  xmm7, [edi + eax*4 + 8]  ; current j force z
-       unpcklps xmm3, xmm1             ; xmm3 = txO,tyO,tx(dummy),ty(dummy)
-       movlhps  xmm3, xmm5             ; zero the dummy-lane pair: xmm3 = txO,tyO,0,0
-       unpckhps xmm0, xmm1             ; xmm0 = txH1,tyH1,txH2,tyH2
-       addps    xmm0, xmm3
-       movhlps  xmm3, xmm0     
-       addps    xmm0, xmm3     ; x,y sum in xmm0
-
-       movhlps  xmm1, xmm2             ; low half of xmm1 = tzH1,tzH2
-       addss    xmm2, xmm1
-       shufps   xmm1, xmm1, 1b 
-       addss    xmm2, xmm1    ; z sum in xmm2
-       subps    xmm6, xmm0
-       subss    xmm7, xmm2
-       
-       movlps [edi + eax*4],     xmm6
-       movss  [edi + eax*4 + 8], xmm7
-
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .odd_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO]
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; the sum is now split over the two low lanes of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2                ; xmm7 lane 0 = z sum
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       ; xmm6 lanes 0,1 = x sum, y sum
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; the sum is now split over the two low lanes of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; the sum is now split over the two low lanes of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       mov   edx, [ebp + %$gid]        ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4    ; advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-        
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7       
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]        ; undo the alignment adjustment made in the prologue
-       add esp, eax
-       add esp, 612                    ; release the local stack space
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-       
-proc inl1030_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use 
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.jxO        equ   144
-.jyO        equ   160
-.jzO         equ   176
-.jxH1       equ   192
-.jyH1       equ   208
-.jzH1        equ   224
-.jxH2       equ   240
-.jyH2       equ   256
-.jzH2        equ   272
-.dxOO        equ   288
-.dyOO        equ   304
-.dzOO        equ   320 
-.dxOH1       equ   336
-.dyOH1       equ   352
-.dzOH1       equ   368 
-.dxOH2       equ   384
-.dyOH2       equ   400
-.dzOH2       equ   416 
-.dxH1O       equ   432
-.dyH1O       equ   448
-.dzH1O       equ   464 
-.dxH1H1      equ   480
-.dyH1H1      equ   496
-.dzH1H1      equ   512 
-.dxH1H2      equ   528
-.dyH1H2      equ   544
-.dzH1H2      equ   560 
-.dxH2O       equ   576
-.dyH2O       equ   592
-.dzH2O       equ   608 
-.dxH2H1      equ   624
-.dyH2H1      equ   640
-.dzH2H1      equ   656 
-.dxH2H2      equ   672
-.dyH2H2      equ   688
-.dzH2H2      equ   704
-.qqOO        equ   720
-.qqOH        equ   736
-.qqHH        equ   752
-.vctot       equ   768         
-.fixO        equ   784
-.fiyO        equ   800
-.fizO        equ   816
-.fixH1       equ   832
-.fiyH1       equ   848
-.fizH1       equ   864
-.fixH2       equ   880
-.fiyH2       equ   896
-.fizH2       equ   912
-.fjxO       equ   928
-.fjyO        equ   944
-.fjzO        equ   960
-.fjxH1      equ   976
-.fjyH1       equ   992
-.fjzH1       equ  1008
-.fjxH2      equ  1024
-.fjyH2       equ  1040
-.fjzH2       equ  1056
-.half        equ  1072
-.three       equ  1088
-.rsqOO       equ  1104
-.rsqOH1      equ  1120
-.rsqOH2      equ  1136
-.rsqH1O      equ  1152
-.rsqH1H1     equ  1168
-.rsqH1H2     equ  1184
-.rsqH2O      equ  1200
-.rsqH2H1     equ  1216
-.rsqH2H2     equ  1232
-.rinvOO      equ  1248
-.rinvOH1     equ  1264
-.rinvOH2     equ  1280
-.rinvH1O     equ  1296
-.rinvH1H1    equ  1312
-.rinvH1H2    equ  1328
-.rinvH2O     equ  1344
-.rinvH2H1    equ  1360
-.rinvH2H2    equ  1376
-.is3         equ  1392
-.ii3         equ  1396
-.innerjjnr   equ  1400
-.innerk      equ  1404
-.salign             equ  1408                                                  
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 1412           ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_three]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-       
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, xmm3        
-       movss xmm5, [edx + ebx*4 + 4]   
-       movss xmm6, [ebp + %$facel]
-       mulss  xmm3, xmm3
-       mulss  xmm4, xmm5
-       mulss  xmm5, xmm5
-       mulss  xmm3, xmm6
-       mulss  xmm4, xmm6
-       mulss  xmm5, xmm6
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .qqOO], xmm3
-       movaps [esp + .qqOH], xmm4
-       movaps [esp + .qqHH], xmm5
-
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx 
-       
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .single_check
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4] 
-       mov   ecx, [edx + 8]
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-       
-       ; move j coordinates to local temp. variables
-       movlps xmm2, [esi + eax*4]
-       movlps xmm3, [esi + eax*4 + 12]
-       movlps xmm4, [esi + eax*4 + 24]
-
-       movlps xmm5, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 12]
-       movlps xmm7, [esi + ebx*4 + 24]
-
-       movhps xmm2, [esi + ecx*4]
-       movhps xmm3, [esi + ecx*4 + 12]
-       movhps xmm4, [esi + ecx*4 + 24]
-
-       movhps xmm5, [esi + edx*4]
-       movhps xmm6, [esi + edx*4 + 12]
-       movhps xmm7, [esi + edx*4 + 24]
-
-       ;; current state:       
-       ;;  xmm2= jxOa  jyOa  jxOc  jyOc
-       ;;  xmm3= jxH1a jyH1a jxH1c jyH1c
-       ;;  xmm4= jxH2a jyH2a jxH2c jyH2c
-       ;;  xmm5= jxOb  jyOb  jxOd  jyOd
-       ;;  xmm6= jxH1b jyH1b jxH1d jyH1d
-       ;;  xmm7= jxH2b jyH2b jxH2d jyH2d
-       
-       movaps xmm0, xmm2
-       movaps xmm1, xmm3
-       unpcklps xmm0, xmm5     ; xmm0= jxOa  jxOb  jyOa  jyOb
-       unpcklps xmm1, xmm6     ; xmm1= jxH1a jxH1b jyH1a jyH1b
-       unpckhps xmm2, xmm5     ; xmm2= jxOc  jxOd  jyOc  jyOd
-       unpckhps xmm3, xmm6     ; xmm3= jxH1c jxH1d jyH1c jyH1d 
-       movaps xmm5, xmm4
-       movaps   xmm6, xmm0
-       unpcklps xmm4, xmm7     ; xmm4= jxH2a jxH2b jyH2a jyH2b         
-       unpckhps xmm5, xmm7     ; xmm5= jxH2c jxH2d jyH2c jyH2d
-       movaps   xmm7, xmm1
-       movlhps  xmm0, xmm2     ; xmm0= jxOa  jxOb  jxOc  jxOd 
-       movaps [esp + .jxO], xmm0
-       movhlps  xmm2, xmm6     ; xmm2= jyOa  jyOb  jyOc  jyOd
-       movaps [esp + .jyO], xmm2
-       movlhps  xmm1, xmm3
-       movaps [esp + .jxH1], xmm1
-       movhlps  xmm3, xmm7
-       movaps   xmm6, xmm4
-       movaps [esp + .jyH1], xmm3
-       movlhps  xmm4, xmm5
-       movaps [esp + .jxH2], xmm4
-       movhlps  xmm5, xmm6
-       movaps [esp + .jyH2], xmm5
-
-       movss  xmm0, [esi + eax*4 + 8]
-       movss  xmm1, [esi + eax*4 + 20]
-       movss  xmm2, [esi + eax*4 + 32]
-
-       movss  xmm3, [esi + ecx*4 + 8]
-       movss  xmm4, [esi + ecx*4 + 20]
-       movss  xmm5, [esi + ecx*4 + 32]
-
-       movhps xmm0, [esi + ebx*4 + 4]
-       movhps xmm1, [esi + ebx*4 + 16]
-       movhps xmm2, [esi + ebx*4 + 28]
-       
-       movhps xmm3, [esi + edx*4 + 4]
-       movhps xmm4, [esi + edx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 28]
-       
-       shufps xmm0, xmm3, 11001100b
-       shufps xmm1, xmm4, 11001100b
-       shufps xmm2, xmm5, 11001100b
-       movaps [esp + .jzO],  xmm0
-       movaps [esp + .jzH1],  xmm1
-       movaps [esp + .jzH2],  xmm2
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixO]
-       movaps xmm4, [esp + .iyO]
-       movaps xmm5, [esp + .izO]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxOH1], xmm3
-       movaps [esp + .dyOH1], xmm4
-       movaps [esp + .dzOH1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOO], xmm0
-       movaps [esp + .rsqOH1], xmm3
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       subps  xmm3, [esp + .jxO]
-       subps  xmm4, [esp + .jyO]
-       subps  xmm5, [esp + .jzO]
-       movaps [esp + .dxOH2], xmm0
-       movaps [esp + .dyOH2], xmm1
-       movaps [esp + .dzOH2], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1O], xmm3
-       movaps [esp + .dyH1O], xmm4
-       movaps [esp + .dzH1O], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOH2], xmm0
-       movaps [esp + .rsqH1O], xmm3
-
-       movaps xmm0, [esp + .ixH1]
-       movaps xmm1, [esp + .iyH1]
-       movaps xmm2, [esp + .izH1]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH1]
-       subps  xmm1, [esp + .jyH1]
-       subps  xmm2, [esp + .jzH1]
-       subps  xmm3, [esp + .jxH2]
-       subps  xmm4, [esp + .jyH2]
-       subps  xmm5, [esp + .jzH2]
-       movaps [esp + .dxH1H1], xmm0
-       movaps [esp + .dyH1H1], xmm1
-       movaps [esp + .dzH1H1], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1H2], xmm3
-       movaps [esp + .dyH1H2], xmm4
-       movaps [esp + .dzH1H2], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqH1H1], xmm0
-       movaps [esp + .rsqH1H2], xmm3
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxH2O], xmm0
-       movaps [esp + .dyH2O], xmm1
-       movaps [esp + .dzH2O], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH2H1], xmm3
-       movaps [esp + .dyH2H1], xmm4
-       movaps [esp + .dzH2H1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       movaps [esp + .rsqH2O], xmm0
-       movaps [esp + .rsqH2H1], xmm4
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       movaps [esp + .dxH2H2], xmm0
-       movaps [esp + .dyH2H2], xmm1
-       movaps [esp + .dzH2H2], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2
-       movaps [esp + .rsqH2H2], xmm0
-               
-       ; start doing invsqrt. use rsq values in xmm0, xmm4
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinvH2H2
-       mulps   xmm7, [esp + .half] ; rinvH2H1
-       movaps  [esp + .rinvH2H2], xmm3
-       movaps  [esp + .rinvH2H1], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqOO]
-       rsqrtps xmm5, [esp + .rsqOH1]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOO]
-       mulps   xmm5, [esp + .rsqOH1]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOO], xmm3
-       movaps  [esp + .rinvOH1], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqOH2]
-       rsqrtps xmm5, [esp + .rsqH1O]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOH2]
-       mulps   xmm5, [esp + .rsqH1O]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOH2], xmm3
-       movaps  [esp + .rinvH1O], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH1H1]
-       rsqrtps xmm5, [esp + .rsqH1H2]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqH1H1]
-       mulps   xmm5, [esp + .rsqH1H2]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvH1H1], xmm3
-       movaps  [esp + .rinvH1H2], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH2O]
-       movaps  xmm2, xmm1
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, [esp + .rsqH2O]
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2
-       mulps   xmm3, [esp + .half] 
-       movaps  [esp + .rinvH2O], xmm3
-
-       ;; start with OO interaction.
-       movaps xmm0, [esp + .rinvOO]
-       movaps xmm7, xmm0
-       mulps  xmm0, xmm0
-       mulps  xmm7, [esp + .qqOO]
-       mulps  xmm0, xmm7       
-       addps  xmm7, [esp + .vctot] 
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOO]
-       mulps xmm1, [esp + .dyOO]
-       mulps xmm2, [esp + .dzOO]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H1 interaction
-       movaps xmm0, [esp + .rinvOH1]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqOH]
-       mulps xmm0, xmm1        ; fsOH1 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH1]
-       mulps xmm1, [esp + .dyOH1]
-       mulps xmm2, [esp + .dzOH1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H2 interaction
-       movaps xmm0, [esp + .rinvOH2]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqOH]
-       mulps xmm0, xmm1        ; fsOH2 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH2]
-       mulps xmm1, [esp + .dyOH2]
-       mulps xmm2, [esp + .dzOH2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; H1-O interaction
-       movaps xmm0, [esp + .rinvH1O]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqOH]
-       mulps xmm0, xmm1        ; fsH1O 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH1O]
-       mulps xmm1, [esp + .dyH1O]
-       mulps xmm2, [esp + .dzH1O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H1 interaction
-       movaps xmm0, [esp + .rinvH1H1]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqHH]
-       mulps xmm0, xmm1        ; fsH1H1 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH1H1]
-       mulps xmm1, [esp + .dyH1H1]
-       mulps xmm2, [esp + .dzH1H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H2 interaction
-       movaps xmm0, [esp + .rinvH1H2]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqHH]
-       mulps xmm0, xmm1        ; fsH1H2
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH1H2]
-       mulps xmm1, [esp + .dyH1H2]
-       mulps xmm2, [esp + .dzH1H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H2-O interaction
-       movaps xmm0, [esp + .rinvH2O]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqOH]
-       mulps xmm0, xmm1        ; fsH2O 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH2O]
-       mulps xmm1, [esp + .dyH2O]
-       mulps xmm2, [esp + .dzH2O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H1 interaction
-       movaps xmm0, [esp + .rinvH2H1]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqHH]
-       mulps xmm0, xmm1        ; fsH2H1 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH2H1]
-       mulps xmm1, [esp + .dyH2H1]
-       mulps xmm2, [esp + .dzH2H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H2 interaction
-       movaps xmm0, [esp + .rinvH2H2]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqHH]
-       mulps xmm0, xmm1        ; fsH2H2 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps [esp + .vctot], xmm7
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH2H2]
-       mulps xmm1, [esp + .dyH2H2]
-       mulps xmm2, [esp + .dzH2H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       mov edi, [ebp +%$faction]
-               
-       ; Did all interactions - now update j forces.
-       ; 4 j waters with three atoms each - first do a & b j particles
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd 
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpcklps xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjxOb  fjyOb
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOb  fjyOb
-       unpcklps xmm1, xmm2        ; xmm1= fjzOa  fjxH1a fjzOb  fjxH1b
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpcklps xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1b fjzH1b
-       unpcklps xmm5, xmm6        ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
-       movlhps  xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjzOa  fjxH1a  
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOb  fjyOb  fjzOb  fjxH1b
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
-       movups   xmm1, [edi + eax*4]
-       movups   xmm2, [edi + eax*4 + 16]
-       movups   xmm5, [edi + ebx*4]
-       movups   xmm6, [edi + ebx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + eax*4 + 32]
-       movss    xmm3, [edi + ebx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm7, xmm7, 1b
-       
-       movups   [edi + eax*4],     xmm1
-       movups   [edi + eax*4 + 16],xmm2
-       movups   [edi + ebx*4],     xmm5
-       movups   [edi + ebx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + eax*4 + 32], xmm0
-       movss    [edi + ebx*4 + 32], xmm3       
-
-       ;; then do the second pair (c & d)
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpckhps xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjxOd  fjyOd
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOd  fjyOd
-       unpckhps xmm1, xmm2        ; xmm1= fjzOc  fjxH1c fjzOd  fjxH1d
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpckhps xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d  
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1d fjzH1d         
-       unpckhps xmm5, xmm6        ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
-       movlhps  xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjzOc  fjxH1c 
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOd  fjyOd  fjzOd  fjxH1d
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c 
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
-       movups   xmm1, [edi + ecx*4]
-       movups   xmm2, [edi + ecx*4 + 16]
-       movups   xmm5, [edi + edx*4]
-       movups   xmm6, [edi + edx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + ecx*4 + 32]
-       movss    xmm3, [edi + edx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm4, xmm4, 10b
-       shufps   xmm7, xmm7, 11b
-       movups   [edi + ecx*4],     xmm1
-       movups   [edi + ecx*4 + 16],xmm2
-       movups   [edi + edx*4],     xmm5
-       movups   [edi + edx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + ecx*4 + 32], xmm0
-       movss    [edi + edx*4 + 32], xmm3       
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .single_check
-       jmp   .unroll_loop
-.single_check:
-       add   [esp + .innerk], dword 4
-       jnz   .single_loop
-       jmp   .updateouterdata
-.single_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-
-       ; fetch j coordinates
-       xorps xmm3, xmm3
-       xorps xmm4, xmm4
-       xorps xmm5, xmm5
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + eax*4 + 4]
-       movss xmm5, [esi + eax*4 + 8]
-
-       movlps xmm6, [esi + eax*4 + 12]
-       movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
-       ;;  fetch both z coords in one go, to positions 0 and 3 in xmm7
-       movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
-       shufps xmm6, xmm6, 11011000b    ;  xmm6=jxH1 jxH2 jyH1 jyH2
-       movlhps xmm3, xmm6              ; xmm3= jxO   0  jxH1 jxH2 
-       movaps  xmm0, [esp + .ixO]     
-       movaps  xmm1, [esp + .iyO]
-       movaps  xmm2, [esp + .izO]      
-       shufps  xmm4, xmm6, 11100100b ;  xmm4= jyO   0   jyH1 jyH2
-       shufps xmm5, xmm7, 11000100b  ;  xmm5= jzO   0   jzH1 jzH2  
-       ;;  store all j coordinates in jO
-       movaps [esp + .jxO], xmm3
-       movaps [esp + .jyO], xmm4
-       movaps [esp + .jzO], xmm5
-       subps  xmm0, xmm3
-       subps  xmm1, xmm4
-       subps  xmm2, xmm5
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2        ;  have rsq in xmm0.
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       movaps  xmm2, xmm1      
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, xmm0
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2                                                      
-       mulps   xmm3, [esp + .half] ; rinv iO - j water
-
-       xorps   xmm1, xmm1
-       movaps  xmm0, xmm3
-       xorps   xmm4, xmm4
-       mulps   xmm0, xmm0      ; xmm0=rinvsq
-       ;; fetch charges to xmm4 (temporary)
-       movss   xmm4, [esp + .qqOO]
-
-       movhps  xmm4, [esp + .qqOH]
-
-       mulps   xmm3, xmm4      ; xmm3=vcoul
-       mulps   xmm0, xmm3      ;  total fscal
-       addps   xmm3, [esp + .vctot]
-       movaps  [esp + .vctot], xmm3    
-
-       movaps  xmm1, xmm0
-       movaps  xmm2, xmm0
-       mulps   xmm0, [esp + .dxOO]
-       mulps   xmm1, [esp + .dyOO]
-       mulps   xmm2, [esp + .dzOO]
-       ;; initial update for j forces
-       xorps   xmm3, xmm3
-       xorps   xmm4, xmm4
-       xorps   xmm5, xmm5
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixO]
-       addps   xmm1, [esp + .fiyO]
-       addps   xmm2, [esp + .fizO]
-       movaps  [esp + .fixO], xmm0
-       movaps  [esp + .fiyO], xmm1
-       movaps  [esp + .fizO], xmm2
-
-       
-       ;;  done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
-       movaps  xmm0, [esp + .ixH1]
-       movaps  xmm1, [esp + .iyH1]
-       movaps  xmm2, [esp + .izH1]     
-       movaps  xmm3, [esp + .ixH2] 
-       movaps  xmm4, [esp + .iyH2] 
-       movaps  xmm5, [esp + .izH2] 
-       subps   xmm0, [esp + .jxO]
-       subps   xmm1, [esp + .jyO]
-       subps   xmm2, [esp + .jzO]
-       subps   xmm3, [esp + .jxO]
-       subps   xmm4, [esp + .jyO]
-       subps   xmm5, [esp + .jzO]
-       movaps [esp + .dxH1O], xmm0
-       movaps [esp + .dyH1O], xmm1
-       movaps [esp + .dzH1O], xmm2
-       movaps [esp + .dxH2O], xmm3
-       movaps [esp + .dyH2O], xmm4
-       movaps [esp + .dzH2O], xmm5
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       mulps xmm3, xmm3
-       mulps xmm4, xmm4
-       mulps xmm5, xmm5
-       addps xmm0, xmm1
-       addps xmm4, xmm3
-       addps xmm0, xmm2        ;  have rsqH1 in xmm0.
-       addps xmm4, xmm5        ;  have rsqH2 in xmm4.
-
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinv H1 - j water
-       mulps   xmm7, [esp + .half] ; rinv H2 - j water
-
-       ;; assemble charges in xmm6
-       xorps   xmm6, xmm6
-       ; do coulomb interaction
-       movaps  xmm0, xmm3
-       movss   xmm6, [esp + .qqOH]
-       movaps  xmm4, xmm7
-       movhps  xmm6, [esp + .qqHH]
-       mulps   xmm0, xmm0      ;  rinvsq
-       mulps   xmm4, xmm4      ;  rinvsq
-       mulps   xmm3, xmm6      ;  vcoul
-       mulps   xmm7, xmm6      ;  vcoul
-       movaps  xmm2, xmm3
-       addps   xmm2, xmm7      ;  total vcoul
-       mulps   xmm0, xmm3      ;  fscal
-       
-       addps   xmm2, [esp + .vctot]
-       mulps   xmm7, xmm4      ;  fscal
-       movaps  [esp + .vctot], xmm2
-       movaps  xmm1, xmm0
-       movaps  xmm2, xmm0
-       mulps   xmm0, [esp + .dxH1O]
-       mulps   xmm1, [esp + .dyH1O]
-       mulps   xmm2, [esp + .dzH1O]
-       ;;  update forces H1 - j water
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH1]
-       addps   xmm1, [esp + .fiyH1]
-       addps   xmm2, [esp + .fizH1]
-       movaps  [esp + .fixH1], xmm0
-       movaps  [esp + .fiyH1], xmm1
-       movaps  [esp + .fizH1], xmm2
-       ;; do forces H2 - j water
-       movaps xmm0, xmm7
-       movaps xmm1, xmm7
-       movaps xmm2, xmm7
-       mulps   xmm0, [esp + .dxH2O]
-       mulps   xmm1, [esp + .dyH2O]
-       mulps   xmm2, [esp + .dzH2O]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       mov     esi, [ebp + %$faction]
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH2]
-       addps   xmm1, [esp + .fiyH2]
-       addps   xmm2, [esp + .fizH2]
-       movaps  [esp + .fixH2], xmm0
-       movaps  [esp + .fiyH2], xmm1
-       movaps  [esp + .fizH2], xmm2
-
-       ;; update j water forces from local variables
-       movlps  xmm0, [esi + eax*4]
-       movlps  xmm1, [esi + eax*4 + 12]
-       movhps  xmm1, [esi + eax*4 + 24]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       movaps  xmm6, xmm5
-       movaps  xmm7, xmm5
-       shufps  xmm6, xmm6, 10b
-       shufps  xmm7, xmm7, 11b
-       addss   xmm5, [esi + eax*4 + 8]
-       addss   xmm6, [esi + eax*4 + 20]
-       addss   xmm7, [esi + eax*4 + 32]
-       movss   [esi + eax*4 + 8], xmm5
-       movss   [esi + eax*4 + 20], xmm6
-       movss   [esi + eax*4 + 32], xmm7
-       movaps   xmm5, xmm3
-       unpcklps xmm3, xmm4
-       unpckhps xmm5, xmm4
-       addps    xmm0, xmm3
-       addps    xmm1, xmm5
-       movlps  [esi + eax*4], xmm0 
-       movlps  [esi + eax*4 + 12], xmm1 
-       movhps  [esi + eax*4 + 24], xmm1 
-       
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .single_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO] 
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7       
-
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 1412
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-
-
-
-;; ---------------------------------------------------------------------------
-;; inl1100_sse - SSE inner loop: Coulomb + Lennard-Jones interaction of each
-;; i particle with the j particles in its neighbour list.
-;;
-;; Arguments (read as [ebp + %$name]; proc/arg/endproc are macros defined
-;; outside this block - presumably they set up ebp and emit ret, confirm):
-;;   nri      int    number of i particles (outer iterations), must be >= 1
-;;   iinr     int*   i particle index per outer iteration
-;;   jindex   int*   jindex[n]..jindex[n+1] delimits the j list of i particle n
-;;   jjnr     int*   j particle indices
-;;   shift    int*   shift index per outer iteration
-;;   shiftvec float* shift vectors, 3 floats per shift index
-;;   fshift   float* shift forces, 3 floats per shift index (accumulated into)
-;;   gid      int*   group index per outer iteration (indexes Vc and Vnb)
-;;   pos      float* coordinates, 3 floats per particle
-;;   faction  float* forces, 3 floats per particle (accumulated into)
-;;   charge   float* charge per particle
-;;   facel    float  factor multiplied into the i charge
-;;   Vc       float* Coulomb energy per group (accumulated into)
-;;   type     int*   type index per particle
-;;   ntype    int    multiplied with type[i] to form the nbfp row offset
-;;   nbfp     float* (c6,c12) pairs, indexed by 2*(ntype*type[i] + type[j])
-;;   Vnb      float* Lennard-Jones energy per group (accumulated into)
-;;
-;; Per pair:  rinv   = 1/sqrt(dx^2+dy^2+dz^2)   (rsqrtps + one Newton step)
-;;            vcoul  = facel*q_i*q_j*rinv
-;;            vnb    = c12*rinv^12 - c6*rinv^6
-;;            fscal  = (12*c12*rinv^12 - 6*c6*rinv^6 + vcoul)*rinv^2
-;;            f_i   += fscal*dr,  f_j -= fscal*dr
-;;
-;; The pointer/counter argument slots shift, iinr, jindex, gid and nri are
-;; modified in place on the stack as the outer loop advances.
-;; eax, ebx, ecx, edx, esi, edi are saved and restored; xmm0-xmm7 and
-;; mm0-mm3 are overwritten. emms is issued on entry and on exit.
-;; ---------------------------------------------------------------------------
-proc inl1100_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg     
-       ;; stack offsets for local variables
-       ;; esp is rounded down to a 16-byte boundary below so that the
-       ;; 16-byte slots (four packed floats each) can be accessed with movaps
-.ix         equ     0  ; shifted i position, x broadcast to all four lanes
-.iy         equ    16  ; same for y
-.iz          equ    32 ; same for z
-.iq          equ    48 ; facel*charge[ii], broadcast
-.dx          equ    64 ; i-j distance components for the current j atoms
-.dy          equ    80
-.dz          equ    96 
-.c6          equ   112 ; c6 parameters for the current j atoms
-.c12         equ   128 ; c12 parameters for the current j atoms
-.six         equ   144 ; copy of the external constant sse_six
-.twelve      equ   160 ; copy of the external constant sse_twelve
-.vctot       equ   176 ; per-lane Coulomb energy accumulator
-.vnbtot      equ   192 ; per-lane Lennard-Jones energy accumulator
-.fix         equ   208 ; per-lane i force accumulators
-.fiy         equ   224
-.fiz         equ   240
-.half        equ   256 ; copy of the external constant sse_half
-.three       equ   272 ; copy of the external constant sse_three
-.is3         equ   288 ; 3*shift index (dword)
-.ii3         equ   292 ; 3*i particle index (dword)
-.ntia       equ   296  ; 2*ntype*type[ii] (dword)
-.innerjjnr   equ   300 ; pointer to the next unread jjnr entry (dword)
-.innerk      equ   304 ; j atoms left, biased by -4 inside the unrolled loop (dword)
-.salign             equ   308 ; bytes subtracted from esp for alignment (dword)
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, dword 312              ; local stack space
-       mov  eax, esp
-       and  eax, 0xf                   ; eax = esp mod 16
-       sub esp, eax                    ; esp is now 16-byte aligned
-       mov [esp + .salign], eax        ; remembered so .end can undo the adjustment
-
-       emms
-
-       ;; copy the packed constants (defined outside this block) to aligned
-       ;; stack slots; movups because their alignment is not known here
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_three]
-       movups xmm2, [sse_six]
-       movups xmm3, [sse_twelve]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-       movaps [esp + .six],  xmm2
-       movaps [esp + .twelve], xmm3
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]       ;  xmm0-xmm2 = shift vector x, y, z
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       ;  charge[ii]
-       mulss xmm3, [ebp + %$facel]     ;  iq = facel*charge[ii]
-       shufps xmm3, xmm3, 0b           ;  broadcast iq to all four lanes
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]       ;  type[ii]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1                   ;  two floats (c6,c12) per nbfp entry
-        mov   [esp + .ntia], edx       ;  ntia = 2*ntype*type[ii]
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]       ;  i position + shift vector
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b           ;  broadcast ix, iy, iz to all lanes
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear vctot, vnbtot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       ;; NOTE(review): esi and edi are reloaded before use on every path
-       ;; below, so the next two loads look redundant - confirm
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2                     ;  byte offset of jjnr[nj0]
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  innerk = number of j atoms - 4
-       jge   .unroll_loop                ;  flags from the sub: at least four j atoms
-       jmp   .finish_inner
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]        ; q1
-       movss xmm4, [esi + ecx*4]        ; q3
-       movss xmm6, [esi + ebx*4]        ; q2
-       movss xmm7, [esi + edx*4]        ; q4
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b     ; xmm3 = q1 q1 q2 q2
-       shufps xmm4, xmm7, 00000000b     ; xmm4 = q3 q3 q4 q4
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]           ; eax-edx = type[jnr1-4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1                       ; two floats (c6,c12) per nbfp entry
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi                     ; eax-edx = nbfp index for each i-j pair
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]       ; xmm6 = c6_1 c12_1 c6_2 c12_2
-       movlps xmm7, [esi + ecx*4]       ; xmm7 = c6_3 c12_3 c6_4 c12_4
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b     ; even elements: c6 for j1-j4
-       shufps xmm6, xmm7, 11011101b     ; odd elements: c12 for j1-j4
-       
-       movd  eax, mm0                   ; restore jnr1-4
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       mulps xmm3, xmm2                 ;  qq = iq * jq
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]       ; xmm4 = x1 y1 x2 y2 (high half loaded below)
-       movlps xmm5, [esi + ecx*4]       ; xmm5 = x3 y3 x4 y4 (high half loaded below)
-       movss xmm2, [esi + eax*4 + 8]    ; z1
-       movss xmm6, [esi + ecx*4 + 8]    ; z3
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]    ; z2
-       movss xmm1, [esi + edx*4 + 8]    ; z4
-
-       shufps xmm2, xmm0, 0b            ; xmm2 = z1 z1 z2 z2
-       shufps xmm6, xmm1, 0b            ; xmm6 = z3 z3 z4 z4
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b     ; xmm2 = z1 z2 z3 z4
-       
-       shufps xmm0, xmm5, 10001000b     ; xmm0 = x1 x2 x3 x4
-       shufps xmm1, xmm5, 11011101b     ; xmm1 = y1 y2 y3 y4
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5; one Newton-Raphson step follows:
-       ; rinv = half * lu * (three - rsq*lu*lu)
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm1, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm1, [esp + .c6]        ; xmm1 = vnb6  = c6*rinvsix
-       mulps  xmm2, [esp + .c12]       ; xmm2 = vnb12 = c12*rinvtwelve
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1               ; twelve*vnb12 - six*vnb6
-       addps  xmm2, xmm3               ; ... + vcoul
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       addps  xmm3, [esp + .vctot]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm3
-       movaps [esp + .vnbtot], xmm5
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]      ; xmm4 = fx1 fy1 fx2 fy2
-       movlps xmm6, [edi + ecx*4]      ; xmm6 = fx3 fy3 fx4 fy4
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b    ; xmm3 = fjx for j1-j4
-       shufps xmm4, xmm6, 11011101b    ; xmm4 = fjy for j1-j4
-
-       ; now xmm3/xmm4 contain fjx, fjy (the z forces are handled separately below)
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2               ; j1: lane 0 of xmm2 = tz1
-       shufps xmm2, xmm2, 11100101b    ; move tz2 into lane 0
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b    ; move tz3 into lane 0
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b    ; move tz4 into lane 0
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4  ; undo the -4 bias: innerk = 0..3 atoms left
-       mov   edx, [esp + .innerk]
-       and   edx, 10b                  ; bit 1 set: two or three atoms left
-       jnz   .dopair
-       jmp   .checksingle
-.dopair:       
-       mov esi, [ebp + %$charge]
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]                ; eax, ebx = jnr1, jnr2
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8
-
-       xorps xmm3, xmm3
-       movss xmm3, [esi + eax*4]       ; q1
-       movss xmm6, [esi + ebx*4]       ; q2
-       shufps xmm3, xmm6, 00001100b    ; xmm3 = q1 0 q2 q2
-       shufps xmm3, xmm3, 01011000b ; xmm3(0,1) has the charges.
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]          ; type[jnr1]
-       mov edx, [esi + edx*4]          ; type[jnr2]
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]      ; xmm6 = c6_1 c12_1 c6_2 c12_2
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       xorps  xmm7,xmm7
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        ; lanes 0,1 = c6_1 c6_2
-       shufps xmm6, xmm6, 1101b        ; lanes 0,1 = c12_1 c12_2
-       movlhps xmm4, xmm7              ; zero the two unused upper lanes
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]        ; replace jnr with j3
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]      ; xmm1 = x1 y1 x2 y2 (high half loaded below)
-       movss xmm2, [edi + eax*4 + 8]   ; z1
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   ; z2
-
-       mulps  xmm3, [esp + .iq]        ; qq = iq * jq
-
-       movlhps xmm3, xmm7              ; upper two lanes of qq = 0
-       
-       shufps xmm2, xmm0, 0b           ; xmm2 = z1 z1 z2 z2
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b    ; xmm2 = z1 z2 z1 z2
-       
-       shufps xmm0, xmm0, 10001000b    ; xmm0 = x1 x2 x1 x2
-       shufps xmm1, xmm1, 11011101b    ; xmm1 = y1 y2 y1 y2
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7              ; NOTE(review): xmm7 is not read again on this path - looks unused
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5; one Newton-Raphson step follows
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm1, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm1, [esp + .c6]        ; vnb6; zero in the upper lanes since c6 is zero there
-       mulps  xmm2, [esp + .c12]       ; vnb12; likewise
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       addps  xmm2, xmm3
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       addps  xmm3, [esp + .vctot]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm3
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's - first j atom, forces in lane 0
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b   ; swap lanes 0 and 1: second j atom to lane 0
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle:                          
-       mov   edx, [esp + .innerk]
-       and   edx, 1b                   ; bit 0 set: one more j atom to do
-       jnz    .dosingle
-       jmp    .updateouterdata
-.dosingle:                     
-       ;; Only lane 0 carries a real j atom here. The other lanes have
-       ;; qq = c6 = c12 = 0 but dr = i position, since the j coordinates
-       ;; there are zero.
-       ;; NOTE(review): if the shifted i position were exactly (0,0,0),
-       ;; rsqrtps of 0 in those lanes would presumably give inf and then
-       ;; 0*inf = NaN in the accumulators - confirm whether this can occur
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       xorps xmm3, xmm3
-       mov   eax, [ecx]
-       movss xmm3, [esi + eax*4]       ; xmm3(0) has the charge        
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]          ; type[jnr]
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       xorps  xmm6, xmm6
-       movlps xmm6, [esi + ecx*4]      ; xmm6 = c6 c12 0 0
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    ; xmm4 = c6  0 0 0
-       shufps xmm6, xmm6, 11111101b    ; xmm6 = c12 0 0 0
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]        ; replace jnr with j3
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]   
- 
-       mulps  xmm3, [esp + .iq]        ; qq = iq * jq, lane 0 only
-       
-       xorps   xmm7, xmm7              ; NOTE(review): xmm7 is not read again on this path - looks unused
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5; one Newton-Raphson step follows
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm1, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       addps  xmm2, xmm3
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       addps  xmm3, [esp + .vctot]
-       
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm3
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0              ; upper two lanes -> lower two lanes of xmm3-xmm5
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums are now in the lower two lanes of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b           ; lane 1 -> lane 0
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6       ; lane 0 = sum of all four lanes
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6       ; lane 0 = sum of all four lanes
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax                    ; undo the alignment adjustment
-       add esp, dword 312              ; release local stack space
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-proc inl2100_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$krf          arg     
-%$crf          arg     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg     
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.iq          equ    48
-.dx          equ    64
-.dy          equ    80
-.dz          equ    96 
-.c6          equ   112
-.c12         equ   128
-.six         equ   144
-.twelve      equ   160          
-.vctot       equ   176
-.vnbtot      equ   192
-.fix         equ   208
-.fiy         equ   224
-.fiz         equ   240
-.half        equ   256
-.three       equ   272
-.two         equ   288
-.krf        equ   304   
-.crf        equ   320   
-.is3         equ   336
-.ii3         equ   340
-.ntia       equ   344
-.innerjjnr   equ   348
-.innerk      equ   352
-.salign             equ   356                                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, dword 360              ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_three]
-       movups xmm2, [sse_six]
-       movups xmm3, [sse_twelve]
-       movups xmm4, [sse_two]
-       movss xmm5, [ebp + %$krf]
-       movss xmm6, [ebp + %$crf]
-       
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-       movaps [esp + .six],  xmm2
-       movaps [esp + .twelve], xmm3
-       movaps [esp + .two], xmm4
-       shufps xmm5, xmm5, 0b
-       shufps xmm6, xmm6, 0b
-       movaps [esp + .krf], xmm5
-       movaps [esp + .crf], xmm6
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       mulss xmm3, [ebp + %$facel]
-       shufps xmm3, xmm3, 0b
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .finish_inner
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       mulps xmm3, xmm2
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-       
-       movaps xmm7, [esp + .krf]
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       mulps  xmm7, xmm4       ;  xmm7=krsq
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm6, xmm0
-       addps  xmm6, xmm7       ;  xmm6=rinv+krsq
-       movaps xmm1, xmm4
-       subps  xmm6, [esp + .crf]
-       mulps  xmm1, xmm4
-       mulps  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm6, xmm3       ; xmm6=vcoul=qq*(rinv+krsq-crf)
-       mulps  xmm7, [esp + .two]
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       subps  xmm0, xmm7
-       mulps  xmm3, xmm0
-       addps  xmm2, xmm3
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       addps  xmm6, [esp + .vctot]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm6
-       movaps [esp + .vnbtot], xmm5
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm4 contains fjx, fjy (fjz is handled separately below)
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair
-       jmp   .checksingle
-.dopair:       
-       mov esi, [ebp + %$charge]
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8
-
-       xorps xmm3, xmm3
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00001100b 
-       shufps xmm3, xmm3, 01011000b ; xmm3(0,1) has the charges.
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       xorps  xmm7,xmm7
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       mulps  xmm3, [esp + .iq]
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       movaps xmm7, [esp + .krf]
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       mulps  xmm7, xmm4       ;  xmm7=krsq
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm6, xmm0
-       addps  xmm6, xmm7       ;  xmm6=rinv+krsq
-       movaps xmm1, xmm4
-       subps  xmm6, [esp + .crf]
-       mulps  xmm1, xmm4
-       mulps  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm6, xmm3       ; xmm6=vcoul=qq*(rinv+krsq-crf)
-       mulps  xmm7, [esp + .two]       
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       subps  xmm0, xmm7
-       mulps  xmm3, xmm0       
-       addps  xmm2, xmm3
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       addps  xmm6, [esp + .vctot]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm6
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle:                          
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle
-       jmp    .updateouterdata
-.dosingle:                     
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       xorps xmm3, xmm3
-       mov   eax, [ecx]
-       movss xmm3, [esi + eax*4]       ; xmm3(0) has the charge        
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       xorps  xmm6, xmm6
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]   
- 
-       mulps  xmm3, [esp + .iq]
-       
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       movaps xmm7, [esp + .krf]
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       mulps  xmm7, xmm4       ;  xmm7=krsq
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm6, xmm0
-       addps  xmm6, xmm7       ;  xmm6=rinv+krsq
-       movaps xmm1, xmm4
-       subps  xmm6, [esp + .crf]       
-       mulps  xmm1, xmm4
-       mulps  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm6, xmm3       ; xmm6=vcoul
-       mulps  xmm7, [esp + .two]
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       subps  xmm0, xmm7
-       mulps  xmm3, xmm0
-       addps  xmm2, xmm3
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       addps  xmm6, [esp + .vctot]
-       
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm6
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; sums are now in positions 0-1 of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, dword 360
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-proc inl2000_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$krf          arg     
-%$crf           arg
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.iq          equ    48
-.dx          equ    64
-.dy          equ    80
-.dz          equ    96 
-.vctot       equ   112
-.fix         equ   128
-.fiy         equ   144
-.fiz         equ   160
-.half        equ   176
-.three       equ   192
-.two         equ   208
-.krf        equ   224   
-.crf        equ   240   
-.is3         equ   256
-.ii3         equ   260
-.innerjjnr   equ   264
-.innerk      equ   268
-.salign             equ   272                                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, dword 276              ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_three]
-       movups xmm4, [sse_two]
-       movss xmm5, [ebp + %$krf]
-       movss xmm6, [ebp + %$crf]
-       
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-       movaps [esp + .two], xmm4
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .krf], xmm5
-       shufps xmm6, xmm6, 0b
-       movaps [esp + .crf], xmm6
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       mulss xmm3, [ebp + %$facel]
-       shufps xmm3, xmm3, 0b
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .finish_inner
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       mulps xmm3, xmm2
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-       
-       movaps xmm7, [esp + .krf]
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       mulps  xmm7, xmm4       ;  xmm7=krsq
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm6, xmm0
-       addps  xmm6, xmm7       ;  xmm6=rinv+krsq
-
-       subps  xmm6, [esp + .crf] ;  xmm6=rinv+krsq-crf
-
-       mulps  xmm6, xmm3       ; xmm6=vcoul=qq*(rinv+krsq-crf)
-       mulps  xmm7, [esp + .two]
-
-       subps  xmm0, xmm7
-       mulps  xmm3, xmm0       
-       mulps  xmm4, xmm3       ; xmm4=total fscal
-       addps  xmm6, [esp + .vctot]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm6
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm4 contains fjx, fjy (fjz is handled separately below)
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair
-       jmp   .checksingle
-.dopair:       
-       mov esi, [ebp + %$charge]
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8
-
-       xorps xmm3, xmm3
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00001100b 
-       shufps xmm3, xmm3, 01011000b ; xmm3(0,1) has the charges.       
-
-       mov edi, [ebp + %$pos]  
-                               
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       mulps  xmm3, [esp + .iq]
-
-       xorps  xmm7,xmm7
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       movaps xmm7, [esp + .krf]
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       mulps  xmm7, xmm4       ;  xmm7=krsq
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm6, xmm0
-       addps  xmm6, xmm7       ;  xmm6=rinv+krsq
-
-       subps  xmm6, [esp + .crf] ;  xmm6=rinv+krsq-crf
-
-       mulps  xmm6, xmm3       ; xmm6=vcoul=qq*(rinv+krsq-crf)
-       mulps  xmm7, [esp + .two]       
-
-       subps  xmm0, xmm7
-       mulps  xmm3, xmm0       
-
-       mulps  xmm4, xmm3       ; xmm4=total fscal
-       addps  xmm6, [esp + .vctot]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm6
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle:                          
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle
-       jmp    .updateouterdata
-.dosingle:                     
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       xorps xmm3, xmm3
-       mov   eax, [ecx]
-       movss xmm3, [esi + eax*4]       ; xmm3(0) has the charge                
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]   
- 
-       mulps  xmm3, [esp + .iq]
-       
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       movaps xmm7, [esp + .krf]
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       mulps  xmm7, xmm4       ;  xmm7=krsq
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm6, xmm0
-       addps  xmm6, xmm7       ;  xmm6=rinv+krsq
-
-       subps  xmm6, [esp + .crf] ;  xmm6=rinv+krsq-crf
-
-       mulps  xmm6, xmm3       ; xmm6=vcoul
-       mulps  xmm7, [esp + .two]
-
-       subps  xmm0, xmm7
-       mulps  xmm3, xmm0
-       mulps  xmm4, xmm3       ; xmm4=total fscal
-       addps  xmm6, [esp + .vctot]
-       
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm6
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, dword 276
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-
-proc inl1110_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg     
-%$nsatoms       arg                    
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.iq          equ    48
-.dx          equ    64
-.dy          equ    80
-.dz          equ    96 
-.c6          equ   112
-.c12         equ   128
-.two         equ   144
-.six         equ   160
-.twelve      equ   176          
-.vctot       equ   192
-.vnbtot      equ   208
-.fix         equ   224
-.fiy         equ   240
-.fiz         equ   256
-.half        equ   272
-.three       equ   288
-.is3         equ   304
-.ii3         equ   308
-.shX        equ   312
-.shY         equ   316
-.shZ         equ   320
-.ntia       equ   324  
-.innerjjnr0  equ   328
-.innerk0     equ   332
-.innerjjnr   equ   336
-.innerk      equ   340
-.salign             equ   344                                                  
-.nsvdwc      equ   348
-.nscoul      equ   352
-.nsvdw       equ   356
-.solnr      equ   360          
-
-       push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 364            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movups xmm3, [sse_six]
-       movups xmm4, [sse_twelve]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two], xmm1
-       movaps [esp + .three], xmm2
-       movaps [esp + .six],  xmm3
-       movaps [esp + .twelve], xmm4
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movlps xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 8] 
-       movlps [esp + .shX], xmm0
-       movss [esp + .shZ], xmm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       add   [ebp + %$nsatoms], dword 12
-       mov   ecx, [eax]        
-       mov   edx, [eax + 4]
-       mov   eax, [eax + 8]    
-       sub   ecx, eax
-       sub   eax, edx
-       
-       mov   [esp + .nsvdwc], edx
-       mov   [esp + .nscoul], eax
-       mov   [esp + .nsvdw], ecx
-               
-       ;; clear potential
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       mov   [esp + .solnr],  ebx
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-
-       mov   ecx, [esp + .nsvdwc]
-       cmp   ecx, dword 0
-       jnz   .mno_vdwc
-       jmp   .testcoul
-.mno_vdwc:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       mulss xmm3, [ebp + %$facel]
-       shufps xmm3, xmm3, 0b
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear i forces
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdwc_loop
-       jmp   .finish_vdwc_inner
-.unroll_vdwc_loop:     
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       mulps xmm3, xmm2
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm1, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       addps  xmm2, xmm3
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       addps  xmm3, [esp + .vctot]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm3
-       movaps [esp + .vnbtot], xmm5
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_vdwc_inner
-       jmp   .unroll_vdwc_loop
-.finish_vdwc_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair_vdwc
-       jmp   .checksingle_vdwc
-.dopair_vdwc:  
-       mov esi, [ebp + %$charge]
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8
-
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       xorps  xmm7,xmm7
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       mulps  xmm3, [esp + .iq]
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm1, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       addps  xmm2, xmm3
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       addps  xmm3, [esp + .vctot]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm3
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle_vdwc:                             
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_vdwc
-       jmp    .updateouterdata_vdwc
-.dosingle_vdwc:                        
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       movss xmm3, [esi + eax*4]       ; xmm3(0) has the charge        
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       xorps  xmm6, xmm6
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]   
- 
-       mulps  xmm3, [esp + .iq]
-       
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm1, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       addps  xmm2, xmm3
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       addps  xmm3, [esp + .vctot]
-       
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm3
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_vdwc:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdwc]
-       jz  .testcoul
-       jmp .mno_vdwc
-.testcoul:
-       mov  ecx, [esp + .nscoul]
-       cmp  ecx, byte 0
-       jnz  .mno_coul
-       jmp  .testvdw
-.mno_coul:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       mulss xmm3, [ebp + %$facel]
-       shufps xmm3, xmm3, 0b
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear  i forces
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_coul_loop
-       jmp   .finish_coul_inner
-
-.unroll_coul_loop:     
-       ;; quad-unrolled innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm5, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b          
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       mulps xmm3, xmm5
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       mov    edi, [ebp + %$faction]
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-
-       movaps xmm5, [esp + .vctot]
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm4, xmm3       ; xmm4=fscal
-       addps  xmm5, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm5
-
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_coul_inner
-       jmp   .unroll_coul_loop
-.finish_coul_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair_coul
-       jmp   .checksingle_coul
-.dopair_coul:  
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8
-
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       mulps  xmm3, [esp + .iq]
-       xorps  xmm7,xmm7
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-
-       movaps xmm5, [esp + .vctot]
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm4, xmm3       ; xmm4=fscal
-       addps  xmm5, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's 
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-       
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle_coul:                             
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_coul
-       jmp    .updateouterdata_coul
-.dosingle_coul:                        
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       movss xmm3, [esi + eax*4]       ; xmm3(0) has the charge        
-       
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]   
- 
-       mulps  xmm3, [esp + .iq]
-       
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       mov    edi, [ebp + %$faction]
-       movaps xmm5, [esp + .vctot]
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm4, xmm3       ; xmm4=fscal
-       addps  xmm5, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vctot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj     
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-.updateouterdata_coul:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ;;  loop back to mno.
-       dec dword [esp + .nscoul]
-       jz  .testvdw
-       jmp .mno_coul
-.testvdw:
-       mov  ecx, [esp + .nsvdw]
-       cmp  ecx, byte 0
-       jnz  .mno_vdw
-       jmp  .last_mno
-.mno_vdw:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-       
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdw_loop
-       jmp   .finish_vdw_inner
-.unroll_vdw_loop:      
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5
-       subps xmm0, xmm4
-       mulps xmm0, xmm5        ;  xmm0=rinvsq
-       movaps xmm4, xmm0
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_vdw_inner
-       jmp   .unroll_vdw_loop
-.finish_vdw_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair_vdw
-       jmp   .checksingle_vdw
-.dopair_vdw:   
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       xorps  xmm7,xmm7
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5
-       subps xmm0, xmm4
-       mulps xmm0, xmm5        ;  xmm0=rinvsq
-       movaps xmm4, xmm0
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle_vdw:                              
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_vdw
-       jmp    .updateouterdata_vdw
-.dosingle_vdw:                 
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]                
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       xorps  xmm6, xmm6
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]   
-       
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5
-       subps xmm0, xmm4
-       mulps xmm0, xmm5        ;  xmm0=rinvsq
-       movaps xmm4, xmm0
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_vdw:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-       
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdw]
-       jz  .last_mno
-       jmp .mno_vdw
-.last_mno:     
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 364
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-proc inl1120_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg     
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.iqO         equ   144 
-.iqH         equ   160 
-.dxO         equ   176
-.dyO         equ   192
-.dzO         equ   208 
-.dxH1        equ   224
-.dyH1        equ   240
-.dzH1        equ   256 
-.dxH2        equ   272
-.dyH2        equ   288
-.dzH2        equ   304 
-.qqO         equ   320
-.qqH         equ   336
-.c6          equ   352
-.c12         equ   368
-.six         equ   384
-.twelve      equ   400          
-.vctot       equ   416
-.vnbtot      equ   432
-.fixO        equ   448
-.fiyO        equ   464
-.fizO        equ   480
-.fixH1       equ   496
-.fiyH1       equ   512
-.fizH1       equ   528
-.fixH2       equ   544
-.fiyH2       equ   560
-.fizH2       equ   576
-.fjx        equ   592
-.fjy         equ   608
-.fjz         equ   624
-.half        equ   640
-.three       equ   656
-.is3         equ   672
-.ii3         equ   676
-.ntia       equ   680  
-.innerjjnr   equ   684
-.innerk      equ   688
-.salign             equ   692                                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 696            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_three]
-       movups xmm2, [sse_six]
-       movups xmm3, [sse_twelve]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-       movaps [esp + .six],  xmm2
-       movaps [esp + .twelve], xmm3
-
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, [edx + ebx*4 + 4]   
-       movss xmm5, [ebp + %$facel]
-       mulss  xmm3, xmm5
-       mulss  xmm4, xmm5
-
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       movaps [esp + .iqO], xmm3
-       movaps [esp + .iqH], xmm4
-       
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       imul  ecx, [ebp + %$ntype]      ; ecx = ntia = 2*ntype*type[ii0]
-       mov   [esp + .ntia], ecx                
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .odd_inner
-.unroll_loop:
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movaps xmm4, xmm3            ;  and in xmm4
-       mulps  xmm3, [esp + .iqO]
-       mulps  xmm4, [esp + .iqH]
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-
-       movaps  [esp + .qqO], xmm3
-       movaps  [esp + .qqH], xmm4
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ixO-izO to xmm4-xmm6
-       movaps xmm4, [esp + .ixO]
-       movaps xmm5, [esp + .iyO]
-       movaps xmm6, [esp + .izO]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxO], xmm4
-       movaps [esp + .dyO], xmm5
-       movaps [esp + .dzO], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       movaps xmm7, xmm4
-       ; rsqO in xmm7
-
-       ; move ixH1-izH1 to xmm4-xmm6
-       movaps xmm4, [esp + .ixH1]
-       movaps xmm5, [esp + .iyH1]
-       movaps xmm6, [esp + .izH1]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxH1], xmm4
-       movaps [esp + .dyH1], xmm5
-       movaps [esp + .dzH1], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm6, xmm5
-       addps xmm6, xmm4
-       ; rsqH1 in xmm6
-
-       ; move ixH2-izH2 to xmm3-xmm5
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-
-       ; calc dr
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-
-       ; store dr
-       movaps [esp + .dxH2], xmm3
-       movaps [esp + .dyH2], xmm4
-       movaps [esp + .dzH2], xmm5
-       ; square it
-       mulps xmm3,xmm3
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       addps xmm5, xmm4
-       addps xmm5, xmm3
-       ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
-       ; start with rsqO - seed in xmm2        
-       rsqrtps xmm2, xmm7
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm7      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm7, xmm4      ; rinvO in xmm7
-       ; rsqH1 - seed in xmm2  
-       rsqrtps xmm2, xmm6
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm6      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm6, xmm4      ; rinvH1 in xmm6
-       ; rsqH2 - seed in xmm2  
-       rsqrtps xmm2, xmm5
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm5      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm5, xmm4      ; rinvH2 in xmm5
-
-       ; do O interactions
-       movaps  xmm4, xmm7      
-       mulps   xmm4, xmm4      ; xmm7=rinv, xmm4=rinvsq
-       movaps xmm1, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm7, [esp + .qqO]       ;xmm7=vcoul
-       
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm3, xmm2
-       subps  xmm3, xmm1       ; vnb=vnb12-vnb6                
-       addps  xmm3, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       addps  xmm2, xmm7       
-       mulps  xmm4, xmm2       ; total fsO in xmm4
-
-       addps  xmm7, [esp + .vctot]
-       
-       movaps [esp + .vnbtot], xmm3
-       movaps [esp + .vctot], xmm7
-
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update O forces
-       movaps xmm3, [esp + .fixO]
-       movaps xmm4, [esp + .fiyO]
-       movaps xmm7, [esp + .fizO]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixO], xmm3
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm7
-       ; update j forces with water O
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ; H1 interactions
-       movaps  xmm4, xmm6      
-       mulps   xmm4, xmm4      ; xmm6=rinv, xmm4=rinvsq
-       mulps  xmm6, [esp + .qqH]       ;xmm6=vcoul
-       mulps  xmm4, xmm6               ; total fsH1 in xmm4
-       
-       addps  xmm6, [esp + .vctot]
-
-       movaps xmm0, [esp + .dxH1]
-       movaps xmm1, [esp + .dyH1]
-       movaps xmm2, [esp + .dzH1]
-       movaps [esp + .vctot], xmm6
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update H1 forces
-       movaps xmm3, [esp + .fixH1]
-       movaps xmm4, [esp + .fiyH1]
-       movaps xmm7, [esp + .fizH1]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH1], xmm3
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm7
-       ; update j forces with water H1
-       addps  xmm0, [esp + .fjx]
-       addps  xmm1, [esp + .fjy]
-       addps  xmm2, [esp + .fjz]
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ; H2 interactions
-       movaps  xmm4, xmm5      
-       mulps   xmm4, xmm4      ; xmm5=rinv, xmm4=rinvsq
-       mulps  xmm5, [esp + .qqH]       ;xmm5=vcoul
-       mulps  xmm4, xmm5               ; total fsH1 in xmm4
-       
-       addps  xmm5, [esp + .vctot]
-
-       movaps xmm0, [esp + .dxH2]
-       movaps xmm1, [esp + .dyH2]
-       movaps xmm2, [esp + .dzH2]
-       movaps [esp + .vctot], xmm5
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update H2 forces
-       movaps xmm3, [esp + .fixH2]
-       movaps xmm4, [esp + .fiyH2]
-       movaps xmm7, [esp + .fizH2]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH2], xmm3
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm7
-
-       mov edi, [ebp +%$faction]
-       ; update j forces
-       addps xmm0, [esp + .fjx]
-       addps xmm1, [esp + .fjy]
-       addps xmm2, [esp + .fjz]
-
-       movlps xmm4, [edi + eax*4]
-       movlps xmm7, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm7, [edi + edx*4]
-       
-       movaps xmm3, xmm4
-       shufps xmm3, xmm7, 10001000b
-       shufps xmm4, xmm7, 11011101b                          
-       ; xmm3 has fjx, xmm4 has fjy.
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       ; unpack the back for storing.
-       movaps xmm7, xmm3
-       unpcklps xmm7, xmm4
-       unpckhps xmm3, xmm4     
-       movlps [edi + eax*4], xmm7
-       movlps [edi + ecx*4], xmm3
-       movhps [edi + ebx*4], xmm7
-       movhps [edi + edx*4], xmm3
-       ; finally z forces 
-       movss  xmm0, [edi + eax*4 + 8]
-       movss  xmm1, [edi + ebx*4 + 8]
-       movss  xmm3, [edi + ecx*4 + 8]
-       movss  xmm4, [edi + edx*4 + 8]
-       subss  xmm0, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm1, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm3, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm4, xmm2
-       movss  [edi + eax*4 + 8], xmm0
-       movss  [edi + ebx*4 + 8], xmm1
-       movss  [edi + ecx*4 + 8], xmm3
-       movss  [edi + edx*4 + 8], xmm4
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .odd_inner
-       jmp   .unroll_loop
-.odd_inner:    
-       add   [esp + .innerk], dword 4
-       jnz   .odd_loop
-       jmp   .updateouterdata
-.odd_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       xorps xmm4, xmm4
-       movss xmm4, [esp + .iqO]
-       mov esi, [ebp + %$charge] 
-       movhps xmm4, [esp + .iqH]     
-       movss xmm3, [esi + eax*4]       ; charge in xmm3
-       shufps xmm3, xmm3, 0b
-       mulps xmm3, xmm4
-       movaps [esp + .qqO], xmm3       ; use oxygen qq for storage.
-
-       xorps xmm6, xmm6
-       mov esi, [ebp + %$type]
-       mov ebx, [esi + eax*4]
-       mov esi, [ebp + %$nbfp]
-       shl ebx, 1      
-       add ebx, [esp + .ntia]
-       movlps xmm6, [esi + ebx*4]
-       movaps xmm7, xmm6
-       shufps xmm6, xmm6, 11111100b
-       shufps xmm7, xmm7, 11111101b
-       movaps [esp + .c6], xmm6
-       movaps [esp + .c12], xmm7
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-       
-       ; move j coords to xmm0-xmm2
-       movss xmm0, [esi + eax*4]
-       movss xmm1, [esi + eax*4 + 4]
-       movss xmm2, [esi + eax*4 + 8]
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       
-       movss xmm3, [esp + .ixO]
-       movss xmm4, [esp + .iyO]
-       movss xmm5, [esp + .izO]
-               
-       movlps xmm6, [esp + .ixH1]
-       movlps xmm7, [esp + .ixH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm3, xmm6
-       movlps xmm6, [esp + .iyH1]
-       movlps xmm7, [esp + .iyH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm4, xmm6
-       movlps xmm6, [esp + .izH1]
-       movlps xmm7, [esp + .izH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm5, xmm6
-
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       
-       movaps [esp + .dxO], xmm3
-       movaps [esp + .dyO], xmm4
-       movaps [esp + .dzO], xmm5
-
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       ; rsq in xmm4.
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm1, xmm4
-       mulss  xmm1, xmm4
-       movaps xmm3, [esp + .qqO]
-       mulss  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulss  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm3, xmm0       ; xmm3=vcoul
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subss  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulss  xmm1, [esp + .six]
-       mulss  xmm2, [esp + .twelve]
-       subss  xmm2, xmm1
-       addps  xmm2, xmm3
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       addps  xmm3, [esp + .vctot]
-
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-
-       movaps [esp + .vctot], xmm3
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       movss  xmm3, [esp + .fixO]      
-       movss  xmm4, [esp + .fiyO]      
-       movss  xmm5, [esp + .fizO]      
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esp + .fixO], xmm3      
-       movss  [esp + .fiyO], xmm4      
-       movss  [esp + .fizO], xmm5      ; updated the O force. now do the H's
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       shufps xmm3, xmm3, 11100110b    ; shift right
-       shufps xmm4, xmm4, 11100110b
-       shufps xmm5, xmm5, 11100110b
-       addss  xmm3, [esp + .fixH1]
-       addss  xmm4, [esp + .fiyH1]
-       addss  xmm5, [esp + .fizH1]
-       movss  [esp + .fixH1], xmm3     
-       movss  [esp + .fiyH1], xmm4     
-       movss  [esp + .fizH1], xmm5     ; updated the H1 force. 
-
-       mov edi, [ebp + %$faction]
-       shufps xmm3, xmm3, 11100111b    ; shift right
-       shufps xmm4, xmm4, 11100111b
-       shufps xmm5, xmm5, 11100111b
-       addss  xmm3, [esp + .fixH2]
-       addss  xmm4, [esp + .fiyH2]
-       addss  xmm5, [esp + .fizH2]
-       movss  [esp + .fixH2], xmm3     
-       movss  [esp + .fiyH2], xmm4     
-       movss  [esp + .fizH2], xmm5     ; updated the H2 force. 
-
-       ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
-       xorps  xmm5, xmm5
-       movaps xmm3, xmm0
-       movlps xmm6, [edi + eax*4]
-       movss  xmm7, [edi + eax*4 + 8]
-       unpcklps xmm3, xmm1
-       movlhps  xmm3, xmm5     
-       unpckhps xmm0, xmm1             
-       addps    xmm0, xmm3
-       movhlps  xmm3, xmm0     
-       addps    xmm0, xmm3     ; x,y sum in xmm0
-
-       movhlps  xmm1, xmm2
-       addss    xmm2, xmm1
-       shufps   xmm1, xmm1, 1b 
-       addss    xmm2, xmm1    ; z sum in xmm2
-       subps    xmm6, xmm0
-       subss    xmm7, xmm2
-       
-       movlps [edi + eax*4],     xmm6
-       movss  [edi + eax*4 + 8], xmm7
-
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .odd_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO]
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       mov   edx, [ebp + %$gid]  
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4    
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-        
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 696
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-       
-proc inl1130_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.jxO        equ   144
-.jyO        equ   160
-.jzO         equ   176
-.jxH1       equ   192
-.jyH1       equ   208
-.jzH1        equ   224
-.jxH2       equ   240
-.jyH2       equ   256
-.jzH2        equ   272
-.dxOO        equ   288
-.dyOO        equ   304
-.dzOO        equ   320 
-.dxOH1       equ   336
-.dyOH1       equ   352
-.dzOH1       equ   368 
-.dxOH2       equ   384
-.dyOH2       equ   400
-.dzOH2       equ   416 
-.dxH1O       equ   432
-.dyH1O       equ   448
-.dzH1O       equ   464 
-.dxH1H1      equ   480
-.dyH1H1      equ   496
-.dzH1H1      equ   512 
-.dxH1H2      equ   528
-.dyH1H2      equ   544
-.dzH1H2      equ   560 
-.dxH2O       equ   576
-.dyH2O       equ   592
-.dzH2O       equ   608 
-.dxH2H1      equ   624
-.dyH2H1      equ   640
-.dzH2H1      equ   656 
-.dxH2H2      equ   672
-.dyH2H2      equ   688
-.dzH2H2      equ   704
-.qqOO        equ   720
-.qqOH        equ   736
-.qqHH        equ   752
-.c6          equ   768
-.c12         equ   784
-.six         equ   800
-.twelve      equ   816          
-.vctot       equ   832
-.vnbtot      equ   848
-.fixO        equ   864
-.fiyO        equ   880
-.fizO        equ   896
-.fixH1       equ   912
-.fiyH1       equ   928
-.fizH1       equ   944
-.fixH2       equ   960
-.fiyH2       equ   976
-.fizH2       equ   992
-.fjxO       equ  1008
-.fjyO        equ  1024
-.fjzO        equ  1040
-.fjxH1      equ  1056
-.fjyH1       equ  1072
-.fjzH1       equ  1088
-.fjxH2      equ  1104
-.fjyH2       equ  1120
-.fjzH2       equ  1136
-.half        equ  1152
-.three       equ  1168
-.rsqOO       equ  1184
-.rsqOH1      equ  1200
-.rsqOH2      equ  1216
-.rsqH1O      equ  1232
-.rsqH1H1     equ  1248
-.rsqH1H2     equ  1264
-.rsqH2O      equ  1280
-.rsqH2H1     equ  1296
-.rsqH2H2     equ  1312
-.rinvOO      equ  1328
-.rinvOH1     equ  1344
-.rinvOH2     equ  1360
-.rinvH1O     equ  1376
-.rinvH1H1    equ  1392
-.rinvH1H2    equ  1408
-.rinvH2O     equ  1424
-.rinvH2H1    equ  1440
-.rinvH2H2    equ  1456
-.is3         equ  1472
-.ii3         equ  1476
-.innerjjnr   equ  1480
-.innerk      equ  1484
-.salign             equ  1488                                                  
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 1492           ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_three]
-       movups xmm2, [sse_six]
-       movups xmm3, [sse_twelve]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-       movaps [esp + .six],  xmm2
-       movaps [esp + .twelve], xmm3
-
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, xmm3        
-       movss xmm5, [edx + ebx*4 + 4]   
-       movss xmm6, [ebp + %$facel]
-       mulss  xmm3, xmm3
-       mulss  xmm4, xmm5
-       mulss  xmm5, xmm5
-       mulss  xmm3, xmm6
-       mulss  xmm4, xmm6
-       mulss  xmm5, xmm6
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .qqOO], xmm3
-       movaps [esp + .qqOH], xmm4
-       movaps [esp + .qqHH], xmm5
-               
-       xorps xmm0, xmm0
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       mov   edx, ecx
-       imul  ecx, [ebp + %$ntype]      ; ecx = ntia = 2*ntype*type[ii0]
-       add   edx, ecx
-       mov   eax, [ebp + %$nbfp]
-       movlps xmm0, [eax + edx*4] 
-       movaps xmm1, xmm0
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 01010101b
-       movaps [esp + .c6], xmm0
-       movaps [esp + .c12], xmm1
-
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx 
-       
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .single_check
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4] 
-       mov   ecx, [edx + 8]
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-       
-       ; move j coordinates to local temp. variables
-       movlps xmm2, [esi + eax*4]
-       movlps xmm3, [esi + eax*4 + 12]
-       movlps xmm4, [esi + eax*4 + 24]
-
-       movlps xmm5, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 12]
-       movlps xmm7, [esi + ebx*4 + 24]
-
-       movhps xmm2, [esi + ecx*4]
-       movhps xmm3, [esi + ecx*4 + 12]
-       movhps xmm4, [esi + ecx*4 + 24]
-
-       movhps xmm5, [esi + edx*4]
-       movhps xmm6, [esi + edx*4 + 12]
-       movhps xmm7, [esi + edx*4 + 24]
-
-       ;; current state:       
-       ;;  xmm2= jxOa  jyOa  jxOc  jyOc
-       ;;  xmm3= jxH1a jyH1a jxH1c jyH1c
-       ;;  xmm4= jxH2a jyH2a jxH2c jyH2c
-       ;;  xmm5= jxOb  jyOb  jxOd  jyOd
-       ;;  xmm6= jxH1b jyH1b jxH1d jyH1d
-       ;;  xmm7= jxH2b jyH2b jxH2d jyH2d
-       
-       movaps xmm0, xmm2
-       movaps xmm1, xmm3
-       unpcklps xmm0, xmm5     ; xmm0= jxOa  jxOb  jyOa  jyOb
-       unpcklps xmm1, xmm6     ; xmm1= jxH1a jxH1b jyH1a jyH1b
-       unpckhps xmm2, xmm5     ; xmm2= jxOc  jxOd  jyOc  jyOd
-       unpckhps xmm3, xmm6     ; xmm3= jxH1c jxH1d jyH1c jyH1d 
-       movaps xmm5, xmm4
-       movaps   xmm6, xmm0
-       unpcklps xmm4, xmm7     ; xmm4= jxH2a jxH2b jyH2a jyH2b         
-       unpckhps xmm5, xmm7     ; xmm5= jxH2c jxH2d jyH2c jyH2d
-       movaps   xmm7, xmm1
-       movlhps  xmm0, xmm2     ; xmm0= jxOa  jxOb  jxOc  jxOd 
-       movaps [esp + .jxO], xmm0
-       movhlps  xmm2, xmm6     ; xmm2= jyOa  jyOb  jyOc  jyOd
-       movaps [esp + .jyO], xmm2
-       movlhps  xmm1, xmm3
-       movaps [esp + .jxH1], xmm1
-       movhlps  xmm3, xmm7
-       movaps   xmm6, xmm4
-       movaps [esp + .jyH1], xmm3
-       movlhps  xmm4, xmm5
-       movaps [esp + .jxH2], xmm4
-       movhlps  xmm5, xmm6
-       movaps [esp + .jyH2], xmm5
-
-       movss  xmm0, [esi + eax*4 + 8]
-       movss  xmm1, [esi + eax*4 + 20]
-       movss  xmm2, [esi + eax*4 + 32]
-
-       movss  xmm3, [esi + ecx*4 + 8]
-       movss  xmm4, [esi + ecx*4 + 20]
-       movss  xmm5, [esi + ecx*4 + 32]
-
-       movhps xmm0, [esi + ebx*4 + 4]
-       movhps xmm1, [esi + ebx*4 + 16]
-       movhps xmm2, [esi + ebx*4 + 28]
-       
-       movhps xmm3, [esi + edx*4 + 4]
-       movhps xmm4, [esi + edx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 28]
-       
-       shufps xmm0, xmm3, 11001100b
-       shufps xmm1, xmm4, 11001100b
-       shufps xmm2, xmm5, 11001100b
-       movaps [esp + .jzO],  xmm0
-       movaps [esp + .jzH1],  xmm1
-       movaps [esp + .jzH2],  xmm2
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixO]
-       movaps xmm4, [esp + .iyO]
-       movaps xmm5, [esp + .izO]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxOH1], xmm3
-       movaps [esp + .dyOH1], xmm4
-       movaps [esp + .dzOH1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOO], xmm0
-       movaps [esp + .rsqOH1], xmm3
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       subps  xmm3, [esp + .jxO]
-       subps  xmm4, [esp + .jyO]
-       subps  xmm5, [esp + .jzO]
-       movaps [esp + .dxOH2], xmm0
-       movaps [esp + .dyOH2], xmm1
-       movaps [esp + .dzOH2], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1O], xmm3
-       movaps [esp + .dyH1O], xmm4
-       movaps [esp + .dzH1O], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOH2], xmm0
-       movaps [esp + .rsqH1O], xmm3
-
-       movaps xmm0, [esp + .ixH1]
-       movaps xmm1, [esp + .iyH1]
-       movaps xmm2, [esp + .izH1]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH1]
-       subps  xmm1, [esp + .jyH1]
-       subps  xmm2, [esp + .jzH1]
-       subps  xmm3, [esp + .jxH2]
-       subps  xmm4, [esp + .jyH2]
-       subps  xmm5, [esp + .jzH2]
-       movaps [esp + .dxH1H1], xmm0
-       movaps [esp + .dyH1H1], xmm1
-       movaps [esp + .dzH1H1], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1H2], xmm3
-       movaps [esp + .dyH1H2], xmm4
-       movaps [esp + .dzH1H2], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqH1H1], xmm0
-       movaps [esp + .rsqH1H2], xmm3
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxH2O], xmm0
-       movaps [esp + .dyH2O], xmm1
-       movaps [esp + .dzH2O], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH2H1], xmm3
-       movaps [esp + .dyH2H1], xmm4
-       movaps [esp + .dzH2H1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       movaps [esp + .rsqH2O], xmm0
-       movaps [esp + .rsqH2H1], xmm4
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       movaps [esp + .dxH2H2], xmm0
-       movaps [esp + .dyH2H2], xmm1
-       movaps [esp + .dzH2H2], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2
-       movaps [esp + .rsqH2H2], xmm0
-               
-       ; start doing invsqrt. use rsq values in xmm0, xmm4
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinvH2H2
-       mulps   xmm7, [esp + .half] ; rinvH2H1
-       movaps  [esp + .rinvH2H2], xmm3
-       movaps  [esp + .rinvH2H1], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqOO]
-       rsqrtps xmm5, [esp + .rsqOH1]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOO]
-       mulps   xmm5, [esp + .rsqOH1]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOO], xmm3
-       movaps  [esp + .rinvOH1], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqOH2]
-       rsqrtps xmm5, [esp + .rsqH1O]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOH2]
-       mulps   xmm5, [esp + .rsqH1O]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOH2], xmm3
-       movaps  [esp + .rinvH1O], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH1H1]
-       rsqrtps xmm5, [esp + .rsqH1H2]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqH1H1]
-       mulps   xmm5, [esp + .rsqH1H2]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvH1H1], xmm3
-       movaps  [esp + .rinvH1H2], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH2O]
-       movaps  xmm2, xmm1
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, [esp + .rsqH2O]
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2
-       mulps   xmm3, [esp + .half] 
-       movaps  [esp + .rinvH2O], xmm3
-
-       ;; start with OO interaction.
-       movaps xmm0, [esp + .rinvOO]
-       movaps xmm7, xmm0
-       mulps  xmm0, xmm0
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ; xmm1=rinvsix
-       mulps  xmm7, [esp + .qqOO]
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=rinvtwelve
-       mulps  xmm1, [esp + .c6]        
-       mulps  xmm2, [esp + .c12]       
-       movaps xmm3, xmm2
-       subps  xmm3, xmm1       ; xmm3=vnb12-vnb6
-       addps  xmm3, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       movaps [esp + .vnbtot], xmm3
-       subps  xmm2, xmm1
-       addps  xmm2, xmm7
-       addps  xmm7, [esp + .vctot]
-       mulps  xmm0, xmm2       
- 
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOO]
-       mulps xmm1, [esp + .dyOO]
-       mulps xmm2, [esp + .dzOO]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H1 interaction
-       movaps xmm0, [esp + .rinvOH1]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqOH]
-       mulps xmm0, xmm1        ; fsOH1 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH1]
-       mulps xmm1, [esp + .dyOH1]
-       mulps xmm2, [esp + .dzOH1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H2 interaction
-       movaps xmm0, [esp + .rinvOH2]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqOH]
-       mulps xmm0, xmm1        ; fsOH2 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH2]
-       mulps xmm1, [esp + .dyOH2]
-       mulps xmm2, [esp + .dzOH2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; H1-O interaction
-       movaps xmm0, [esp + .rinvH1O]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqOH]
-       mulps xmm0, xmm1        ; fsH1O 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH1O]
-       mulps xmm1, [esp + .dyH1O]
-       mulps xmm2, [esp + .dzH1O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H1 interaction
-       movaps xmm0, [esp + .rinvH1H1]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqHH]
-       mulps xmm0, xmm1        ; fsH1H1 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH1H1]
-       mulps xmm1, [esp + .dyH1H1]
-       mulps xmm2, [esp + .dzH1H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H2 interaction (i H1 against the H2 atoms of the four j entries)
-       movaps xmm0, [esp + .rinvH1H2]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0        ; rinv squared
-       mulps xmm1, [esp + .qqHH] ; rinv * qqHH
-       mulps xmm0, xmm1        ; fsH1H2 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxH2] ; xmm3-xmm5 = running j H2 force
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH1H2] ; xmm0-xmm2 = scalar force times dx, dy, dz
-       mulps xmm1, [esp + .dyH1H2]
-       mulps xmm2, [esp + .dzH1H2]
-       subps xmm3, xmm0        ; j H2 receives the negated force
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1] ; i H1 accumulates the force itself
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H2-O interaction
-       movaps xmm0, [esp + .rinvH2O]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqOH]
-       mulps xmm0, xmm1        ; fsH2O 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH2O]
-       mulps xmm1, [esp + .dyH2O]
-       mulps xmm2, [esp + .dzH2O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H1 interaction
-       movaps xmm0, [esp + .rinvH2H1]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqHH]
-       mulps xmm0, xmm1        ; fsH2H1 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH2H1]
-       mulps xmm1, [esp + .dyH2H1]
-       mulps xmm2, [esp + .dzH2H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H2 interaction
-       movaps xmm0, [esp + .rinvH2H2]
-       movaps xmm1, xmm0
-       mulps xmm0, xmm0
-       mulps xmm1, [esp + .qqHH]
-       mulps xmm0, xmm1        ; fsH2H2 
-       addps xmm7, xmm1        ; add to local vctot.
-       movaps xmm1, xmm0
-       movaps [esp + .vctot], xmm7
-       movaps xmm2, xmm0
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH2H2]
-       mulps xmm1, [esp + .dyH2H2]
-       mulps xmm2, [esp + .dzH2H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       mov edi, [ebp +%$faction]
-               
-       ; Did all interactions - now update j forces.
-       ; 4 j waters with three atoms each - first do a & b j particles
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd 
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpcklps xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjxOb  fjyOb
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOb  fjyOb
-       unpcklps xmm1, xmm2        ; xmm1= fjzOa  fjxH1a fjzOb  fjxH1b
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpcklps xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1b fjzH1b
-       unpcklps xmm5, xmm6        ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
-       movlhps  xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjzOa  fjxH1a  
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOb  fjyOb  fjzOb  fjxH1b
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
-       movups   xmm1, [edi + eax*4]
-       movups   xmm2, [edi + eax*4 + 16]
-       movups   xmm5, [edi + ebx*4]
-       movups   xmm6, [edi + ebx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + eax*4 + 32]
-       movss    xmm3, [edi + ebx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm7, xmm7, 1b
-       
-       movups   [edi + eax*4],     xmm1
-       movups   [edi + eax*4 + 16],xmm2
-       movups   [edi + ebx*4],     xmm5
-       movups   [edi + ebx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + eax*4 + 32], xmm0
-       movss    [edi + ebx*4 + 32], xmm3       
-
-       ;; then do the second pair (c & d)
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpckhps xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjxOd  fjyOd
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOd  fjyOd
-       unpckhps xmm1, xmm2        ; xmm1= fjzOc  fjxH1c fjzOd  fjxH1d
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpckhps xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d  
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1d fjzH1d         
-       unpckhps xmm5, xmm6        ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
-       movlhps  xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjzOc  fjxH1c 
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOd  fjyOd  fjzOd  fjxH1d
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c 
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
-       movups   xmm1, [edi + ecx*4]
-       movups   xmm2, [edi + ecx*4 + 16]
-       movups   xmm5, [edi + edx*4]
-       movups   xmm6, [edi + edx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + ecx*4 + 32]
-       movss    xmm3, [edi + edx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm4, xmm4, 10b
-       shufps   xmm7, xmm7, 11b
-       movups   [edi + ecx*4],     xmm1
-       movups   [edi + ecx*4 + 16],xmm2
-       movups   [edi + edx*4],     xmm5
-       movups   [edi + edx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + ecx*4 + 32], xmm0
-       movss    [edi + edx*4 + 32], xmm3       
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .single_check
-       jmp   .unroll_loop
-.single_check:
-       add   [esp + .innerk], dword 4    ; undo the -4 bias: innerk = j entries still to do
-       jnz   .single_loop                ; some remain: process them one per iteration
-       jmp   .updateouterdata            ; none remain: go reduce and store the i forces
-.single_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-
-       ; fetch j coordinates
-       xorps xmm3, xmm3
-       xorps xmm4, xmm4
-       xorps xmm5, xmm5
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + eax*4 + 4]
-       movss xmm5, [esi + eax*4 + 8]
-
-       movlps xmm6, [esi + eax*4 + 12]
-       movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
-       ;;  fetch both z coords in one go, to positions 0 and 3 in xmm7
-       movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
-       shufps xmm6, xmm6, 11011000b    ;  xmm6=jxH1 jxH2 jyH1 jyH2
-       movlhps xmm3, xmm6              ; xmm3= jxO   0  jxH1 jxH2 
-       movaps  xmm0, [esp + .ixO]     
-       movaps  xmm1, [esp + .iyO]
-       movaps  xmm2, [esp + .izO]      
-       shufps  xmm4, xmm6, 11100100b ;  xmm4= jyO   0   jyH1 jyH2
-       shufps xmm5, xmm7, 11000100b  ;  xmm5= jzO   0   jzH1 jzH2  
-       ;;  store all j coordinates in jO
-       movaps [esp + .jxO], xmm3
-       movaps [esp + .jyO], xmm4
-       movaps [esp + .jzO], xmm5
-       subps  xmm0, xmm3
-       subps  xmm1, xmm4
-       subps  xmm2, xmm5
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2        ;  have rsq in xmm0.
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       movaps  xmm2, xmm1      
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, xmm0
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2                                                      
-       mulps   xmm3, [esp + .half] ; rinv iO - j water
-
-       xorps   xmm1, xmm1
-       movaps  xmm0, xmm3
-       xorps   xmm4, xmm4
-       mulps   xmm0, xmm0      ; xmm0=rinvsq
-       ;; fetch charges to xmm4 (temporary)
-       movss   xmm4, [esp + .qqOO]
-       movss   xmm1, xmm0
-       movhps  xmm4, [esp + .qqOH]
-       mulss   xmm1, xmm0
-       mulps   xmm3, xmm4      ; xmm3=vcoul
-       mulss   xmm1, xmm0      ;  xmm1(0)=rinvsix
-       movaps  xmm2, xmm1      ;  zero everything else in xmm2
-       mulss   xmm2, xmm2      ;  xmm2=rinvtwelve
-
-       mulss   xmm1, [esp + .c6]
-       mulss   xmm2, [esp + .c12]
-       movaps  xmm4, xmm2
-       subss   xmm4, xmm1      ;  vnbtot=vnb12-vnb6
-       addps   xmm4, [esp + .vnbtot]
-       mulss   xmm1, [esp + .six]
-       mulss   xmm2, [esp + .twelve]   
-       movaps  [esp + .vnbtot], xmm4
-       subss   xmm2, xmm1      ; fsD+fsR
-       addps   xmm2, xmm3      ; fsC+fsD+fsR
-
-       addps   xmm3, [esp + .vctot]
-       mulps   xmm0, xmm2      ;  total fscal
-       movaps  [esp + .vctot], xmm3    
-
-       movaps  xmm1, xmm0
-       movaps  xmm2, xmm0
-       mulps   xmm0, [esp + .dxOO]
-       mulps   xmm1, [esp + .dyOO]
-       mulps   xmm2, [esp + .dzOO]
-       ;; initial update for j forces
-       xorps   xmm3, xmm3
-       xorps   xmm4, xmm4
-       xorps   xmm5, xmm5
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixO]
-       addps   xmm1, [esp + .fiyO]
-       addps   xmm2, [esp + .fizO]
-       movaps  [esp + .fixO], xmm0
-       movaps  [esp + .fiyO], xmm1
-       movaps  [esp + .fizO], xmm2
-
-       
-       ;;  done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
-       movaps  xmm0, [esp + .ixH1]
-       movaps  xmm1, [esp + .iyH1]
-       movaps  xmm2, [esp + .izH1]     
-       movaps  xmm3, [esp + .ixH2] 
-       movaps  xmm4, [esp + .iyH2] 
-       movaps  xmm5, [esp + .izH2] 
-       subps   xmm0, [esp + .jxO]
-       subps   xmm1, [esp + .jyO]
-       subps   xmm2, [esp + .jzO]
-       subps   xmm3, [esp + .jxO]
-       subps   xmm4, [esp + .jyO]
-       subps   xmm5, [esp + .jzO]
-       movaps [esp + .dxH1O], xmm0
-       movaps [esp + .dyH1O], xmm1
-       movaps [esp + .dzH1O], xmm2
-       movaps [esp + .dxH2O], xmm3
-       movaps [esp + .dyH2O], xmm4
-       movaps [esp + .dzH2O], xmm5
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       mulps xmm3, xmm3
-       mulps xmm4, xmm4
-       mulps xmm5, xmm5
-       addps xmm0, xmm1
-       addps xmm4, xmm3
-       addps xmm0, xmm2        ;  have rsqH1 in xmm0.
-       addps xmm4, xmm5        ;  have rsqH2 in xmm4.
-
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinv H1 - j water
-       mulps   xmm7, [esp + .half] ; rinv H2 - j water
-
-       ;; assemble charges in xmm6
-       xorps   xmm6, xmm6
-       ; do coulomb interaction
-       movaps  xmm0, xmm3
-       movss   xmm6, [esp + .qqOH]
-       movaps  xmm4, xmm7
-       movhps  xmm6, [esp + .qqHH]
-       mulps   xmm0, xmm0      ;  rinvsq
-       mulps   xmm4, xmm4      ;  rinvsq
-       mulps   xmm3, xmm6      ;  vcoul
-       mulps   xmm7, xmm6      ;  vcoul
-       movaps  xmm2, xmm3
-       addps   xmm2, xmm7      ;  total vcoul
-       mulps   xmm0, xmm3      ;  fscal
-       
-       addps   xmm2, [esp + .vctot]
-       mulps   xmm7, xmm4      ;  fscal
-       movaps  [esp + .vctot], xmm2
-       movaps  xmm1, xmm0
-       movaps  xmm2, xmm0
-       mulps   xmm0, [esp + .dxH1O]
-       mulps   xmm1, [esp + .dyH1O]
-       mulps   xmm2, [esp + .dzH1O]
-       ;;  update forces H1 - j water
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH1]
-       addps   xmm1, [esp + .fiyH1]
-       addps   xmm2, [esp + .fizH1]
-       movaps  [esp + .fixH1], xmm0
-       movaps  [esp + .fiyH1], xmm1
-       movaps  [esp + .fizH1], xmm2
-       ;; do forces H2 - j water
-       movaps xmm0, xmm7
-       movaps xmm1, xmm7
-       movaps xmm2, xmm7
-       mulps   xmm0, [esp + .dxH2O]
-       mulps   xmm1, [esp + .dyH2O]
-       mulps   xmm2, [esp + .dzH2O]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       mov     esi, [ebp + %$faction]
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH2]
-       addps   xmm1, [esp + .fiyH2]
-       addps   xmm2, [esp + .fizH2]
-       movaps  [esp + .fixH2], xmm0
-       movaps  [esp + .fiyH2], xmm1
-       movaps  [esp + .fizH2], xmm2
-
-       ;; update j water forces from local variables
-       movlps  xmm0, [esi + eax*4]
-       movlps  xmm1, [esi + eax*4 + 12]
-       movhps  xmm1, [esi + eax*4 + 24]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       movaps  xmm6, xmm5
-       movaps  xmm7, xmm5
-       shufps  xmm6, xmm6, 10b
-       shufps  xmm7, xmm7, 11b
-       addss   xmm5, [esi + eax*4 + 8]
-       addss   xmm6, [esi + eax*4 + 20]
-       addss   xmm7, [esi + eax*4 + 32]
-       movss   [esi + eax*4 + 8], xmm5
-       movss   [esi + eax*4 + 20], xmm6
-       movss   [esi + eax*4 + 32], xmm7
-       movaps   xmm5, xmm3
-       unpcklps xmm3, xmm4
-       unpckhps xmm5, xmm4
-       addps    xmm0, xmm3
-       addps    xmm1, xmm5
-       movlps  [esi + eax*4], xmm0 
-       movlps  [esi + eax*4 + 12], xmm1 
-       movhps  [esi + eax*4 + 24], xmm1 
-       
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .single_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO] 
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0      ; copy the two high elements down to the low half
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums are now in the two low elements of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b   ; bring element 1 down to element 0
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 1492
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-proc inl2120_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$krf          arg     
-%$crf          arg     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg     
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.iqO         equ   144 
-.iqH         equ   160 
-.dxO         equ   176
-.dyO         equ   192
-.dzO         equ   208 
-.dxH1        equ   224
-.dyH1        equ   240
-.dzH1        equ   256 
-.dxH2        equ   272
-.dyH2        equ   288
-.dzH2        equ   304 
-.qqO         equ   320
-.qqH         equ   336
-.c6          equ   352
-.c12         equ   368
-.six         equ   384
-.twelve      equ   400          
-.vctot       equ   416
-.vnbtot      equ   432
-.fixO        equ   448
-.fiyO        equ   464
-.fizO        equ   480
-.fixH1       equ   496
-.fiyH1       equ   512
-.fizH1       equ   528
-.fixH2       equ   544
-.fiyH2       equ   560
-.fizH2       equ   576
-.fjx        equ   592
-.fjy         equ   608
-.fjz         equ   624
-.half        equ   640
-.three       equ   656
-.two        equ   672
-.krf        equ   688
-.crf        equ   704
-.krsqO       equ   720
-.krsqH1      equ   736
-.krsqH2             equ   752                  
-.is3         equ   768
-.ii3         equ   772
-.ntia       equ   776  
-.innerjjnr   equ   780
-.innerk      equ   784
-.salign             equ   788                                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 792            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_three]
-       movups xmm2, [sse_six]
-       movups xmm3, [sse_twelve]
-       movups xmm4, [sse_two]
-       movss xmm5, [ebp + %$krf]
-       movss xmm6, [ebp + %$crf]
-
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-       movaps [esp + .six],  xmm2
-       movaps [esp + .twelve], xmm3
-       movaps [esp + .two], xmm4
-       shufps xmm5, xmm5, 0b
-       shufps xmm6, xmm6, 0b
-       movaps [esp + .krf], xmm5
-       movaps [esp + .crf], xmm6
-       
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, [edx + ebx*4 + 4]   
-       movss xmm5, [ebp + %$facel]
-       mulss  xmm3, xmm5
-       mulss  xmm4, xmm5
-
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       movaps [esp + .iqO], xmm3
-       movaps [esp + .iqH], xmm4
-       
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       imul  ecx, [ebp + %$ntype]      ; ecx = ntia = 2*ntype*type[ii0]
-       mov   [esp + .ntia], ecx                
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .odd_inner
-.unroll_loop:
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movaps xmm4, xmm3            ;  and in xmm4
-       mulps  xmm3, [esp + .iqO]
-       mulps  xmm4, [esp + .iqH]
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-
-       movaps  [esp + .qqO], xmm3
-       movaps  [esp + .qqH], xmm4
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ixO-izO to xmm4-xmm6
-       movaps xmm4, [esp + .ixO]
-       movaps xmm5, [esp + .iyO]
-       movaps xmm6, [esp + .izO]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxO], xmm4
-       movaps [esp + .dyO], xmm5
-       movaps [esp + .dzO], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       movaps xmm7, xmm4
-       ; rsqO in xmm7
-
-       ; move ixH1-izH1 to xmm4-xmm6
-       movaps xmm4, [esp + .ixH1]
-       movaps xmm5, [esp + .iyH1]
-       movaps xmm6, [esp + .izH1]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxH1], xmm4
-       movaps [esp + .dyH1], xmm5
-       movaps [esp + .dzH1], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm6, xmm5
-       addps xmm6, xmm4
-       ; rsqH1 in xmm6
-
-       ; move ixH2-izH2 to xmm3-xmm5
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-
-       ; calc dr
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-
-       ; store dr
-       movaps [esp + .dxH2], xmm3
-       movaps [esp + .dyH2], xmm4
-       movaps [esp + .dzH2], xmm5
-       ; square it
-       mulps xmm3,xmm3
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       addps xmm5, xmm4
-       addps xmm5, xmm3
-       ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
-       movaps xmm0, xmm5
-       movaps xmm1, xmm6
-       movaps xmm2, xmm7
-
-       mulps  xmm0, [esp + .krf]       
-       mulps  xmm1, [esp + .krf]       
-       mulps  xmm2, [esp + .krf]       
-
-       movaps [esp + .krsqH2], xmm0
-       movaps [esp + .krsqH1], xmm1
-       movaps [esp + .krsqO], xmm2
-       
-       ; start with rsqO - seed in xmm2        
-       rsqrtps xmm2, xmm7
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm7      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm7, xmm4      ; rinvO in xmm7
-       ; rsqH1 - seed in xmm2  
-       rsqrtps xmm2, xmm6
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm6      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm6, xmm4      ; rinvH1 in xmm6
-       ; rsqH2 - seed in xmm2  
-       rsqrtps xmm2, xmm5
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm5      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm5, xmm4      ; rinvH2 in xmm5
-
-       ; do O interactions
-       movaps  xmm4, xmm7      
-       mulps   xmm4, xmm4      ; xmm7=rinv, xmm4=rinvsq
-       movaps xmm1, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm3, xmm2
-       subps  xmm3, xmm1       ; vnb=vnb12-vnb6                
-       addps  xmm3, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1       ;  nb part of fs
-
-       movaps xmm0, xmm7
-       movaps xmm1, [esp + .krsqO]
-       addps  xmm0, xmm1
-       mulps  xmm1, [esp + .two]
-       subps  xmm0, [esp + .crf] ;  xmm0=rinv+krsq-crf
-       subps  xmm7, xmm1
-       mulps  xmm0, [esp + .qqO]
-       mulps  xmm7, [esp + .qqO]
-       addps  xmm2, xmm7
-
-       mulps  xmm4, xmm2       ; total fsO in xmm4
-
-       addps  xmm0, [esp + .vctot]
-       movaps [esp + .vnbtot], xmm3
-       movaps [esp + .vctot], xmm0
-
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update O forces
-       movaps xmm3, [esp + .fixO]
-       movaps xmm4, [esp + .fiyO]
-       movaps xmm7, [esp + .fizO]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixO], xmm3
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm7
-       ; update j forces with water O
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ; H1 interactions
-       movaps  xmm4, xmm6      
-       mulps   xmm4, xmm4      ; xmm6=rinv, xmm4=rinvsq
-       movaps  xmm7, xmm6
-       movaps  xmm0, [esp + .krsqH1]
-       addps   xmm6, xmm0      ; xmm6=rinv+krsq
-       mulps   xmm0, [esp + .two]
-       subps   xmm6, [esp + .crf]
-       subps   xmm7, xmm0      ; xmm7=rinv-2*krsq
-       mulps   xmm6, [esp + .qqH] ;  vcoul
-       mulps   xmm7, [esp + .qqH]
-       mulps  xmm4, xmm7               ; total fsH1 in xmm4
-       
-       addps  xmm6, [esp + .vctot]
-
-       movaps xmm0, [esp + .dxH1]
-       movaps xmm1, [esp + .dyH1]
-       movaps xmm2, [esp + .dzH1]
-       movaps [esp + .vctot], xmm6
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update H1 forces
-       movaps xmm3, [esp + .fixH1]
-       movaps xmm4, [esp + .fiyH1]
-       movaps xmm7, [esp + .fizH1]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH1], xmm3
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm7
-       ; update j forces with water H1
-       addps  xmm0, [esp + .fjx]
-       addps  xmm1, [esp + .fjy]
-       addps  xmm2, [esp + .fjz]
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ; H2 interactions
-       movaps  xmm4, xmm5      
-       mulps   xmm4, xmm4      ; xmm5=rinv, xmm4=rinvsq
-       movaps  xmm7, xmm5
-       movaps  xmm0, [esp + .krsqH2]
-       addps   xmm5, xmm0      ; xmm5=rinv+krsq
-       mulps   xmm0, [esp + .two]
-       subps   xmm5, [esp + .crf]
-       subps   xmm7, xmm0      ; xmm7=rinv-2*krsq
-       mulps   xmm5, [esp + .qqH] ;  vcoul
-       mulps   xmm7, [esp + .qqH]
-       mulps  xmm4, xmm7               ; total fsH2 in xmm4
-       
-       addps  xmm5, [esp + .vctot]
-
-       movaps xmm0, [esp + .dxH2]
-       movaps xmm1, [esp + .dyH2]
-       movaps xmm2, [esp + .dzH2]
-       movaps [esp + .vctot], xmm5
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update H2 forces
-       movaps xmm3, [esp + .fixH2]
-       movaps xmm4, [esp + .fiyH2]
-       movaps xmm7, [esp + .fizH2]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH2], xmm3
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm7
-
-       mov edi, [ebp +%$faction]
-       ; update j forces
-       addps xmm0, [esp + .fjx]
-       addps xmm1, [esp + .fjy]
-       addps xmm2, [esp + .fjz]
-
-       movlps xmm4, [edi + eax*4]
-       movlps xmm7, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm7, [edi + edx*4]
-       
-       movaps xmm3, xmm4
-       shufps xmm3, xmm7, 10001000b
-       shufps xmm4, xmm7, 11011101b                          
-       ; xmm3 has fjx, xmm4 has fjy.
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       ; unpack the back for storing.
-       movaps xmm7, xmm3
-       unpcklps xmm7, xmm4
-       unpckhps xmm3, xmm4     
-       movlps [edi + eax*4], xmm7
-       movlps [edi + ecx*4], xmm3
-       movhps [edi + ebx*4], xmm7
-       movhps [edi + edx*4], xmm3
-       ; finally z forces 
-       movss  xmm0, [edi + eax*4 + 8]
-       movss  xmm1, [edi + ebx*4 + 8]
-       movss  xmm3, [edi + ecx*4 + 8]
-       movss  xmm4, [edi + edx*4 + 8]
-       subss  xmm0, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm1, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm3, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm4, xmm2
-       movss  [edi + eax*4 + 8], xmm0
-       movss  [edi + ebx*4 + 8], xmm1
-       movss  [edi + ecx*4 + 8], xmm3
-       movss  [edi + edx*4 + 8], xmm4
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .odd_inner
-       jmp   .unroll_loop
-.odd_inner:    
-       add   [esp + .innerk], dword 4
-       jnz   .odd_loop
-       jmp   .updateouterdata
-.odd_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       xorps xmm4, xmm4
-       movss xmm4, [esp + .iqO]
-       mov esi, [ebp + %$charge] 
-       movhps xmm4, [esp + .iqH]     
-       movss xmm3, [esi + eax*4]       ; charge in xmm3
-       shufps xmm3, xmm3, 0b
-       mulps xmm3, xmm4
-       movaps [esp + .qqO], xmm3       ; use oxygen qq for storage.
-
-       xorps xmm6, xmm6
-       mov esi, [ebp + %$type]
-       mov ebx, [esi + eax*4]
-       mov esi, [ebp + %$nbfp]
-       shl ebx, 1      
-       add ebx, [esp + .ntia]
-       movlps xmm6, [esi + ebx*4]
-       movaps xmm7, xmm6
-       shufps xmm6, xmm6, 11111100b
-       shufps xmm7, xmm7, 11111101b
-       movaps [esp + .c6], xmm6
-       movaps [esp + .c12], xmm7
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-       
-       ; move j coords to xmm0-xmm2
-       movss xmm0, [esi + eax*4]
-       movss xmm1, [esi + eax*4 + 4]
-       movss xmm2, [esi + eax*4 + 8]
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       
-       movss xmm3, [esp + .ixO]
-       movss xmm4, [esp + .iyO]
-       movss xmm5, [esp + .izO]
-               
-       movlps xmm6, [esp + .ixH1]
-       movlps xmm7, [esp + .ixH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm3, xmm6
-       movlps xmm6, [esp + .iyH1]
-       movlps xmm7, [esp + .iyH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm4, xmm6
-       movlps xmm6, [esp + .izH1]
-       movlps xmm7, [esp + .izH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm5, xmm6
-
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       
-       movaps [esp + .dxO], xmm3
-       movaps [esp + .dyO], xmm4
-       movaps [esp + .dzO], xmm5
-
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       ; rsq in xmm4.
-
-       movaps xmm0, xmm4
-       mulps xmm0, [esp + .krf]
-       movaps [esp + .krsqO], xmm0
-       
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-       movaps xmm1, xmm4
-       mulss  xmm1, xmm4
-       mulss  xmm1, xmm4       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulss  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subss  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulss  xmm1, [esp + .six]
-       mulss  xmm2, [esp + .twelve]
-       subss  xmm2, xmm1
-
-       movaps xmm1, xmm0       ; xmm1=rinv
-       movaps xmm3, [esp + .krsqO]
-       addps  xmm0, xmm3       ; xmm0=rinv+krsq
-       mulps  xmm3, [esp + .two]
-       subps  xmm0, [esp + .crf] ;  xmm0=rinv+krsq-crf
-       subps  xmm1, xmm3       ; xmm1=rinv-2*krsq
-       mulps  xmm0, [esp + .qqO]       ; xmm0=vcoul
-       mulps  xmm1, [esp + .qqO]       ; xmm1=coul part of fs
-
-       addps xmm2, xmm1        ;  total fs
-       
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       addps  xmm0, [esp + .vctot]
-       movaps [esp + .vctot], xmm0
-       
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       movss  xmm3, [esp + .fixO]      
-       movss  xmm4, [esp + .fiyO]      
-       movss  xmm5, [esp + .fizO]      
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esp + .fixO], xmm3      
-       movss  [esp + .fiyO], xmm4      
-       movss  [esp + .fizO], xmm5      ; updated the O force. now do the H's
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       shufps xmm3, xmm3, 11100110b    ; shift right
-       shufps xmm4, xmm4, 11100110b
-       shufps xmm5, xmm5, 11100110b
-       addss  xmm3, [esp + .fixH1]
-       addss  xmm4, [esp + .fiyH1]
-       addss  xmm5, [esp + .fizH1]
-       movss  [esp + .fixH1], xmm3     
-       movss  [esp + .fiyH1], xmm4     
-       movss  [esp + .fizH1], xmm5     ; updated the H1 force. 
-
-       mov edi, [ebp + %$faction]
-       shufps xmm3, xmm3, 11100111b    ; shift right
-       shufps xmm4, xmm4, 11100111b
-       shufps xmm5, xmm5, 11100111b
-       addss  xmm3, [esp + .fixH2]
-       addss  xmm4, [esp + .fiyH2]
-       addss  xmm5, [esp + .fizH2]
-       movss  [esp + .fixH2], xmm3     
-       movss  [esp + .fiyH2], xmm4     
-       movss  [esp + .fizH2], xmm5     ; updated the H2 force. 
-
-       ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
-       xorps  xmm5, xmm5
-       movaps xmm3, xmm0
-       movlps xmm6, [edi + eax*4]
-       movss  xmm7, [edi + eax*4 + 8]
-       unpcklps xmm3, xmm1
-       movlhps  xmm3, xmm5     
-       unpckhps xmm0, xmm1             
-       addps    xmm0, xmm3
-       movhlps  xmm3, xmm0     
-       addps    xmm0, xmm3     ; x,y sum in xmm0
-
-       movhlps  xmm1, xmm2
-       addss    xmm2, xmm1
-       shufps   xmm1, xmm1, 1b 
-       addss    xmm2, xmm1    ; z sum in xmm2
-       subps    xmm6, xmm0
-       subss    xmm7, xmm2
-       
-       movlps [edi + eax*4],     xmm6
-       movss  [edi + eax*4 + 8], xmm7
-
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .odd_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO]
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       mov   edx, [ebp + %$gid]  
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4    
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-        
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 792
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-       
-proc inl2130_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$krf          arg
-%$crf          arg
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.jxO        equ   144
-.jyO        equ   160
-.jzO         equ   176
-.jxH1       equ   192
-.jyH1       equ   208
-.jzH1        equ   224
-.jxH2       equ   240
-.jyH2       equ   256
-.jzH2        equ   272
-.dxOO        equ   288
-.dyOO        equ   304
-.dzOO        equ   320 
-.dxOH1       equ   336
-.dyOH1       equ   352
-.dzOH1       equ   368 
-.dxOH2       equ   384
-.dyOH2       equ   400
-.dzOH2       equ   416 
-.dxH1O       equ   432
-.dyH1O       equ   448
-.dzH1O       equ   464 
-.dxH1H1      equ   480
-.dyH1H1      equ   496
-.dzH1H1      equ   512 
-.dxH1H2      equ   528
-.dyH1H2      equ   544
-.dzH1H2      equ   560 
-.dxH2O       equ   576
-.dyH2O       equ   592
-.dzH2O       equ   608 
-.dxH2H1      equ   624
-.dyH2H1      equ   640
-.dzH2H1      equ   656 
-.dxH2H2      equ   672
-.dyH2H2      equ   688
-.dzH2H2      equ   704
-.qqOO        equ   720
-.qqOH        equ   736
-.qqHH        equ   752
-.c6          equ   768
-.c12         equ   784
-.six         equ   800
-.twelve      equ   816          
-.vctot       equ   832
-.vnbtot      equ   848
-.fixO        equ   864
-.fiyO        equ   880
-.fizO        equ   896
-.fixH1       equ   912
-.fiyH1       equ   928
-.fizH1       equ   944
-.fixH2       equ   960
-.fiyH2       equ   976
-.fizH2       equ   992
-.fjxO       equ  1008
-.fjyO        equ  1024
-.fjzO        equ  1040
-.fjxH1      equ  1056
-.fjyH1       equ  1072
-.fjzH1       equ  1088
-.fjxH2      equ  1104
-.fjyH2       equ  1120
-.fjzH2       equ  1136
-.half        equ  1152
-.three       equ  1168
-.rsqOO       equ  1184
-.rsqOH1      equ  1200
-.rsqOH2      equ  1216
-.rsqH1O      equ  1232
-.rsqH1H1     equ  1248
-.rsqH1H2     equ  1264
-.rsqH2O      equ  1280
-.rsqH2H1     equ  1296
-.rsqH2H2     equ  1312
-.rinvOO      equ  1328
-.rinvOH1     equ  1344
-.rinvOH2     equ  1360
-.rinvH1O     equ  1376
-.rinvH1H1    equ  1392
-.rinvH1H2    equ  1408
-.rinvH2O     equ  1424
-.rinvH2H1    equ  1440
-.rinvH2H2    equ  1456
-.two         equ  1472
-.krf        equ  1488  
-.crf        equ  1504
-.is3         equ  1520
-.ii3         equ  1524
-.innerjjnr   equ  1528
-.innerk      equ  1532
-.salign             equ  1536                                                  
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 1540           ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_three]
-       movups xmm2, [sse_six]
-       movups xmm3, [sse_twelve]
-       movups xmm4, [sse_two]
-       movss xmm5, [ebp + %$krf]
-       movss xmm6, [ebp + %$crf]
-       
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-       movaps [esp + .six],  xmm2
-       movaps [esp + .twelve], xmm3
-       movaps [esp + .two], xmm4
-       shufps xmm5, xmm5, 0b
-       shufps xmm6, xmm6, 0b
-       movaps [esp + .krf], xmm5
-       movaps [esp + .crf], xmm6
-       
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, xmm3        
-       movss xmm5, [edx + ebx*4 + 4]   
-       movss xmm6, [ebp + %$facel]
-       mulss  xmm3, xmm3
-       mulss  xmm4, xmm5
-       mulss  xmm5, xmm5
-       mulss  xmm3, xmm6
-       mulss  xmm4, xmm6
-       mulss  xmm5, xmm6
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .qqOO], xmm3
-       movaps [esp + .qqOH], xmm4
-       movaps [esp + .qqHH], xmm5
-               
-       xorps xmm0, xmm0
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       mov   edx, ecx
-       imul  ecx, [ebp + %$ntype]      ; ecx = ntia = 2*ntype*type[ii0]
-       add   edx, ecx
-       mov   eax, [ebp + %$nbfp]
-       movlps xmm0, [eax + edx*4] 
-       movaps xmm1, xmm0
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 01010101b
-       movaps [esp + .c6], xmm0
-       movaps [esp + .c12], xmm1
-
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx 
-       
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-
-       ; clear vctot, vnbtot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .single_check
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4] 
-       mov   ecx, [edx + 8]
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-       
-       ; move j coordinates to local temp. variables
-       movlps xmm2, [esi + eax*4]
-       movlps xmm3, [esi + eax*4 + 12]
-       movlps xmm4, [esi + eax*4 + 24]
-
-       movlps xmm5, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 12]
-       movlps xmm7, [esi + ebx*4 + 24]
-
-       movhps xmm2, [esi + ecx*4]
-       movhps xmm3, [esi + ecx*4 + 12]
-       movhps xmm4, [esi + ecx*4 + 24]
-
-       movhps xmm5, [esi + edx*4]
-       movhps xmm6, [esi + edx*4 + 12]
-       movhps xmm7, [esi + edx*4 + 24]
-
-       ;; current state:       
-       ;;  xmm2= jxOa  jyOa  jxOc  jyOc
-       ;;  xmm3= jxH1a jyH1a jxH1c jyH1c
-       ;;  xmm4= jxH2a jyH2a jxH2c jyH2c
-       ;;  xmm5= jxOb  jyOb  jxOd  jyOd
-       ;;  xmm6= jxH1b jyH1b jxH1d jyH1d
-       ;;  xmm7= jxH2b jyH2b jxH2d jyH2d
-       
-       movaps xmm0, xmm2
-       movaps xmm1, xmm3
-       unpcklps xmm0, xmm5     ; xmm0= jxOa  jxOb  jyOa  jyOb
-       unpcklps xmm1, xmm6     ; xmm1= jxH1a jxH1b jyH1a jyH1b
-       unpckhps xmm2, xmm5     ; xmm2= jxOc  jxOd  jyOc  jyOd
-       unpckhps xmm3, xmm6     ; xmm3= jxH1c jxH1d jyH1c jyH1d 
-       movaps xmm5, xmm4
-       movaps   xmm6, xmm0
-       unpcklps xmm4, xmm7     ; xmm4= jxH2a jxH2b jyH2a jyH2b         
-       unpckhps xmm5, xmm7     ; xmm5= jxH2c jxH2d jyH2c jyH2d
-       movaps   xmm7, xmm1
-       movlhps  xmm0, xmm2     ; xmm0= jxOa  jxOb  jxOc  jxOd 
-       movaps [esp + .jxO], xmm0
-       movhlps  xmm2, xmm6     ; xmm2= jyOa  jyOb  jyOc  jyOd
-       movaps [esp + .jyO], xmm2
-       movlhps  xmm1, xmm3
-       movaps [esp + .jxH1], xmm1
-       movhlps  xmm3, xmm7
-       movaps   xmm6, xmm4
-       movaps [esp + .jyH1], xmm3
-       movlhps  xmm4, xmm5
-       movaps [esp + .jxH2], xmm4
-       movhlps  xmm5, xmm6
-       movaps [esp + .jyH2], xmm5
-
-       movss  xmm0, [esi + eax*4 + 8]
-       movss  xmm1, [esi + eax*4 + 20]
-       movss  xmm2, [esi + eax*4 + 32]
-
-       movss  xmm3, [esi + ecx*4 + 8]
-       movss  xmm4, [esi + ecx*4 + 20]
-       movss  xmm5, [esi + ecx*4 + 32]
-
-       movhps xmm0, [esi + ebx*4 + 4]
-       movhps xmm1, [esi + ebx*4 + 16]
-       movhps xmm2, [esi + ebx*4 + 28]
-       
-       movhps xmm3, [esi + edx*4 + 4]
-       movhps xmm4, [esi + edx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 28]
-       
-       shufps xmm0, xmm3, 11001100b
-       shufps xmm1, xmm4, 11001100b
-       shufps xmm2, xmm5, 11001100b
-       movaps [esp + .jzO],  xmm0
-       movaps [esp + .jzH1],  xmm1
-       movaps [esp + .jzH2],  xmm2
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixO]
-       movaps xmm4, [esp + .iyO]
-       movaps xmm5, [esp + .izO]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxOH1], xmm3
-       movaps [esp + .dyOH1], xmm4
-       movaps [esp + .dzOH1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOO], xmm0
-       movaps [esp + .rsqOH1], xmm3
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       subps  xmm3, [esp + .jxO]
-       subps  xmm4, [esp + .jyO]
-       subps  xmm5, [esp + .jzO]
-       movaps [esp + .dxOH2], xmm0
-       movaps [esp + .dyOH2], xmm1
-       movaps [esp + .dzOH2], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1O], xmm3
-       movaps [esp + .dyH1O], xmm4
-       movaps [esp + .dzH1O], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOH2], xmm0
-       movaps [esp + .rsqH1O], xmm3
-
-       movaps xmm0, [esp + .ixH1]
-       movaps xmm1, [esp + .iyH1]
-       movaps xmm2, [esp + .izH1]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH1]
-       subps  xmm1, [esp + .jyH1]
-       subps  xmm2, [esp + .jzH1]
-       subps  xmm3, [esp + .jxH2]
-       subps  xmm4, [esp + .jyH2]
-       subps  xmm5, [esp + .jzH2]
-       movaps [esp + .dxH1H1], xmm0
-       movaps [esp + .dyH1H1], xmm1
-       movaps [esp + .dzH1H1], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1H2], xmm3
-       movaps [esp + .dyH1H2], xmm4
-       movaps [esp + .dzH1H2], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqH1H1], xmm0
-       movaps [esp + .rsqH1H2], xmm3
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxH2O], xmm0
-       movaps [esp + .dyH2O], xmm1
-       movaps [esp + .dzH2O], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH2H1], xmm3
-       movaps [esp + .dyH2H1], xmm4
-       movaps [esp + .dzH2H1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       movaps [esp + .rsqH2O], xmm0
-       movaps [esp + .rsqH2H1], xmm4
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       movaps [esp + .dxH2H2], xmm0
-       movaps [esp + .dyH2H2], xmm1
-       movaps [esp + .dzH2H2], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2
-       movaps [esp + .rsqH2H2], xmm0
-               
-       ; start doing invsqrt. use rsq values in xmm0, xmm4
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinvH2H2
-       mulps   xmm7, [esp + .half] ; rinvH2H1
-       movaps  [esp + .rinvH2H2], xmm3
-       movaps  [esp + .rinvH2H1], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqOO]
-       rsqrtps xmm5, [esp + .rsqOH1]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOO]
-       mulps   xmm5, [esp + .rsqOH1]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOO], xmm3
-       movaps  [esp + .rinvOH1], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqOH2]
-       rsqrtps xmm5, [esp + .rsqH1O]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOH2]
-       mulps   xmm5, [esp + .rsqH1O]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOH2], xmm3
-       movaps  [esp + .rinvH1O], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH1H1]
-       rsqrtps xmm5, [esp + .rsqH1H2]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqH1H1]
-       mulps   xmm5, [esp + .rsqH1H2]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvH1H1], xmm3
-       movaps  [esp + .rinvH1H2], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH2O]
-       movaps  xmm2, xmm1
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, [esp + .rsqH2O]
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2
-       mulps   xmm3, [esp + .half] 
-       movaps  [esp + .rinvH2O], xmm3
-
-       ;; start with OO interaction.
-       movaps xmm0, [esp + .rinvOO]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]
-       mulps  xmm0, xmm0
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ; xmm1=rinvsix
-       mulps  xmm5, [esp + .rsqOO] ;  xmm5=krsq
-       movaps xmm6, xmm5
-       addps  xmm6, xmm7       ;  xmm6=rinv+krsq
-       subps  xmm6, [esp + .crf]
-       
-       mulps  xmm6, [esp + .qqOO] ;  xmm6=voul=qq*(rinv+krsq-crf)
-       mulps xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqOO] ; xmm7 = coul part of fscal
-       
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=rinvtwelve
-       mulps  xmm1, [esp + .c6]        
-       mulps  xmm2, [esp + .c12]       
-       movaps xmm3, xmm2
-       subps  xmm3, xmm1       ; xmm3=vnb12-vnb6
-       addps  xmm3, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       movaps [esp + .vnbtot], xmm3
-       subps  xmm2, xmm1
-       addps  xmm2, xmm7
-       addps  xmm6, [esp + .vctot] ;  local vctot summation variable
-       mulps  xmm0, xmm2
-       
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOO]
-       mulps xmm1, [esp + .dyOO]
-       mulps xmm2, [esp + .dzOO]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H1 interaction
-       movaps xmm0, [esp + .rinvOH1]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqOH1] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       mulps  xmm0, xmm0
-       subps  xmm4, [esp + .crf]
-       mulps  xmm4, [esp + .qqOH] ;  xmm4=voul=qq*(rinv+krsq)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH1 
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH1]
-       mulps xmm1, [esp + .dyOH1]
-       mulps xmm2, [esp + .dzOH1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H2 interaction
-       movaps xmm0, [esp + .rinvOH2]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqOH2] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       mulps xmm0, xmm0
-       subps  xmm4, [esp + .crf]
-       mulps  xmm4, [esp + .qqOH] ;  xmm4=voul=qq*(rinv+krsq)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH2]
-       mulps xmm1, [esp + .dyOH2]
-       mulps xmm2, [esp + .dzOH2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; H1-O interaction
-       movaps xmm0, [esp + .rinvH1O]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH1O] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       mulps xmm0, xmm0
-       subps  xmm4, [esp + .crf]
-       mulps  xmm4, [esp + .qqOH] ;  xmm4=voul=qq*(rinv+krsq)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH1O]
-       mulps xmm1, [esp + .dyH1O]
-       mulps xmm2, [esp + .dzH1O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H1 interaction
-       movaps xmm0, [esp + .rinvH1H1]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH1H1] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps xmm0, xmm0
-       mulps  xmm4, [esp + .qqHH] ;  xmm4=voul=qq*(rinv+krsq)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH1H1]
-       mulps xmm1, [esp + .dyH1H1]
-       mulps xmm2, [esp + .dzH1H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H2 interaction
-       movaps xmm0, [esp + .rinvH1H2]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH1H2] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       mulps xmm0, xmm0
-       subps  xmm4, [esp + .crf]
-       mulps  xmm4, [esp + .qqHH] ;  xmm4=voul=qq*(rinv+krsq)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH1H2]
-       mulps xmm1, [esp + .dyH1H2]
-       mulps xmm2, [esp + .dzH1H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H2-O interaction
-       movaps xmm0, [esp + .rinvH2O]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH2O] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps xmm0, xmm0
-       mulps  xmm4, [esp + .qqOH] ;  xmm4=voul=qq*(rinv+krsq)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH2O]
-       mulps xmm1, [esp + .dyH2O]
-       mulps xmm2, [esp + .dzH2O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H1 interaction
-       movaps xmm0, [esp + .rinvH2H1]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH2H1] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps xmm0, xmm0
-       mulps  xmm4, [esp + .qqHH] ;  xmm4=voul=qq*(rinv+krsq)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH2H1]
-       mulps xmm1, [esp + .dyH2H1]
-       mulps xmm2, [esp + .dzH2H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H2 interaction
-       movaps xmm0, [esp + .rinvH2H2]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH2H2] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps xmm0, xmm0
-       mulps  xmm4, [esp + .qqHH] ;  xmm4=voul=qq*(rinv+krsq)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       movaps xmm1, xmm0
-       movaps [esp + .vctot], xmm6
-       movaps xmm2, xmm0
-       
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH2H2]
-       mulps xmm1, [esp + .dyH2H2]
-       mulps xmm2, [esp + .dzH2H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       mov edi, [ebp +%$faction]
-               
-       ; Did all interactions - now update j forces.
-       ; 4 j waters with three atoms each - first do a & b j particles
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd 
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpcklps xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjxOb  fjyOb
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOb  fjyOb
-       unpcklps xmm1, xmm2        ; xmm1= fjzOa  fjxH1a fjzOb  fjxH1b
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpcklps xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1b fjzH1b
-       unpcklps xmm5, xmm6        ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
-       movlhps  xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjzOa  fjxH1a  
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOb  fjyOb  fjzOb  fjxH1b
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
-       movups   xmm1, [edi + eax*4]
-       movups   xmm2, [edi + eax*4 + 16]
-       movups   xmm5, [edi + ebx*4]
-       movups   xmm6, [edi + ebx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + eax*4 + 32]
-       movss    xmm3, [edi + ebx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm7, xmm7, 1b
-       
-       movups   [edi + eax*4],     xmm1
-       movups   [edi + eax*4 + 16],xmm2
-       movups   [edi + ebx*4],     xmm5
-       movups   [edi + ebx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + eax*4 + 32], xmm0
-       movss    [edi + ebx*4 + 32], xmm3       
-
-       ;; then do the second pair (c & d)
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpckhps xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjxOd  fjyOd
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOd  fjyOd
-       unpckhps xmm1, xmm2        ; xmm1= fjzOc  fjxH1c fjzOd  fjxH1d
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpckhps xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d  
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1d fjzH1d         
-       unpckhps xmm5, xmm6        ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
-       movlhps  xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjzOc  fjxH1c 
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOd  fjyOd  fjzOd  fjxH1d
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c 
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
-       movups   xmm1, [edi + ecx*4]
-       movups   xmm2, [edi + ecx*4 + 16]
-       movups   xmm5, [edi + edx*4]
-       movups   xmm6, [edi + edx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + ecx*4 + 32]
-       movss    xmm3, [edi + edx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm4, xmm4, 10b
-       shufps   xmm7, xmm7, 11b
-       movups   [edi + ecx*4],     xmm1
-       movups   [edi + ecx*4 + 16],xmm2
-       movups   [edi + edx*4],     xmm5
-       movups   [edi + edx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + ecx*4 + 32], xmm0
-       movss    [edi + edx*4 + 32], xmm3       
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .single_check
-       jmp   .unroll_loop
-.single_check:
-       add   [esp + .innerk], dword 4
-       jnz   .single_loop
-       jmp   .updateouterdata
-.single_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-
-       ; fetch j coordinates
-       xorps xmm3, xmm3
-       xorps xmm4, xmm4
-       xorps xmm5, xmm5
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + eax*4 + 4]
-       movss xmm5, [esi + eax*4 + 8]
-
-       movlps xmm6, [esi + eax*4 + 12]
-       movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
-       ;;  fetch both z coords in one go, to positions 0 and 3 in xmm7
-       movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
-       shufps xmm6, xmm6, 11011000b    ;  xmm6=jxH1 jxH2 jyH1 jyH2
-       movlhps xmm3, xmm6              ; xmm3= jxO   0  jxH1 jxH2 
-       movaps  xmm0, [esp + .ixO]     
-       movaps  xmm1, [esp + .iyO]
-       movaps  xmm2, [esp + .izO]      
-       shufps  xmm4, xmm6, 11100100b ;  xmm4= jyO   0   jyH1 jyH2
-       shufps xmm5, xmm7, 11000100b  ;  xmm5= jzO   0   jzH1 jzH2  
-       ;;  store all j coordinates in jO
-       movaps [esp + .jxO], xmm3
-       movaps [esp + .jyO], xmm4
-       movaps [esp + .jzO], xmm5
-       subps  xmm0, xmm3
-       subps  xmm1, xmm4
-       subps  xmm2, xmm5
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2        ;  have rsq in xmm0.
-
-       movaps xmm6, xmm0
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       mulps   xmm6, [esp + .krf] ; xmm6=krsq
-       movaps  xmm2, xmm1
-       movaps  xmm7, xmm6
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, xmm0
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2                                                      
-       mulps   xmm3, [esp + .half] ; rinv iO - j water
-
-       addps   xmm6, xmm3      ;  xmm6=rinv+krsq
-       mulps   xmm7, [esp + .two]
-       subps  xmm6, [esp + .crf]       ;  xmm6=rinv+krsq-crf
-       
-       xorps   xmm1, xmm1
-       movaps  xmm0, xmm3
-       subps   xmm3, xmm7      ; xmm3=rinv-2*krsq
-       xorps   xmm4, xmm4
-       mulps   xmm0, xmm0      ; xmm0=rinvsq
-       ;; fetch charges to xmm4 (temporary)
-       movss   xmm4, [esp + .qqOO]
-       movss   xmm1, xmm0
-       movhps  xmm4, [esp + .qqOH]
-       mulss   xmm1, xmm0
-
-       mulps xmm6, xmm4        ;  vcoul 
-       mulps xmm3, xmm4        ;  coul part of fs
-       
-       mulss   xmm1, xmm0      ;  xmm1(0)=rinvsix
-       movaps  xmm2, xmm1      ;  zero everything else in xmm2
-       mulss   xmm2, xmm2      ;  xmm2=rinvtwelve
-
-       mulss   xmm1, [esp + .c6]
-       mulss   xmm2, [esp + .c12]
-       movaps  xmm4, xmm2
-       subss   xmm4, xmm1      ;  vnbtot=vnb12-vnb6
-       addps   xmm4, [esp + .vnbtot]
-       mulss   xmm1, [esp + .six]
-       mulss   xmm2, [esp + .twelve]   
-       movaps  [esp + .vnbtot], xmm4
-       subss   xmm2, xmm1      ; fsD+fsR
-       addps   xmm2, xmm3      ; fsC+fsD+fsR
-
-       addps   xmm6, [esp + .vctot]
-       mulps   xmm0, xmm2      ;  total fscal
-       movaps  [esp + .vctot], xmm6    
-
-       movaps  xmm1, xmm0
-       movaps  xmm2, xmm0
-       mulps   xmm0, [esp + .dxOO]
-       mulps   xmm1, [esp + .dyOO]
-       mulps   xmm2, [esp + .dzOO]
-       
-       ;; initial update for j forces
-       xorps   xmm3, xmm3
-       xorps   xmm4, xmm4
-       xorps   xmm5, xmm5
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixO]
-       addps   xmm1, [esp + .fiyO]
-       addps   xmm2, [esp + .fizO]
-       movaps  [esp + .fixO], xmm0
-       movaps  [esp + .fiyO], xmm1
-       movaps  [esp + .fizO], xmm2
-
-       
-       ;;  done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
-       movaps  xmm0, [esp + .ixH1]
-       movaps  xmm1, [esp + .iyH1]
-       movaps  xmm2, [esp + .izH1]     
-       movaps  xmm3, [esp + .ixH2] 
-       movaps  xmm4, [esp + .iyH2] 
-       movaps  xmm5, [esp + .izH2] 
-       subps   xmm0, [esp + .jxO]
-       subps   xmm1, [esp + .jyO]
-       subps   xmm2, [esp + .jzO]
-       subps   xmm3, [esp + .jxO]
-       subps   xmm4, [esp + .jyO]
-       subps   xmm5, [esp + .jzO]
-       movaps [esp + .dxH1O], xmm0
-       movaps [esp + .dyH1O], xmm1
-       movaps [esp + .dzH1O], xmm2
-       movaps [esp + .dxH2O], xmm3
-       movaps [esp + .dyH2O], xmm4
-       movaps [esp + .dzH2O], xmm5
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       mulps xmm3, xmm3
-       mulps xmm4, xmm4
-       mulps xmm5, xmm5
-       addps xmm0, xmm1
-       addps xmm4, xmm3
-       addps xmm0, xmm2        ;  have rsqH1 in xmm0.
-       addps xmm4, xmm5        ;  have rsqH2 in xmm4.
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinv H1 - j water
-       mulps   xmm7, [esp + .half] ; rinv H2 - j water
-
-       mulps xmm0, [esp + .krf] ;  krsq
-       mulps xmm4, [esp + .krf] ;  krsq 
-
-
-       ;; assemble charges in xmm6
-       xorps   xmm6, xmm6
-       movss   xmm6, [esp + .qqOH]
-       movhps  xmm6, [esp + .qqHH]
-       movaps  xmm1, xmm0
-       movaps  xmm5, xmm4
-       addps   xmm0, xmm3      ; krsq+rinv
-       addps   xmm4, xmm7      ; krsq+rinv
-       subps xmm0, [esp + .crf]
-       subps xmm4, [esp + .crf]
-       mulps   xmm1, [esp + .two]
-       mulps   xmm5, [esp + .two]
-       mulps   xmm0, xmm6      ;  vcoul
-       mulps   xmm4, xmm6      ;  vcoul
-       addps   xmm4, xmm0              
-       addps   xmm4, [esp + .vctot]
-       movaps  [esp + .vctot], xmm4
-       movaps  xmm0, xmm3
-       movaps  xmm4, xmm7
-       mulps   xmm3, xmm3
-       mulps   xmm7, xmm7
-       subps   xmm0, xmm1
-       subps   xmm4, xmm5
-       mulps   xmm0, xmm6
-       mulps   xmm4, xmm6
-       mulps   xmm0, xmm3      ;  fscal
-       mulps   xmm7, xmm4      ;  fscal
-       
-       movaps  xmm1, xmm0
-       movaps  xmm2, xmm0
-       mulps   xmm0, [esp + .dxH1O]
-       mulps   xmm1, [esp + .dyH1O]
-       mulps   xmm2, [esp + .dzH1O]
-       ;;  update forces H1 - j water
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH1]
-       addps   xmm1, [esp + .fiyH1]
-       addps   xmm2, [esp + .fizH1]
-       movaps  [esp + .fixH1], xmm0
-       movaps  [esp + .fiyH1], xmm1
-       movaps  [esp + .fizH1], xmm2
-       ;; do forces H2 - j water
-       movaps xmm0, xmm7
-       movaps xmm1, xmm7
-       movaps xmm2, xmm7
-       mulps   xmm0, [esp + .dxH2O]
-       mulps   xmm1, [esp + .dyH2O]
-       mulps   xmm2, [esp + .dzH2O]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       mov     esi, [ebp + %$faction]
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH2]
-       addps   xmm1, [esp + .fiyH2]
-       addps   xmm2, [esp + .fizH2]
-       movaps  [esp + .fixH2], xmm0
-       movaps  [esp + .fiyH2], xmm1
-       movaps  [esp + .fizH2], xmm2
-
-       ;; update j water forces from local variables
-       movlps  xmm0, [esi + eax*4]
-       movlps  xmm1, [esi + eax*4 + 12]
-       movhps  xmm1, [esi + eax*4 + 24]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       movaps  xmm6, xmm5
-       movaps  xmm7, xmm5
-       shufps  xmm6, xmm6, 10b
-       shufps  xmm7, xmm7, 11b
-       addss   xmm5, [esi + eax*4 + 8]
-       addss   xmm6, [esi + eax*4 + 20]
-       addss   xmm7, [esi + eax*4 + 32]
-       movss   [esi + eax*4 + 8], xmm5
-       movss   [esi + eax*4 + 20], xmm6
-       movss   [esi + eax*4 + 32], xmm7
-       movaps   xmm5, xmm3
-       unpcklps xmm3, xmm4
-       unpckhps xmm5, xmm4
-       addps    xmm0, xmm3
-       addps    xmm1, xmm5
-       movlps  [esi + eax*4], xmm0 
-       movlps  [esi + eax*4 + 12], xmm1 
-       movhps  [esi + eax*4 + 24], xmm1 
-       
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .single_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO] 
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
- 
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 1540
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-       
-
-proc inl2020_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$krf          arg     
-%$crf          arg     
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.iqO         equ   144 
-.iqH         equ   160 
-.dxO         equ   176
-.dyO         equ   192
-.dzO         equ   208 
-.dxH1        equ   224
-.dyH1        equ   240
-.dzH1        equ   256 
-.dxH2        equ   272
-.dyH2        equ   288
-.dzH2        equ   304 
-.qqO         equ   320
-.qqH         equ   336
-.vctot       equ   352
-.fixO        equ   384
-.fiyO        equ   400
-.fizO        equ   416
-.fixH1       equ   432
-.fiyH1       equ   448
-.fizH1       equ   464
-.fixH2       equ   480
-.fiyH2       equ   496
-.fizH2       equ   512
-.fjx        equ   528
-.fjy         equ   544
-.fjz         equ   560
-.half        equ   576
-.three       equ   592
-.two        equ   608
-.krf        equ   624
-.crf        equ   640
-.krsqO       equ   656
-.krsqH1      equ   672
-.krsqH2             equ   688                  
-.is3         equ   704
-.ii3         equ   708
-.innerjjnr   equ   712
-.innerk      equ   716
-.salign             equ   720                                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 724            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_three]
-       movups xmm4, [sse_two]
-       movss xmm5, [ebp + %$krf]
-       movss xmm6, [ebp + %$crf]
-
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-       movaps [esp + .two], xmm4
-       shufps xmm5, xmm5, 0b
-       shufps xmm6, xmm6, 0b
-       movaps [esp + .krf], xmm5
-       movaps [esp + .crf], xmm6
-       
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, [edx + ebx*4 + 4]   
-       movss xmm5, [ebp + %$facel]
-       mulss  xmm3, xmm5
-       mulss  xmm4, xmm5
-
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       movaps [esp + .iqO], xmm3
-       movaps [esp + .iqH], xmm4
-                       
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .odd_inner
-.unroll_loop:
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movaps xmm4, xmm3            ;  and in xmm4
-       mulps  xmm3, [esp + .iqO]
-       mulps  xmm4, [esp + .iqH]
-
-       movaps  [esp + .qqO], xmm3
-       movaps  [esp + .qqH], xmm4
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ixO-izO to xmm4-xmm6
-       movaps xmm4, [esp + .ixO]
-       movaps xmm5, [esp + .iyO]
-       movaps xmm6, [esp + .izO]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxO], xmm4
-       movaps [esp + .dyO], xmm5
-       movaps [esp + .dzO], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       movaps xmm7, xmm4
-       ; rsqO in xmm7
-
-       ; move ixH1-izH1 to xmm4-xmm6
-       movaps xmm4, [esp + .ixH1]
-       movaps xmm5, [esp + .iyH1]
-       movaps xmm6, [esp + .izH1]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxH1], xmm4
-       movaps [esp + .dyH1], xmm5
-       movaps [esp + .dzH1], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm6, xmm5
-       addps xmm6, xmm4
-       ; rsqH1 in xmm6
-
-       ; move ixH2-izH2 to xmm3-xmm5
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-
-       ; calc dr
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-
-       ; store dr
-       movaps [esp + .dxH2], xmm3
-       movaps [esp + .dyH2], xmm4
-       movaps [esp + .dzH2], xmm5
-       ; square it
-       mulps xmm3,xmm3
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       addps xmm5, xmm4
-       addps xmm5, xmm3
-       ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
-       movaps xmm0, xmm5
-       movaps xmm1, xmm6
-       movaps xmm2, xmm7
-
-       mulps  xmm0, [esp + .krf]       
-       mulps  xmm1, [esp + .krf]       
-       mulps  xmm2, [esp + .krf]       
-
-       movaps [esp + .krsqH2], xmm0
-       movaps [esp + .krsqH1], xmm1
-       movaps [esp + .krsqO], xmm2
-       
-       ; start with rsqO - seed in xmm2        
-       rsqrtps xmm2, xmm7
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm7      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm7, xmm4      ; rinvO in xmm7
-       ; rsqH1 - seed in xmm2  
-       rsqrtps xmm2, xmm6
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm6      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm6, xmm4      ; rinvH1 in xmm6
-       ; rsqH2 - seed in xmm2  
-       rsqrtps xmm2, xmm5
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm5      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  xmm5, xmm4      ; rinvH2 in xmm5
-
-       ; do O interactions
-       movaps  xmm4, xmm7      
-       mulps   xmm4, xmm4      ; xmm7=rinv, xmm4=rinvsq
-
-       movaps xmm0, xmm7
-       movaps xmm1, [esp + .krsqO]
-       addps  xmm0, xmm1
-       subps  xmm0, [esp + .crf] ;  xmm0=rinv+krsq-crf
-       mulps  xmm1, [esp + .two]
-       subps  xmm7, xmm1
-       mulps  xmm0, [esp + .qqO]
-       mulps  xmm7, [esp + .qqO]
-
-       mulps  xmm4, xmm7       ; total fsO in xmm4
-
-       addps  xmm0, [esp + .vctot]
-       movaps [esp + .vctot], xmm0
-
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update O forces
-       movaps xmm3, [esp + .fixO]
-       movaps xmm4, [esp + .fiyO]
-       movaps xmm7, [esp + .fizO]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixO], xmm3
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm7
-       ; update j forces with water O
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ; H1 interactions
-       movaps  xmm4, xmm6      
-       mulps   xmm4, xmm4      ; xmm6=rinv, xmm4=rinvsq
-       movaps  xmm7, xmm6
-       movaps  xmm0, [esp + .krsqH1]
-       addps   xmm6, xmm0      ; xmm6=rinv+krsq
-       subps   xmm6, [esp + .crf] ;  xmm6=rinv+krsq-crf
-       mulps   xmm0, [esp + .two]
-       subps   xmm7, xmm0      ; xmm7=rinv-2*krsq
-       mulps   xmm6, [esp + .qqH] ;  vcoul
-       mulps   xmm7, [esp + .qqH]
-       mulps  xmm4, xmm7               ; total fsH1 in xmm4
-       
-       addps  xmm6, [esp + .vctot]
-
-       movaps xmm0, [esp + .dxH1]
-       movaps xmm1, [esp + .dyH1]
-       movaps xmm2, [esp + .dzH1]
-       movaps [esp + .vctot], xmm6
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update H1 forces
-       movaps xmm3, [esp + .fixH1]
-       movaps xmm4, [esp + .fiyH1]
-       movaps xmm7, [esp + .fizH1]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH1], xmm3
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm7
-       ; update j forces with water H1
-       addps  xmm0, [esp + .fjx]
-       addps  xmm1, [esp + .fjy]
-       addps  xmm2, [esp + .fjz]
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ; H2 interactions
-       movaps  xmm4, xmm5      
-       mulps   xmm4, xmm4      ; xmm5=rinv, xmm4=rinvsq
-       movaps  xmm7, xmm5
-       movaps  xmm0, [esp + .krsqH2]
-       addps   xmm5, xmm0      ; xmm6=rinv+krsq
-       subps   xmm5, [esp + .crf] ;  xmm5=rinv+krsq-crf
-       mulps   xmm0, [esp + .two]
-       subps   xmm7, xmm0      ; xmm7=rinv-2*krsq
-       mulps   xmm5, [esp + .qqH] ;  vcoul
-       mulps   xmm7, [esp + .qqH]
-       mulps  xmm4, xmm7               ; total fsH2 in xmm4
-       
-       addps  xmm5, [esp + .vctot]
-
-       movaps xmm0, [esp + .dxH2]
-       movaps xmm1, [esp + .dyH2]
-       movaps xmm2, [esp + .dzH2]
-       movaps [esp + .vctot], xmm5
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update H2 forces
-       movaps xmm3, [esp + .fixH2]
-       movaps xmm4, [esp + .fiyH2]
-       movaps xmm7, [esp + .fizH2]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH2], xmm3
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm7
-
-       mov edi, [ebp +%$faction]
-       ; update j forces
-       addps xmm0, [esp + .fjx]
-       addps xmm1, [esp + .fjy]
-       addps xmm2, [esp + .fjz]
-
-       movlps xmm4, [edi + eax*4]
-       movlps xmm7, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm7, [edi + edx*4]
-       
-       movaps xmm3, xmm4
-       shufps xmm3, xmm7, 10001000b
-       shufps xmm4, xmm7, 11011101b                          
-       ; xmm3 has fjx, xmm4 has fjy.
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       ; unpack the back for storing.
-       movaps xmm7, xmm3
-       unpcklps xmm7, xmm4
-       unpckhps xmm3, xmm4     
-       movlps [edi + eax*4], xmm7
-       movlps [edi + ecx*4], xmm3
-       movhps [edi + ebx*4], xmm7
-       movhps [edi + edx*4], xmm3
-       ; finally z forces 
-       movss  xmm0, [edi + eax*4 + 8]
-       movss  xmm1, [edi + ebx*4 + 8]
-       movss  xmm3, [edi + ecx*4 + 8]
-       movss  xmm4, [edi + edx*4 + 8]
-       subss  xmm0, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm1, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm3, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm4, xmm2
-       movss  [edi + eax*4 + 8], xmm0
-       movss  [edi + ebx*4 + 8], xmm1
-       movss  [edi + ecx*4 + 8], xmm3
-       movss  [edi + edx*4 + 8], xmm4
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .odd_inner
-       jmp   .unroll_loop
-.odd_inner:    
-       add   [esp + .innerk], dword 4
-       jnz   .odd_loop
-       jmp   .updateouterdata
-.odd_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       xorps xmm4, xmm4
-       movss xmm4, [esp + .iqO]
-       mov esi, [ebp + %$charge] 
-       movhps xmm4, [esp + .iqH]     
-       movss xmm3, [esi + eax*4]       ; charge in xmm3
-       shufps xmm3, xmm3, 0b
-       mulps xmm3, xmm4
-       movaps [esp + .qqO], xmm3       ; use oxygen qq for storage.
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-       
-       ; move j coords to xmm0-xmm2
-       movss xmm0, [esi + eax*4]
-       movss xmm1, [esi + eax*4 + 4]
-       movss xmm2, [esi + eax*4 + 8]
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       
-       movss xmm3, [esp + .ixO]
-       movss xmm4, [esp + .iyO]
-       movss xmm5, [esp + .izO]
-               
-       movlps xmm6, [esp + .ixH1]
-       movlps xmm7, [esp + .ixH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm3, xmm6
-       movlps xmm6, [esp + .iyH1]
-       movlps xmm7, [esp + .iyH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm4, xmm6
-       movlps xmm6, [esp + .izH1]
-       movlps xmm7, [esp + .izH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm5, xmm6
-
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       
-       movaps [esp + .dxO], xmm3
-       movaps [esp + .dyO], xmm4
-       movaps [esp + .dzO], xmm5
-
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       ; rsq in xmm4.
-
-       movaps xmm0, xmm4
-       mulps xmm0, [esp + .krf]
-       movaps [esp + .krsqO], xmm0
-       
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm4       ; xmm4=rinvsq
-
-       movaps xmm1, xmm0       ; xmm1=rinv
-       movaps xmm3, [esp + .krsqO]
-       addps  xmm0, xmm3       ; xmm0=rinv+krsq
-       subps  xmm0, [esp + .crf] ;  xmm0=rinv+krsq-crf
-       mulps  xmm3, [esp + .two]
-       subps  xmm1, xmm3       ; xmm1=rinv-2*krsq
-       mulps  xmm0, [esp + .qqO]       ; xmm0=vcoul
-       mulps  xmm1, [esp + .qqO]       ; xmm1=coul part of fs
-
-       
-       mulps  xmm4, xmm1       ; xmm4=total fscal
-       addps  xmm0, [esp + .vctot]
-       movaps [esp + .vctot], xmm0
-       
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       movss  xmm3, [esp + .fixO]      
-       movss  xmm4, [esp + .fiyO]      
-       movss  xmm5, [esp + .fizO]      
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esp + .fixO], xmm3      
-       movss  [esp + .fiyO], xmm4      
-       movss  [esp + .fizO], xmm5      ; updated the O force. now do the H's
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       shufps xmm3, xmm3, 11100110b    ; shift right
-       shufps xmm4, xmm4, 11100110b
-       shufps xmm5, xmm5, 11100110b
-       addss  xmm3, [esp + .fixH1]
-       addss  xmm4, [esp + .fiyH1]
-       addss  xmm5, [esp + .fizH1]
-       movss  [esp + .fixH1], xmm3     
-       movss  [esp + .fiyH1], xmm4     
-       movss  [esp + .fizH1], xmm5     ; updated the H1 force. 
-
-       mov edi, [ebp + %$faction]
-       shufps xmm3, xmm3, 11100111b    ; shift right
-       shufps xmm4, xmm4, 11100111b
-       shufps xmm5, xmm5, 11100111b
-       addss  xmm3, [esp + .fixH2]
-       addss  xmm4, [esp + .fiyH2]
-       addss  xmm5, [esp + .fizH2]
-       movss  [esp + .fixH2], xmm3     
-       movss  [esp + .fiyH2], xmm4     
-       movss  [esp + .fizH2], xmm5     ; updated the H2 force. 
-
-       ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
-       xorps  xmm5, xmm5
-       movaps xmm3, xmm0
-       movlps xmm6, [edi + eax*4]
-       movss  xmm7, [edi + eax*4 + 8]
-       unpcklps xmm3, xmm1
-       movlhps  xmm3, xmm5     
-       unpckhps xmm0, xmm1             
-       addps    xmm0, xmm3
-       movhlps  xmm3, xmm0     
-       addps    xmm0, xmm3     ; x,y sum in xmm0
-
-       movhlps  xmm1, xmm2
-       addss    xmm2, xmm1
-       shufps   xmm1, xmm1, 1b 
-       addss    xmm2, xmm1    ; z sum in xmm2
-       subps    xmm6, xmm0
-       subss    xmm7, xmm2
-       
-       movlps [edi + eax*4],     xmm6
-       movss  [edi + eax*4 + 8], xmm7
-
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .odd_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO]
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       mov   edx, [ebp + %$gid]  
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4    
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-        
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 724
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-       
-proc inl2030_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$krf          arg
-%$crf          arg
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.jxO        equ   144
-.jyO        equ   160
-.jzO         equ   176
-.jxH1       equ   192
-.jyH1       equ   208
-.jzH1        equ   224
-.jxH2       equ   240
-.jyH2       equ   256
-.jzH2        equ   272
-.dxOO        equ   288
-.dyOO        equ   304
-.dzOO        equ   320 
-.dxOH1       equ   336
-.dyOH1       equ   352
-.dzOH1       equ   368 
-.dxOH2       equ   384
-.dyOH2       equ   400
-.dzOH2       equ   416 
-.dxH1O       equ   432
-.dyH1O       equ   448
-.dzH1O       equ   464 
-.dxH1H1      equ   480
-.dyH1H1      equ   496
-.dzH1H1      equ   512 
-.dxH1H2      equ   528
-.dyH1H2      equ   544
-.dzH1H2      equ   560 
-.dxH2O       equ   576
-.dyH2O       equ   592
-.dzH2O       equ   608 
-.dxH2H1      equ   624
-.dyH2H1      equ   640
-.dzH2H1      equ   656 
-.dxH2H2      equ   672
-.dyH2H2      equ   688
-.dzH2H2      equ   704
-.qqOO        equ   720
-.qqOH        equ   736
-.qqHH        equ   752
-.vctot       equ   768
-.fixO        equ   784
-.fiyO        equ   800
-.fizO        equ   816
-.fixH1       equ   832
-.fiyH1       equ   848
-.fizH1       equ   864
-.fixH2       equ   880
-.fiyH2       equ   896
-.fizH2       equ   912
-.fjxO       equ   928
-.fjyO        equ   944
-.fjzO        equ   960
-.fjxH1      equ   976
-.fjyH1       equ   992
-.fjzH1       equ  1008
-.fjxH2      equ  1024
-.fjyH2       equ  1040
-.fjzH2       equ  1056
-.half        equ  1072
-.three       equ  1088
-.rsqOO       equ  1104
-.rsqOH1      equ  1120
-.rsqOH2      equ  1136
-.rsqH1O      equ  1152
-.rsqH1H1     equ  1168
-.rsqH1H2     equ  1184
-.rsqH2O      equ  1200
-.rsqH2H1     equ  1216
-.rsqH2H2     equ  1232
-.rinvOO      equ  1248
-.rinvOH1     equ  1264
-.rinvOH2     equ  1280
-.rinvH1O     equ  1296
-.rinvH1H1    equ  1312
-.rinvH1H2    equ  1328
-.rinvH2O     equ  1344
-.rinvH2H1    equ  1360
-.rinvH2H2    equ  1376
-.two         equ  1392
-.krf        equ  1408  
-.crf        equ  1424
-.is3         equ  1440
-.ii3         equ  1444
-.innerjjnr   equ  1448
-.innerk      equ  1452
-.salign             equ  1456                                                  
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 1460           ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_three]
-       movups xmm4, [sse_two]
-       movss xmm5, [ebp + %$krf]
-       movss xmm6, [ebp + %$crf]
-       
-       movaps [esp + .half],  xmm0
-       movaps [esp + .three], xmm1
-       movaps [esp + .two], xmm4
-       shufps xmm5, xmm5, 0b
-       shufps xmm6, xmm6, 0b
-       movaps [esp + .krf], xmm5
-       movaps [esp + .crf], xmm6
-       
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, xmm3        
-       movss xmm5, [edx + ebx*4 + 4]   
-       movss xmm6, [ebp + %$facel]
-       mulss  xmm3, xmm3
-       mulss  xmm4, xmm5
-       mulss  xmm5, xmm5
-       mulss  xmm3, xmm6
-       mulss  xmm4, xmm6
-       mulss  xmm5, xmm6
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .qqOO], xmm3
-       movaps [esp + .qqOH], xmm4
-       movaps [esp + .qqHH], xmm5
-       
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx 
-       
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .single_check
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4] 
-       mov   ecx, [edx + 8]
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-       
-       ; move j coordinates to local temp. variables
-       movlps xmm2, [esi + eax*4]
-       movlps xmm3, [esi + eax*4 + 12]
-       movlps xmm4, [esi + eax*4 + 24]
-
-       movlps xmm5, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 12]
-       movlps xmm7, [esi + ebx*4 + 24]
-
-       movhps xmm2, [esi + ecx*4]
-       movhps xmm3, [esi + ecx*4 + 12]
-       movhps xmm4, [esi + ecx*4 + 24]
-
-       movhps xmm5, [esi + edx*4]
-       movhps xmm6, [esi + edx*4 + 12]
-       movhps xmm7, [esi + edx*4 + 24]
-
-       ;; current state:       
-       ;;  xmm2= jxOa  jyOa  jxOc  jyOc
-       ;;  xmm3= jxH1a jyH1a jxH1c jyH1c
-       ;;  xmm4= jxH2a jyH2a jxH2c jyH2c
-       ;;  xmm5= jxOb  jyOb  jxOd  jyOd
-       ;;  xmm6= jxH1b jyH1b jxH1d jyH1d
-       ;;  xmm7= jxH2b jyH2b jxH2d jyH2d
-       
-       movaps xmm0, xmm2
-       movaps xmm1, xmm3
-       unpcklps xmm0, xmm5     ; xmm0= jxOa  jxOb  jyOa  jyOb
-       unpcklps xmm1, xmm6     ; xmm1= jxH1a jxH1b jyH1a jyH1b
-       unpckhps xmm2, xmm5     ; xmm2= jxOc  jxOd  jyOc  jyOd
-       unpckhps xmm3, xmm6     ; xmm3= jxH1c jxH1d jyH1c jyH1d 
-       movaps xmm5, xmm4
-       movaps   xmm6, xmm0
-       unpcklps xmm4, xmm7     ; xmm4= jxH2a jxH2b jyH2a jyH2b         
-       unpckhps xmm5, xmm7     ; xmm5= jxH2c jxH2d jyH2c jyH2d
-       movaps   xmm7, xmm1
-       movlhps  xmm0, xmm2     ; xmm0= jxOa  jxOb  jxOc  jxOd 
-       movaps [esp + .jxO], xmm0
-       movhlps  xmm2, xmm6     ; xmm2= jyOa  jyOb  jyOc  jyOd
-       movaps [esp + .jyO], xmm2
-       movlhps  xmm1, xmm3
-       movaps [esp + .jxH1], xmm1
-       movhlps  xmm3, xmm7
-       movaps   xmm6, xmm4
-       movaps [esp + .jyH1], xmm3
-       movlhps  xmm4, xmm5
-       movaps [esp + .jxH2], xmm4
-       movhlps  xmm5, xmm6
-       movaps [esp + .jyH2], xmm5
-
-       movss  xmm0, [esi + eax*4 + 8]
-       movss  xmm1, [esi + eax*4 + 20]
-       movss  xmm2, [esi + eax*4 + 32]
-
-       movss  xmm3, [esi + ecx*4 + 8]
-       movss  xmm4, [esi + ecx*4 + 20]
-       movss  xmm5, [esi + ecx*4 + 32]
-
-       movhps xmm0, [esi + ebx*4 + 4]
-       movhps xmm1, [esi + ebx*4 + 16]
-       movhps xmm2, [esi + ebx*4 + 28]
-       
-       movhps xmm3, [esi + edx*4 + 4]
-       movhps xmm4, [esi + edx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 28]
-       
-       shufps xmm0, xmm3, 11001100b
-       shufps xmm1, xmm4, 11001100b
-       shufps xmm2, xmm5, 11001100b
-       movaps [esp + .jzO],  xmm0
-       movaps [esp + .jzH1],  xmm1
-       movaps [esp + .jzH2],  xmm2
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixO]
-       movaps xmm4, [esp + .iyO]
-       movaps xmm5, [esp + .izO]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxOH1], xmm3
-       movaps [esp + .dyOH1], xmm4
-       movaps [esp + .dzOH1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOO], xmm0
-       movaps [esp + .rsqOH1], xmm3
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       subps  xmm3, [esp + .jxO]
-       subps  xmm4, [esp + .jyO]
-       subps  xmm5, [esp + .jzO]
-       movaps [esp + .dxOH2], xmm0
-       movaps [esp + .dyOH2], xmm1
-       movaps [esp + .dzOH2], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1O], xmm3
-       movaps [esp + .dyH1O], xmm4
-       movaps [esp + .dzH1O], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOH2], xmm0
-       movaps [esp + .rsqH1O], xmm3
-
-       movaps xmm0, [esp + .ixH1]
-       movaps xmm1, [esp + .iyH1]
-       movaps xmm2, [esp + .izH1]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH1]
-       subps  xmm1, [esp + .jyH1]
-       subps  xmm2, [esp + .jzH1]
-       subps  xmm3, [esp + .jxH2]
-       subps  xmm4, [esp + .jyH2]
-       subps  xmm5, [esp + .jzH2]
-       movaps [esp + .dxH1H1], xmm0
-       movaps [esp + .dyH1H1], xmm1
-       movaps [esp + .dzH1H1], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1H2], xmm3
-       movaps [esp + .dyH1H2], xmm4
-       movaps [esp + .dzH1H2], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqH1H1], xmm0
-       movaps [esp + .rsqH1H2], xmm3
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxH2O], xmm0
-       movaps [esp + .dyH2O], xmm1
-       movaps [esp + .dzH2O], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH2H1], xmm3
-       movaps [esp + .dyH2H1], xmm4
-       movaps [esp + .dzH2H1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       movaps [esp + .rsqH2O], xmm0
-       movaps [esp + .rsqH2H1], xmm4
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       movaps [esp + .dxH2H2], xmm0
-       movaps [esp + .dyH2H2], xmm1
-       movaps [esp + .dzH2H2], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2
-       movaps [esp + .rsqH2H2], xmm0
-               
-       ; start doing invsqrt. use rsq values in xmm0, xmm4
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinvH2H2
-       mulps   xmm7, [esp + .half] ; rinvH2H1
-       movaps  [esp + .rinvH2H2], xmm3
-       movaps  [esp + .rinvH2H1], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqOO]
-       rsqrtps xmm5, [esp + .rsqOH1]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOO]
-       mulps   xmm5, [esp + .rsqOH1]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOO], xmm3
-       movaps  [esp + .rinvOH1], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqOH2]
-       rsqrtps xmm5, [esp + .rsqH1O]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOH2]
-       mulps   xmm5, [esp + .rsqH1O]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOH2], xmm3
-       movaps  [esp + .rinvH1O], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH1H1]
-       rsqrtps xmm5, [esp + .rsqH1H2]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqH1H1]
-       mulps   xmm5, [esp + .rsqH1H2]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvH1H1], xmm3
-       movaps  [esp + .rinvH1H2], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH2O]
-       movaps  xmm2, xmm1
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, [esp + .rsqH2O]
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2
-       mulps   xmm3, [esp + .half] 
-       movaps  [esp + .rinvH2O], xmm3
-
-       ;; start with OO interaction.
-       movaps xmm0, [esp + .rinvOO]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]
-       mulps  xmm0, xmm0       ;  xmm0=rinvsq
-
-       mulps  xmm5, [esp + .rsqOO] ;  xmm5=krsq
-       movaps xmm6, xmm5
-       addps  xmm6, xmm7       ;  xmm6=rinv+krsq
-       subps  xmm6, [esp + .crf]
-       mulps  xmm6, [esp + .qqOO] ;  xmm6=voul=qq*(rinv+krsq-crf)
-       mulps xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqOO] ; xmm7 = coul part of fscal
-       
-       addps  xmm6, [esp + .vctot] ;  local vctot summation variable
-       mulps  xmm0, xmm7
-       
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOO]
-       mulps xmm1, [esp + .dyOO]
-       mulps xmm2, [esp + .dzOO]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H1 interaction
-       movaps xmm0, [esp + .rinvOH1]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqOH1] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps  xmm0, xmm0
-       mulps  xmm4, [esp + .qqOH] ;  xmm4=voul=qq*(rinv+krsq-crf)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH1 
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH1]
-       mulps xmm1, [esp + .dyOH1]
-       mulps xmm2, [esp + .dzOH1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H2 interaction
-       movaps xmm0, [esp + .rinvOH2]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqOH2] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps xmm0, xmm0
-       mulps  xmm4, [esp + .qqOH] ;  xmm4=voul=qq*(rinv+krsq-crf)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH2]
-       mulps xmm1, [esp + .dyOH2]
-       mulps xmm2, [esp + .dzOH2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; H1-O interaction
-       movaps xmm0, [esp + .rinvH1O]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH1O] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps xmm0, xmm0
-       mulps  xmm4, [esp + .qqOH] ;  xmm4=voul=qq*(rinv+krsq-crf)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH1O]
-       mulps xmm1, [esp + .dyH1O]
-       mulps xmm2, [esp + .dzH1O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H1 interaction
-       movaps xmm0, [esp + .rinvH1H1]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH1H1] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps xmm0, xmm0
-       mulps  xmm4, [esp + .qqHH] ;  xmm4=voul=qq*(rinv+krsq-crf)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH1H1]
-       mulps xmm1, [esp + .dyH1H1]
-       mulps xmm2, [esp + .dzH1H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H2 interaction
-       movaps xmm0, [esp + .rinvH1H2]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH1H2] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps xmm0, xmm0
-       mulps  xmm4, [esp + .qqHH] ;  xmm4=voul=qq*(rinv+krsq-crf)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-       
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH1H2]
-       mulps xmm1, [esp + .dyH1H2]
-       mulps xmm2, [esp + .dzH1H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H2-O interaction
-       movaps xmm0, [esp + .rinvH2O]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH2O] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps xmm0, xmm0
-       mulps  xmm4, [esp + .qqOH] ;  xmm4=voul=qq*(rinv+krsq-crf)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqOH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH2O]
-       mulps xmm1, [esp + .dyH2O]
-       mulps xmm2, [esp + .dzH2O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H1 interaction
-       movaps xmm0, [esp + .rinvH2H1]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH2H1] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps xmm0, xmm0
-       mulps  xmm4, [esp + .qqHH] ;  xmm4=voul=qq*(rinv+krsq-crf)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH2H1]
-       mulps xmm1, [esp + .dyH2H1]
-       mulps xmm2, [esp + .dzH2H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H2 interaction
-       movaps xmm0, [esp + .rinvH2H2]
-       movaps xmm7, xmm0       ;  xmm7=rinv
-       movaps xmm5, [esp + .krf]       
-       movaps xmm1, xmm0
-       mulps  xmm5, [esp + .rsqH2H2] ;  xmm5=krsq
-       movaps xmm4, xmm5
-       addps  xmm4, xmm7       ; xmm4=rinv+krsq
-       subps  xmm4, [esp + .crf]
-       mulps xmm0, xmm0
-       mulps  xmm4, [esp + .qqHH] ;  xmm4=voul=qq*(rinv+krsq-crf)
-       mulps  xmm5, [esp + .two]
-       subps  xmm7, xmm5       ; xmm7=rinv-2*krsq
-       mulps  xmm7, [esp + .qqHH] ; xmm7 = coul part of fscal
-       addps  xmm6, xmm4       ; add to local vctot.
-       mulps xmm0, xmm7        ; fsOH2
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0
-
-       movaps xmm1, xmm0
-       movaps [esp + .vctot], xmm6
-       movaps xmm2, xmm0
-       
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH2H2]
-       mulps xmm1, [esp + .dyH2H2]
-       mulps xmm2, [esp + .dzH2H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       mov edi, [ebp +%$faction]
-               
-       ; Did all interactions - now update j forces.
-       ; 4 j waters with three atoms each - first do a & b j particles
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd 
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpcklps xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjxOb  fjyOb
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOb  fjyOb
-       unpcklps xmm1, xmm2        ; xmm1= fjzOa  fjxH1a fjzOb  fjxH1b
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpcklps xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1b fjzH1b
-       unpcklps xmm5, xmm6        ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
-       movlhps  xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjzOa  fjxH1a  
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOb  fjyOb  fjzOb  fjxH1b
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
-       movups   xmm1, [edi + eax*4]
-       movups   xmm2, [edi + eax*4 + 16]
-       movups   xmm5, [edi + ebx*4]
-       movups   xmm6, [edi + ebx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + eax*4 + 32]
-       movss    xmm3, [edi + ebx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm7, xmm7, 1b
-       
-       movups   [edi + eax*4],     xmm1
-       movups   [edi + eax*4 + 16],xmm2
-       movups   [edi + ebx*4],     xmm5
-       movups   [edi + ebx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + eax*4 + 32], xmm0
-       movss    [edi + ebx*4 + 32], xmm3       
-
-       ;; then do the second pair (c & d)
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpckhps xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjxOd  fjyOd
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOd  fjyOd
-       unpckhps xmm1, xmm2        ; xmm1= fjzOc  fjxH1c fjzOd  fjxH1d
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpckhps xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d  
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1d fjzH1d         
-       unpckhps xmm5, xmm6        ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
-       movlhps  xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjzOc  fjxH1c 
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOd  fjyOd  fjzOd  fjxH1d
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c 
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
-       movups   xmm1, [edi + ecx*4]
-       movups   xmm2, [edi + ecx*4 + 16]
-       movups   xmm5, [edi + edx*4]
-       movups   xmm6, [edi + edx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + ecx*4 + 32]
-       movss    xmm3, [edi + edx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm4, xmm4, 10b
-       shufps   xmm7, xmm7, 11b
-       movups   [edi + ecx*4],     xmm1
-       movups   [edi + ecx*4 + 16],xmm2
-       movups   [edi + edx*4],     xmm5
-       movups   [edi + edx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + ecx*4 + 32], xmm0
-       movss    [edi + edx*4 + 32], xmm3       
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .single_check
-       jmp   .unroll_loop
-.single_check:                          ; reached when subtracting 4 left innerk negative
-       add   [esp + .innerk], dword 4  ; undo that subtraction: innerk = leftover j count
-       jnz   .single_loop              ; leftovers exist: handle them one at a time
-       jmp   .updateouterdata          ; nothing left: write back results for this i water
-.single_loop:                            ; one j water per pass; i O first, then i H1 and i H2
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]                  ; eax = jnr
-       add   [esp + .innerjjnr], dword 4 ; advance the jjnr pointer by one entry
-
-       mov esi, [ebp + %$pos]            ; esi = pos argument (coordinate array)
-       lea   eax, [eax + eax*2]          ; eax = 3*jnr
-
-       ; fetch j coordinates
-       xorps xmm3, xmm3
-       xorps xmm4, xmm4
-       xorps xmm5, xmm5
-       movss xmm3, [esi + eax*4]         ; jxO into pos 0
-       movss xmm4, [esi + eax*4 + 4]     ; jyO into pos 0
-       movss xmm5, [esi + eax*4 + 8]     ; jzO into pos 0
-
-       movlps xmm6, [esi + eax*4 + 12]
-       movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
-       ;;  fetch both z coords in one go, to positions 0 and 3 in xmm7
-       movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
-       shufps xmm6, xmm6, 11011000b    ;  xmm6=jxH1 jxH2 jyH1 jyH2
-       movlhps xmm3, xmm6              ; xmm3= jxO   0  jxH1 jxH2 
-       movaps  xmm0, [esp + .ixO]     
-       movaps  xmm1, [esp + .iyO]
-       movaps  xmm2, [esp + .izO]      
-       shufps  xmm4, xmm6, 11100100b ;  xmm4= jyO   0   jyH1 jyH2
-       shufps xmm5, xmm7, 11000100b  ;  xmm5= jzO   0   jzH1 jzH2  
-       ;;  store all j coordinates in jO
-       movaps [esp + .jxO], xmm3
-       movaps [esp + .jyO], xmm4
-       movaps [esp + .jzO], xmm5
-       subps  xmm0, xmm3       ; dx = ixO - jx for the lanes O / unused / H1 / H2
-       subps  xmm1, xmm4       ; dy likewise
-       subps  xmm2, xmm5       ; dz likewise
-       movaps [esp + .dxOO], xmm0      ; in this loop the .d?OO slots hold iO - (whole j water)
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2        ;  have rsq in xmm0.
-
-       movaps xmm6, xmm0
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0         ; xmm1 = lu, approximate 1/sqrt(rsq)
-       mulps   xmm6, [esp + .krf] ; xmm6=krsq
-       movaps  xmm2, xmm1         ; xmm2 = lu
-       movaps  xmm7, xmm6         ; xmm7=krsq
-       mulps   xmm1, xmm1         ; lu*lu
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, xmm0         ; rsq*lu*lu
-       subps   xmm3, xmm1         ; three - rsq*lu*lu
-       mulps   xmm3, xmm2         ; lu*(three - rsq*lu*lu)
-       mulps   xmm3, [esp + .half] ; rinv iO - j water
-
-
-       
-       addps   xmm6, xmm3      ;  xmm6=rinv+krsq
-       mulps   xmm7, [esp + .two] ; xmm7 = .two * krsq
-       subps   xmm6, [esp + .crf] ; xmm6=rinv+krsq-crf
-       
-       xorps   xmm1, xmm1      ; NOTE(review): xmm1 is overwritten below before it is read
-       movaps  xmm0, xmm3
-       subps   xmm3, xmm7      ; xmm3=rinv-2*krsq
-       xorps   xmm4, xmm4
-       mulps   xmm0, xmm0      ; xmm0=rinvsq
-       ;; fetch charges to xmm4 (temporary)
-       movss   xmm4, [esp + .qqOO] ; pos 0 = first element of .qqOO
-       movhps  xmm4, [esp + .qqOH] ; pos 2-3 = first two elements of .qqOH; pos 1 stays 0 for the unused lane
-
-       mulps xmm6, xmm4        ;  vcoul 
-       mulps xmm3, xmm4        ;  coul part of fs
-
-
-       addps   xmm6, [esp + .vctot]
-       mulps   xmm0, xmm3      ;  total fscal
-       movaps  [esp + .vctot], xmm6    
-
-       movaps  xmm1, xmm0
-       movaps  xmm2, xmm0
-       mulps   xmm0, [esp + .dxOO] ; xmm0-xmm2 = fscal * (dx, dy, dz)
-       mulps   xmm1, [esp + .dyOO]
-       mulps   xmm2, [esp + .dzOO]
-       
-       ;; initial update for j forces
-       xorps   xmm3, xmm3
-       xorps   xmm4, xmm4
-       xorps   xmm5, xmm5
-       subps   xmm3, xmm0      ; j force = 0 - (i force contribution)
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3 ; the .fj?O slots carry forces for all j atoms (O / unused / H1 / H2)
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixO]
-       addps   xmm1, [esp + .fiyO]
-       addps   xmm2, [esp + .fizO]
-       movaps  [esp + .fixO], xmm0
-       movaps  [esp + .fiyO], xmm1
-       movaps  [esp + .fizO], xmm2
-
-       
-       ;;  done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
-       movaps  xmm0, [esp + .ixH1]
-       movaps  xmm1, [esp + .iyH1]
-       movaps  xmm2, [esp + .izH1]     
-       movaps  xmm3, [esp + .ixH2] 
-       movaps  xmm4, [esp + .iyH2] 
-       movaps  xmm5, [esp + .izH2] 
-       subps   xmm0, [esp + .jxO] ; the .j?O slots hold the whole j water stored above
-       subps   xmm1, [esp + .jyO]
-       subps   xmm2, [esp + .jzO]
-       subps   xmm3, [esp + .jxO]
-       subps   xmm4, [esp + .jyO]
-       subps   xmm5, [esp + .jzO]
-       movaps [esp + .dxH1O], xmm0 ; in this loop .d?H1O / .d?H2O hold iH1 / iH2 - (whole j water)
-       movaps [esp + .dyH1O], xmm1
-       movaps [esp + .dzH1O], xmm2
-       movaps [esp + .dxH2O], xmm3
-       movaps [esp + .dyH2O], xmm4
-       movaps [esp + .dzH2O], xmm5
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       mulps xmm3, xmm3
-       mulps xmm4, xmm4
-       mulps xmm5, xmm5
-       addps xmm0, xmm1
-       addps xmm4, xmm3
-       addps xmm0, xmm2        ;  have rsqH1 in xmm0.
-       addps xmm4, xmm5        ;  have rsqH2 in xmm4.
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0      ; lu for H1
-       rsqrtps xmm5, xmm4      ; lu for H2
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinv H1 - j water
-       mulps   xmm7, [esp + .half] ; rinv H2 - j water
-
-       mulps xmm0, [esp + .krf] ;  krsq
-       mulps xmm4, [esp + .krf] ;  krsq 
-
-       ;; assemble charges in xmm6
-       xorps   xmm6, xmm6
-       movss   xmm6, [esp + .qqOH] ; pos 0 = first element of .qqOH
-       movhps  xmm6, [esp + .qqHH] ; pos 2-3 = first two elements of .qqHH; pos 1 stays 0
-       movaps  xmm1, xmm0      ; xmm1 = krsq (H1)
-       movaps  xmm5, xmm4      ; xmm5 = krsq (H2)
-       addps   xmm0, xmm3      ; krsq+rinv
-       addps   xmm4, xmm7      ; krsq+rinv
-       subps   xmm0, [esp + .crf]
-       subps   xmm4, [esp + .crf]
-       mulps   xmm1, [esp + .two] ; .two * krsq (H1)
-       mulps   xmm5, [esp + .two] ; .two * krsq (H2)
-       mulps   xmm0, xmm6      ;  vcoul
-       mulps   xmm4, xmm6      ;  vcoul
-       addps   xmm4, xmm0      ; sum the H1 and H2 vcoul
-       addps   xmm4, [esp + .vctot]
-       movaps  [esp + .vctot], xmm4
-       movaps  xmm0, xmm3      ; xmm0 = rinv (H1)
-       movaps  xmm4, xmm7      ; xmm4 = rinv (H2)
-       mulps   xmm3, xmm3      ; xmm3 = rinvsq (H1)
-       mulps   xmm7, xmm7      ; xmm7 = rinvsq (H2)
-       subps   xmm0, xmm1      ; rinv - 2*krsq (H1)
-       subps   xmm4, xmm5      ; rinv - 2*krsq (H2)
-       mulps   xmm0, xmm6
-       mulps   xmm4, xmm6
-       mulps   xmm0, xmm3      ;  fscal H1, kept in xmm0
-       mulps   xmm7, xmm4      ;  fscal H2, kept in xmm7
-       
-       movaps  xmm1, xmm0
-       movaps  xmm2, xmm0
-       mulps   xmm0, [esp + .dxH1O]
-       mulps   xmm1, [esp + .dyH1O]
-       mulps   xmm2, [esp + .dzH1O]
-       ;;  update forces H1 - j water
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH1]
-       addps   xmm1, [esp + .fiyH1]
-       addps   xmm2, [esp + .fizH1]
-       movaps  [esp + .fixH1], xmm0
-       movaps  [esp + .fiyH1], xmm1
-       movaps  [esp + .fizH1], xmm2
-       ;; do forces H2 - j water
-       movaps xmm0, xmm7
-       movaps xmm1, xmm7
-       movaps xmm2, xmm7
-       mulps   xmm0, [esp + .dxH2O]
-       mulps   xmm1, [esp + .dyH2O]
-       mulps   xmm2, [esp + .dzH2O]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       mov     esi, [ebp + %$faction] ; esi now points at the faction argument instead of pos
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH2]
-       addps   xmm1, [esp + .fiyH2]
-       addps   xmm2, [esp + .fizH2]
-       movaps  [esp + .fixH2], xmm0
-       movaps  [esp + .fiyH2], xmm1
-       movaps  [esp + .fizH2], xmm2
-
-       ;; update j water forces from local variables
-       movlps  xmm0, [esi + eax*4]      ; old j O force x,y
-       movlps  xmm1, [esi + eax*4 + 12] ; old j H1 force x,y (low half)
-       movhps  xmm1, [esi + eax*4 + 24] ; old j H2 force x,y (high half)
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       movaps  xmm6, xmm5
-       movaps  xmm7, xmm5
-       shufps  xmm6, xmm6, 10b  ; pos 0 = element 2 of fjz (H1)
-       shufps  xmm7, xmm7, 11b  ; pos 0 = element 3 of fjz (H2)
-       addss   xmm5, [esi + eax*4 + 8]  ; z components are updated one scalar at a time
-       addss   xmm6, [esi + eax*4 + 20]
-       addss   xmm7, [esi + eax*4 + 32]
-       movss   [esi + eax*4 + 8], xmm5
-       movss   [esi + eax*4 + 20], xmm6
-       movss   [esi + eax*4 + 32], xmm7
-       movaps   xmm5, xmm3
-       unpcklps xmm3, xmm4      ; xmm3 = fjx0 fjy0 fjx1 fjy1 (only the O pair in pos 0-1 is stored)
-       unpckhps xmm5, xmm4      ; xmm5 = fjx2 fjy2 fjx3 fjy3 (H1 pair, H2 pair)
-       addps    xmm0, xmm3
-       addps    xmm1, xmm5
-       movlps  [esi + eax*4], xmm0      ; store j O force x,y
-       movlps  [esi + eax*4 + 12], xmm1 ; store j H1 force x,y
-       movhps  [esi + eax*4 + 24], xmm1 ; store j H2 force x,y
-       
-       dec   dword [esp + .innerk]      ; one leftover j water done
-       jz    .updateouterdata           ; none left: finish this i water
-       jmp   .single_loop
-.updateouterdata:                       ; reduce the per-lane i forces and energy, write them back
-       mov   ecx, [esp + .ii3]         ; ecx = saved .ii3, float index of the i water in faction
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]         ; edx = saved .is3, float index into fshift
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO] 
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0      ; bring elements 2-3 down to pos 0-1
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums are now in pos 0-1 of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b   ; pos 0 = element 1
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2        ; xmm7 pos 0 = fz
-       movlhps xmm6, xmm1      ; xmm6 = fx ? fy ?
-       shufps  xmm6, xmm6, 1000b ; xmm6 pos 0-1 = fx fy (pos 2-3 are not used)
- 
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums are now in pos 0-1 of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b ; pos 0-1 = fx fy
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums are now in pos 0-1 of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b ; pos 0-1 = fx fy
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]     ; old fshift x,y
-       movss  xmm4, [esi + edx*4 + 8]  ; old fshift z
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6       ; pos 0 = sum of all four vctot lanes
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7       
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end              ; ecx == 0: that was the last i particle
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx  ; store the decremented count back into the nri argument slot
-       jmp .outer
-.end:                                   ; common exit: release the stack frame and return
-       emms
-       mov eax, [esp + .salign]        ; value saved in .salign - presumably the alignment padding subtracted by the prologue (not shown here)
-       add esp, eax                    ; give that padding back
-       add esp, 1460                   ; release the local stack space - NOTE(review): must match the prologue's sub esp, confirm
-       pop edi                         ; restore saved registers - presumably the reverse of the prologue's pushes
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-       
-               
-
-proc inl3000_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$tabscale      arg
-%$VFtab         arg
-       ;; stack offsets for local variables, relative to esp after the alignment below
-       ;; bottom of stack is 16-byte aligned for sse use (offsets 0-256 are 16-byte vector slots)
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.iq          equ    48
-.dx          equ    64
-.dy          equ    80
-.dz          equ    96
-.two        equ   112
-.tabscale    equ   128
-.qq          equ   144 
-.fs          equ   160
-.vctot       equ   176
-.fix         equ   192
-.fiy         equ   208
-.fiz         equ   224
-.half        equ   240
-.three       equ   256
-.is3         equ   272  ; 4-byte slots from here on
-.ii3         equ   276
-.innerjjnr   equ   280
-.innerk      equ   284
-.salign             equ   288  ; number of bytes subtracted from esp to align it
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 292            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf           ; eax = esp mod 16
-       sub esp, eax            ; round esp down to a 16-byte boundary
-       mov [esp + .salign], eax ; remember the adjustment
-
-       emms
-
-       movups xmm0, [sse_half] ; sse_half/sse_two/sse_three are defined outside this excerpt
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movss xmm3, [ebp + %$tabscale] ; scalar float read from the tabscale argument slot
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two], xmm1
-       movaps [esp + .three],  xmm2
-       shufps xmm3, xmm3, 0b   ; broadcast pos 0 to all four elements
-       movaps [esp + .tabscale], xmm3
-
-       ;; assume we have at least one i particle - start directly      
-.outer:                                 ; per-i-particle setup, then dispatch to the inner loop
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]       ;  shift vector x
-       movss xmm1, [eax + ebx*4 + 4]   ;  shift vector y
-       movss xmm2, [eax + ebx*4 + 8]   ;  shift vector z
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]     ;  edx = base of charge[]
-       movss xmm3, [edx + ebx*4]       ;  charge[ii]
-       mulss xmm3, [ebp + %$facel]     ;  iq = facel*charge[ii]
-       shufps xmm3, xmm3, 0b           ;  broadcast iq to all four elements
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]       ;  i coordinate = shift vector + pos[ii3]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b           ;  broadcast the shifted i coordinates
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4  ;  advance pointer
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2                     ;  byte offset of jjnr[nj0] (4 bytes per entry)
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4                ;  sets the flags tested by the jge below
-       mov   [esp + .innerk], edx        ;  innerk = number of innerloop atoms - 4
-       jge   .unroll_loop                ;  at least 4 j atoms: take the unrolled loop
-       jmp   .finish_inner
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       mulps  xmm3, xmm2
-
-       movaps [esp + .qq], xmm3        
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       movhlps xmm5, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 2
-       pslld mm7, 2
-
-       movd mm0, eax   
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6
-       psrlq mm6, 32
-       movd ecx, mm7
-       psrlq mm7, 32
-       movd ebx, mm6
-       movd edx, mm7
-               
-       movlps xmm5, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm5, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; coulomb table ready, in xmm4-xmm7     
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps  xmm4, xmm4
-
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0   
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair
-       jmp   .checksingle
-.dopair:       
-       mov esi, [ebp + %$charge]
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       
-       xorps xmm7, xmm7
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       mulps  xmm3, [esp + .iq]
-       movlhps xmm3, xmm7
-       movaps [esp + .qq], xmm3
-
-       mov edi, [ebp + %$pos]  
-       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6
-       psrlq mm6, 32
-       movd edx, mm6
-
-       movlps xmm5, [esi + ecx*4]
-       movhps xmm5, [esi + edx*4] ; got half coulomb table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8]
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b 
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps  xmm4, xmm4
-
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle:                          
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle
-       jmp    .updateouterdata
-.dosingle:
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-       movss xmm6, [esi + eax*4]       ; xmm6(0) has the charge        
-       mulps  xmm6, [esp + .iq]
-       movaps [esp + .qq], xmm6
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6
-       
-       movlps xmm4, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 8]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps xmm4, xmm4
-
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 292
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-proc inl3010_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg             
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg
-%$tabscale      arg
-%$VFtab         arg
-%$nsatoms       arg                    
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.iq          equ    48
-.dx          equ    64
-.dy          equ    80
-.dz          equ    96
-.two        equ   112
-.tabscale    equ   128
-.qq          equ   144 
-.fs          equ   160
-.vctot       equ   176
-.fix         equ   192
-.fiy         equ   208
-.fiz         equ   224
-.half        equ   240
-.three       equ   256
-.is3         equ   272
-.ii3         equ   276
-.shX        equ   280
-.shY         equ   284
-.shZ         equ   288
-.ntia       equ   292  
-.innerjjnr0  equ   296
-.innerk0     equ   300
-.innerjjnr   equ   304
-.innerk      equ   308
-.salign             equ   312                                                  
-.nscoul      equ   316
-.solnr      equ   320          
-       
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 324            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movss xmm3, [ebp + %$tabscale]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two], xmm1
-       movaps [esp + .three],  xmm2
-       shufps xmm3, xmm3, 0b
-       movaps [esp + .tabscale], xmm3
-
-       add   [ebp + %$nsatoms], dword 8
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-       movss [esp + .shX], xmm0
-       movss [esp + .shY], xmm1
-       movss [esp + .shZ], xmm2
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       mov   ecx, [eax]
-       add   [ebp + %$nsatoms], dword 12
-       mov   [esp + .nscoul], ecx      
-
-       ; clear vctot 
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       mov   [esp + .solnr], ebx
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-
-       mov   ecx, [esp + .nscoul]
-       cmp   ecx, dword 0
-       jnz   .mno_coul
-       jmp   .last_mno
-.mno_coul:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       mulss xmm3, [ebp + %$facel]
-       shufps xmm3, xmm3, 0b
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear  i forces
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_coul_loop
-       jmp   .finish_coul_inner
-
-.unroll_coul_loop:     
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       mulps  xmm3, xmm2
-
-       movaps [esp + .qq], xmm3        
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       movhlps xmm5, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 2
-       pslld mm7, 2
-
-       movd mm0, eax   
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6
-       psrlq mm6, 32
-       movd ecx, mm7
-       psrlq mm7, 32
-       movd ebx, mm6
-       movd edx, mm7
-               
-       movlps xmm5, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm5, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; coulomb table ready, in xmm4-xmm7     
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps  xmm4, xmm4
-
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0   
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_coul_inner
-       jmp   .unroll_coul_loop
-.finish_coul_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair_coul
-       jmp   .checksingle_coul
-.dopair_coul:  
-       mov esi, [ebp + %$charge]
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       
-       xorps xmm7, xmm7
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       mulps  xmm3, [esp + .iq]
-       movlhps xmm3, xmm7
-       movaps [esp + .qq], xmm3
-
-       mov edi, [ebp + %$pos]  
-       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6
-       psrlq mm6, 32
-       movd edx, mm6
-
-       movlps xmm5, [esi + ecx*4]
-       movhps xmm5, [esi + edx*4] ; got half coulomb table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8]
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b 
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps  xmm4, xmm4
-
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle_coul:                             
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_coul
-       jmp    .updateouterdata_coul
-.dosingle_coul:
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-       movss xmm6, [esi + eax*4]       ; xmm6(0) has the charge        
-       mulps  xmm6, [esp + .iq]
-       movaps [esp + .qq], xmm6
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6
-       
-       movlps xmm4, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 8]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps xmm4, xmm4
-
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_coul:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ;;  loop back to mno.
-       dec dword [esp + .nscoul]
-       jz  .last_mno
-       jmp .mno_coul
-       
-.last_mno:     
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 324
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-proc inl3020_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$tabscale     arg     
-%$VFtab                arg     
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.iqO         equ   144 
-.iqH         equ   160 
-.dxO         equ   176
-.dyO         equ   192
-.dzO         equ   208 
-.dxH1        equ   224
-.dyH1        equ   240
-.dzH1        equ   256 
-.dxH2        equ   272
-.dyH2        equ   288
-.dzH2        equ   304 
-.qqO         equ   320
-.qqH         equ   336
-.rinvO       equ   352
-.rinvH1      equ   368
-.rinvH2             equ   384          
-.rO          equ   400
-.rH1         equ   416
-.rH2         equ   432
-.tabscale    equ   448 
-.two         equ   464
-.vctot       equ   480
-.fixO        equ   496
-.fiyO        equ   512
-.fizO        equ   528
-.fixH1       equ   544
-.fiyH1       equ   560
-.fizH1       equ   576
-.fixH2       equ   592
-.fiyH2       equ   608
-.fizH2       equ   624
-.fjx        equ   640
-.fjy         equ   656
-.fjz         equ   672
-.half        equ   688
-.three       equ   704
-.is3         equ   720
-.ii3         equ   724
-.innerjjnr   equ   728
-.innerk      equ   732
-.salign             equ   736                                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 740            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movss xmm3, [ebp +%$tabscale]
-       
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two],  xmm1
-       movaps [esp + .three],  xmm2
-       shufps xmm3, xmm3, 0b 
-       movaps [esp + .tabscale], xmm3
-       
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, [edx + ebx*4 + 4]   
-       movss xmm5, [ebp + %$facel]
-       mulss  xmm3, xmm5
-       mulss  xmm4, xmm5
-
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       movaps [esp + .iqO], xmm3
-       movaps [esp + .iqH], xmm4
-       
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .odd_inner
-.unroll_loop:
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movaps xmm4, xmm3            ;  and in xmm4
-       mulps  xmm3, [esp + .iqO]
-       mulps  xmm4, [esp + .iqH]
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-
-       movaps  [esp + .qqO], xmm3
-       movaps  [esp + .qqH], xmm4      
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ixO-izO to xmm4-xmm6
-       movaps xmm4, [esp + .ixO]
-       movaps xmm5, [esp + .iyO]
-       movaps xmm6, [esp + .izO]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxO], xmm4
-       movaps [esp + .dyO], xmm5
-       movaps [esp + .dzO], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       movaps xmm7, xmm4
-       ; rsqO in xmm7
-
-       ; move ixH1-izH1 to xmm4-xmm6
-       movaps xmm4, [esp + .ixH1]
-       movaps xmm5, [esp + .iyH1]
-       movaps xmm6, [esp + .izH1]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxH1], xmm4
-       movaps [esp + .dyH1], xmm5
-       movaps [esp + .dzH1], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm6, xmm5
-       addps xmm6, xmm4
-       ; rsqH1 in xmm6
-
-       ; move ixH2-izH2 to xmm3-xmm5
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-
-       ; calc dr
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-
-       ; store dr
-       movaps [esp + .dxH2], xmm3
-       movaps [esp + .dyH2], xmm4
-       movaps [esp + .dzH2], xmm5
-       ; square it
-       mulps xmm3,xmm3
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       addps xmm5, xmm4
-       addps xmm5, xmm3
-       ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
-       ; start with rsqO - seed to xmm2        
-       rsqrtps xmm2, xmm7
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm7      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  [esp + .rinvO], xmm4    ; rinvO in xmm4
-       mulps   xmm7, xmm4
-       movaps  [esp + .rO], xmm7       
-
-       ; rsqH1 - seed in xmm2  
-       rsqrtps xmm2, xmm6
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm6      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  [esp + .rinvH1], xmm4   ; rinvH1 in xmm4
-       mulps   xmm6, xmm4
-       movaps  [esp + .rH1], xmm6
-
-       ; rsqH2 - seed to xmm2  
-       rsqrtps xmm2, xmm5
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm5      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  [esp + .rinvH2], xmm4   ; rinvH2 in xmm4
-       mulps   xmm5, xmm4
-       movaps  [esp + .rH2], xmm5
-
-       ; do O interactions
-       ;; rO is still in xmm7.
-       mulps   xmm7, [esp + .tabscale]
-       movhlps xmm4, xmm7
-       cvttps2pi mm6, xmm7
-       cvttps2pi mm7, xmm4    ; mm6/mm7 contain lu indices
-       
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm4, mm7
-        movlhps xmm3, xmm4
-       
-        subps xmm7, xmm3
-
-       movaps xmm1, xmm7       ;  xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-               
-        movd mm0, eax   
-        movd mm1, ebx
-        movd mm2, ecx
-        movd mm3, edx
-
-        mov  esi, [ebp + %$VFtab]
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqO]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm0, xmm7 ; fijC=FF*qq
-
-        ; at this point mm5 contains vcoul and xmm0 fijC.
-        ; increment vcoul - then we can get rid of mm5.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5 
-       xorps  xmm4, xmm4
-
-       mulps  xmm0, [esp + .tabscale]
-       mulps  xmm0, [esp + .rinvO]     
-       subps  xmm4, xmm0
-
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4       ;  tx in xmm0-xmm2
-
-       ; update O forces
-       movaps xmm3, [esp + .fixO]
-       movaps xmm4, [esp + .fiyO]
-       movaps xmm7, [esp + .fizO]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixO], xmm3
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm7
-       ; update j forces with water O
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ;;  Done with O interactions - now H1!
-       movaps xmm7, [esp + .rH1]
-       mulps   xmm7, [esp + .tabscale]
-       movhlps xmm4, xmm7
-       cvttps2pi mm6, xmm7
-       cvttps2pi mm7, xmm4    ; mm6/mm7 contain lu indices
-       
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm4, mm7
-        movlhps xmm3, xmm4
-       
-        subps xmm7, xmm3
-       movaps xmm1, xmm7       ;  xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-               
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm7, xmm0 ; fijC=FF*qq
-        ; at this point mm5 contains vcoul and xmm7 fijC.
-        ; increment vcoul
-       xorps  xmm4, xmm4
-        addps  xmm5, [esp + .vctot]
-       mulps  xmm7, [esp + .rinvH1]
-        movaps [esp + .vctot], xmm5 
-       mulps  xmm7, [esp + .tabscale]
-       subps xmm4, xmm7
-
-       movaps xmm0, [esp + .dxH1]
-       movaps xmm1, [esp + .dyH1]
-       movaps xmm2, [esp + .dzH1]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update H1 forces
-       movaps xmm3, [esp + .fixH1]
-       movaps xmm4, [esp + .fiyH1]
-       movaps xmm7, [esp + .fizH1]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH1], xmm3
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm7
-       ; update j forces with water H1
-       addps  xmm0, [esp + .fjx]
-       addps  xmm1, [esp + .fjy]
-       addps  xmm2, [esp + .fjz]
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ; Done with H1, finally we do H2 interactions
-       movaps xmm7, [esp + .rH2]
-       mulps   xmm7, [esp + .tabscale]
-       movhlps xmm4, xmm7
-       cvttps2pi mm6, xmm7
-       cvttps2pi mm7, xmm4    ; mm6/mm7 contain lu indices
-       
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm4, mm7
-        movlhps xmm3, xmm4
-       
-        subps xmm7, xmm3
-       movaps xmm1, xmm7       ;  xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-               
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm7, xmm0 ; fijC=FF*qq
-        ; at this point mm5 contains vcoul and xmm0 fijC.
-        ; increment vcoul
-       xorps  xmm4, xmm4
-        addps  xmm5, [esp + .vctot]
-       mulps  xmm7, [esp + .rinvH2]
-        movaps [esp + .vctot], xmm5 
-       mulps  xmm7, [esp + .tabscale]
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dxH2]
-       movaps xmm1, [esp + .dyH2]
-       movaps xmm2, [esp + .dzH2]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-        movd eax, mm0   
-        movd ebx, mm1
-        movd ecx, mm2
-        movd edx, mm3
-       
-       ; update H2 forces
-       movaps xmm3, [esp + .fixH2]
-       movaps xmm4, [esp + .fiyH2]
-       movaps xmm7, [esp + .fizH2]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH2], xmm3
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm7
-
-       mov edi, [ebp +%$faction]
-       ; update j forces
-       addps xmm0, [esp + .fjx]
-       addps xmm1, [esp + .fjy]
-       addps xmm2, [esp + .fjz]
-
-       movlps xmm4, [edi + eax*4]
-       movlps xmm7, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm7, [edi + edx*4]
-       
-       movaps xmm3, xmm4
-       shufps xmm3, xmm7, 10001000b
-       shufps xmm4, xmm7, 11011101b                          
-       ; xmm3 has fjx, xmm4 has fjy.
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       ; unpack them back for storing.
-       movaps xmm7, xmm3
-       unpcklps xmm7, xmm4
-       unpckhps xmm3, xmm4     
-       movlps [edi + eax*4], xmm7
-       movlps [edi + ecx*4], xmm3
-       movhps [edi + ebx*4], xmm7
-       movhps [edi + edx*4], xmm3
-       ; finally z forces 
-       movss  xmm0, [edi + eax*4 + 8]
-       movss  xmm1, [edi + ebx*4 + 8]
-       movss  xmm3, [edi + ecx*4 + 8]
-       movss  xmm4, [edi + edx*4 + 8]
-       subss  xmm0, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm1, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm3, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm4, xmm2
-       movss  [edi + eax*4 + 8], xmm0
-       movss  [edi + ebx*4 + 8], xmm1
-       movss  [edi + ecx*4 + 8], xmm3
-       movss  [edi + edx*4 + 8], xmm4
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .odd_inner
-       jmp   .unroll_loop
-.odd_inner:    
-       add   [esp + .innerk], dword 4
-       jnz   .odd_loop
-       jmp   .updateouterdata
-.odd_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       xorps xmm4, xmm4
-       movss xmm4, [esp + .iqO]
-       mov esi, [ebp + %$charge] 
-       movhps xmm4, [esp + .iqH]     
-       movss xmm3, [esi + eax*4]       ; charge in xmm3
-       shufps xmm3, xmm3, 0b
-       mulps xmm3, xmm4
-       movaps [esp + .qqO], xmm3       ; use oxygen qq for storage.
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-       
-       ; move j coords to xmm0-xmm2
-       movss xmm0, [esi + eax*4]
-       movss xmm1, [esi + eax*4 + 4]
-       movss xmm2, [esi + eax*4 + 8]
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       
-       movss xmm3, [esp + .ixO]
-       movss xmm4, [esp + .iyO]
-       movss xmm5, [esp + .izO]
-               
-       movlps xmm6, [esp + .ixH1]
-       movlps xmm7, [esp + .ixH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm3, xmm6
-       movlps xmm6, [esp + .iyH1]
-       movlps xmm7, [esp + .iyH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm4, xmm6
-       movlps xmm6, [esp + .izH1]
-       movlps xmm7, [esp + .izH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm5, xmm6
-
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       
-       movaps [esp + .dxO], xmm3
-       movaps [esp + .dyO], xmm4
-       movaps [esp + .dzO], xmm5
-
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       ; rsq in xmm4.
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       movaps [esp + .rinvO], xmm0
-       
-       mulps xmm4, [esp + .tabscale]
-       movhlps xmm7, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm7    ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm7, mm7
-        movlhps xmm3, xmm7
-
-       subps   xmm4, xmm3      
-       movaps xmm1, xmm4       ; xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-       
-        movd mm0, eax   
-        movd mm1, ecx
-        movd mm2, edx
-
-        mov  esi, [ebp + %$VFtab]
-        movd eax, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqO]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm0, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm0 fijC.
-        ; increment vcoul - then we can get rid of xmm5.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-
-       xorps xmm4, xmm4
-       mulps  xmm0, [esp + .tabscale]
-       mulps  xmm0, [esp + .rinvO]     
-       subps  xmm4, xmm0
-               
-        movd eax, mm0   
-        movd ecx, mm1
-        movd edx, mm2  
-               
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4 ; xmm0-xmm2 now contains tx-tz (partial force)
-       movss  xmm3, [esp + .fixO]      
-       movss  xmm4, [esp + .fiyO]      
-       movss  xmm5, [esp + .fizO]      
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esp + .fixO], xmm3      
-       movss  [esp + .fiyO], xmm4      
-       movss  [esp + .fizO], xmm5      ; updated the O force. now do the H's
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       shufps xmm3, xmm3, 11100110b    ; shift right
-       shufps xmm4, xmm4, 11100110b
-       shufps xmm5, xmm5, 11100110b
-       addss  xmm3, [esp + .fixH1]
-       addss  xmm4, [esp + .fiyH1]
-       addss  xmm5, [esp + .fizH1]
-       movss  [esp + .fixH1], xmm3     
-       movss  [esp + .fiyH1], xmm4     
-       movss  [esp + .fizH1], xmm5     ; updated the H1 force. 
-
-       mov edi, [ebp + %$faction]
-       shufps xmm3, xmm3, 11100111b    ; shift right
-       shufps xmm4, xmm4, 11100111b
-       shufps xmm5, xmm5, 11100111b
-       addss  xmm3, [esp + .fixH2]
-       addss  xmm4, [esp + .fiyH2]
-       addss  xmm5, [esp + .fizH2]
-       movss  [esp + .fixH2], xmm3     
-       movss  [esp + .fiyH2], xmm4     
-       movss  [esp + .fizH2], xmm5     ; updated the H2 force. 
-
-       ; the fj's - start by accumulating the tx/ty force in xmm0 and tz in xmm2.
-       xorps  xmm5, xmm5
-       movaps xmm3, xmm0
-       movlps xmm6, [edi + eax*4]
-       movss  xmm7, [edi + eax*4 + 8]
-       unpcklps xmm3, xmm1
-       movlhps  xmm3, xmm5     
-       unpckhps xmm0, xmm1             
-       addps    xmm0, xmm3
-       movhlps  xmm3, xmm0     
-       addps    xmm0, xmm3     ; x,y sum in xmm0
-
-       movhlps  xmm1, xmm2
-       addss    xmm2, xmm1
-       shufps   xmm1, xmm1, 1b 
-       addss    xmm2, xmm1    ; z sum in xmm2
-       subps    xmm6, xmm0
-       subss    xmm7, xmm2
-       
-       movlps [edi + eax*4],     xmm6
-       movss  [edi + eax*4 + 8], xmm7
-
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .odd_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO]
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums now in the two low elements of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums now in the two low elements of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums now in the two low elements of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       mov   edx, [ebp + %$gid]  
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4    
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-        
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 740
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-       
-
-       
-proc inl3030_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$tabscale     arg     
-%$VFtab                arg
-       ;; stack offsets for local variables
-       ;; bottom of stack is 16-byte aligned for sse use
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.jxO        equ   144
-.jyO        equ   160
-.jzO         equ   176
-.jxH1       equ   192
-.jyH1       equ   208
-.jzH1        equ   224
-.jxH2       equ   240
-.jyH2       equ   256
-.jzH2        equ   272
-.dxOO        equ   288
-.dyOO        equ   304
-.dzOO        equ   320 
-.dxOH1       equ   336
-.dyOH1       equ   352
-.dzOH1       equ   368 
-.dxOH2       equ   384
-.dyOH2       equ   400
-.dzOH2       equ   416 
-.dxH1O       equ   432
-.dyH1O       equ   448
-.dzH1O       equ   464 
-.dxH1H1      equ   480
-.dyH1H1      equ   496
-.dzH1H1      equ   512 
-.dxH1H2      equ   528
-.dyH1H2      equ   544
-.dzH1H2      equ   560 
-.dxH2O       equ   576
-.dyH2O       equ   592
-.dzH2O       equ   608 
-.dxH2H1      equ   624
-.dyH2H1      equ   640
-.dzH2H1      equ   656 
-.dxH2H2      equ   672
-.dyH2H2      equ   688
-.dzH2H2      equ   704
-.qqOO        equ   720
-.qqOH        equ   736
-.qqHH        equ   752
-.two         equ   768
-.tabscale    equ   784
-.vctot       equ   800
-.fixO        equ   816
-.fiyO        equ   832
-.fizO        equ   848
-.fixH1       equ   864
-.fiyH1       equ   880
-.fizH1       equ   896
-.fixH2       equ   912
-.fiyH2       equ   928
-.fizH2       equ   944
-.fjxO       equ   960
-.fjyO        equ   976
-.fjzO        equ   992
-.fjxH1      equ  1008
-.fjyH1       equ  1024
-.fjzH1       equ  1040
-.fjxH2      equ  1056
-.fjyH2       equ  1072
-.fjzH2       equ  1088
-.half        equ  1104
-.three       equ  1120
-.rsqOO       equ  1136
-.rsqOH1      equ  1152
-.rsqOH2      equ  1168
-.rsqH1O      equ  1184
-.rsqH1H1     equ  1200
-.rsqH1H2     equ  1216
-.rsqH2O      equ  1232
-.rsqH2H1     equ  1248
-.rsqH2H2     equ  1264
-.rinvOO      equ  1280
-.rinvOH1     equ  1296
-.rinvOH2     equ  1312
-.rinvH1O     equ  1328
-.rinvH1H1    equ  1344
-.rinvH1H2    equ  1360
-.rinvH2O     equ  1376
-.rinvH2H1    equ  1392
-.rinvH2H2    equ  1408 
-.is3         equ  1424
-.ii3         equ  1428
-.innerjjnr   equ  1432
-.innerk      equ  1436
-.salign             equ  1440                                                  
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 1444           ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movss xmm3, [ebp +%$tabscale]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two],  xmm1
-       movaps [esp + .three], xmm2
-       shufps xmm3, xmm3, 0b
-       movaps [esp + .tabscale],  xmm3
-
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, xmm3        
-       movss xmm5, [edx + ebx*4 + 4]   
-       movss xmm6, [ebp + %$facel]
-       mulss  xmm3, xmm3
-       mulss  xmm4, xmm5
-       mulss  xmm5, xmm5
-       mulss  xmm3, xmm6
-       mulss  xmm4, xmm6
-       mulss  xmm5, xmm6
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .qqOO], xmm3
-       movaps [esp + .qqOH], xmm4
-       movaps [esp + .qqHH], xmm5              
-
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx 
-       
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .single_check
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4] 
-       mov   ecx, [edx + 8]
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-       
-       ; move j coordinates to local temp. variables
-       movlps xmm2, [esi + eax*4]
-       movlps xmm3, [esi + eax*4 + 12]
-       movlps xmm4, [esi + eax*4 + 24]
-
-       movlps xmm5, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 12]
-       movlps xmm7, [esi + ebx*4 + 24]
-
-       movhps xmm2, [esi + ecx*4]
-       movhps xmm3, [esi + ecx*4 + 12]
-       movhps xmm4, [esi + ecx*4 + 24]
-
-       movhps xmm5, [esi + edx*4]
-       movhps xmm6, [esi + edx*4 + 12]
-       movhps xmm7, [esi + edx*4 + 24]
-
-       ;; current state:       
-       ;;  xmm2= jxOa  jyOa  jxOc  jyOc
-       ;;  xmm3= jxH1a jyH1a jxH1c jyH1c
-       ;;  xmm4= jxH2a jyH2a jxH2c jyH2c
-       ;;  xmm5= jxOb  jyOb  jxOd  jyOd
-       ;;  xmm6= jxH1b jyH1b jxH1d jyH1d
-       ;;  xmm7= jxH2b jyH2b jxH2d jyH2d
-       
-       movaps xmm0, xmm2
-       movaps xmm1, xmm3
-       unpcklps xmm0, xmm5     ; xmm0= jxOa  jxOb  jyOa  jyOb
-       unpcklps xmm1, xmm6     ; xmm1= jxH1a jxH1b jyH1a jyH1b
-       unpckhps xmm2, xmm5     ; xmm2= jxOc  jxOd  jyOc  jyOd
-       unpckhps xmm3, xmm6     ; xmm3= jxH1c jxH1d jyH1c jyH1d 
-       movaps xmm5, xmm4
-       movaps   xmm6, xmm0
-       unpcklps xmm4, xmm7     ; xmm4= jxH2a jxH2b jyH2a jyH2b         
-       unpckhps xmm5, xmm7     ; xmm5= jxH2c jxH2d jyH2c jyH2d
-       movaps   xmm7, xmm1
-       movlhps  xmm0, xmm2     ; xmm0= jxOa  jxOb  jxOc  jxOd 
-       movaps [esp + .jxO], xmm0
-       movhlps  xmm2, xmm6     ; xmm2= jyOa  jyOb  jyOc  jyOd
-       movaps [esp + .jyO], xmm2
-       movlhps  xmm1, xmm3
-       movaps [esp + .jxH1], xmm1
-       movhlps  xmm3, xmm7
-       movaps   xmm6, xmm4
-       movaps [esp + .jyH1], xmm3
-       movlhps  xmm4, xmm5
-       movaps [esp + .jxH2], xmm4
-       movhlps  xmm5, xmm6
-       movaps [esp + .jyH2], xmm5
-
-       movss  xmm0, [esi + eax*4 + 8]
-       movss  xmm1, [esi + eax*4 + 20]
-       movss  xmm2, [esi + eax*4 + 32]
-
-       movss  xmm3, [esi + ecx*4 + 8]
-       movss  xmm4, [esi + ecx*4 + 20]
-       movss  xmm5, [esi + ecx*4 + 32]
-
-       movhps xmm0, [esi + ebx*4 + 4]
-       movhps xmm1, [esi + ebx*4 + 16]
-       movhps xmm2, [esi + ebx*4 + 28]
-       
-       movhps xmm3, [esi + edx*4 + 4]
-       movhps xmm4, [esi + edx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 28]
-       
-       shufps xmm0, xmm3, 11001100b
-       shufps xmm1, xmm4, 11001100b
-       shufps xmm2, xmm5, 11001100b
-       movaps [esp + .jzO],  xmm0
-       movaps [esp + .jzH1],  xmm1
-       movaps [esp + .jzH2],  xmm2
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixO]
-       movaps xmm4, [esp + .iyO]
-       movaps xmm5, [esp + .izO]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxOH1], xmm3
-       movaps [esp + .dyOH1], xmm4
-       movaps [esp + .dzOH1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOO], xmm0
-       movaps [esp + .rsqOH1], xmm3
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       subps  xmm3, [esp + .jxO]
-       subps  xmm4, [esp + .jyO]
-       subps  xmm5, [esp + .jzO]
-       movaps [esp + .dxOH2], xmm0
-       movaps [esp + .dyOH2], xmm1
-       movaps [esp + .dzOH2], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1O], xmm3
-       movaps [esp + .dyH1O], xmm4
-       movaps [esp + .dzH1O], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOH2], xmm0
-       movaps [esp + .rsqH1O], xmm3
-
-       movaps xmm0, [esp + .ixH1]
-       movaps xmm1, [esp + .iyH1]
-       movaps xmm2, [esp + .izH1]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH1]
-       subps  xmm1, [esp + .jyH1]
-       subps  xmm2, [esp + .jzH1]
-       subps  xmm3, [esp + .jxH2]
-       subps  xmm4, [esp + .jyH2]
-       subps  xmm5, [esp + .jzH2]
-       movaps [esp + .dxH1H1], xmm0
-       movaps [esp + .dyH1H1], xmm1
-       movaps [esp + .dzH1H1], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1H2], xmm3
-       movaps [esp + .dyH1H2], xmm4
-       movaps [esp + .dzH1H2], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqH1H1], xmm0
-       movaps [esp + .rsqH1H2], xmm3
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxH2O], xmm0
-       movaps [esp + .dyH2O], xmm1
-       movaps [esp + .dzH2O], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH2H1], xmm3
-       movaps [esp + .dyH2H1], xmm4
-       movaps [esp + .dzH2H1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       movaps [esp + .rsqH2O], xmm0
-       movaps [esp + .rsqH2H1], xmm4
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       movaps [esp + .dxH2H2], xmm0
-       movaps [esp + .dyH2H2], xmm1
-       movaps [esp + .dzH2H2], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2
-       movaps [esp + .rsqH2H2], xmm0
-               
-       ; start doing invsqrt. use rsq values in xmm0, xmm4
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinvH2H2
-       mulps   xmm7, [esp + .half] ; rinvH2H1
-       movaps  [esp + .rinvH2H2], xmm3
-       movaps  [esp + .rinvH2H1], xmm7
-               
-       rsqrtps xmm1, [esp + .rsqOO]
-       rsqrtps xmm5, [esp + .rsqOH1]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOO]
-       mulps   xmm5, [esp + .rsqOH1]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOO], xmm3
-       movaps  [esp + .rinvOH1], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqOH2]
-       rsqrtps xmm5, [esp + .rsqH1O]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOH2]
-       mulps   xmm5, [esp + .rsqH1O]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOH2], xmm3
-       movaps  [esp + .rinvH1O], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH1H1]
-       rsqrtps xmm5, [esp + .rsqH1H2]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqH1H1]
-       mulps   xmm5, [esp + .rsqH1H2]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvH1H1], xmm3
-       movaps  [esp + .rinvH1H2], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH2O]
-       movaps  xmm2, xmm1
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, [esp + .rsqH2O]
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2
-       mulps   xmm3, [esp + .half] 
-       movaps  [esp + .rinvH2O], xmm3
-
-       ;; start with OO interaction.
-       movaps xmm0, [esp + .rinvOO]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqOO] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-               
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld   mm6, 2
-       pslld   mm7, 2
-       
-        movd mm0, eax
-        movd mm1, ebx
-        movd mm2, ecx
-        movd mm3, edx
-
-        mov  esi, [ebp + %$VFtab]
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOO]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-        ; increment vcoul - then we can get rid of xmm5.
-        ;; update vctot
-        addps  xmm5, [esp + .vctot]
-       xorps  xmm2, xmm2
-        movaps [esp + .vctot], xmm5
-       mulps  xmm3, [esp + .tabscale]
-       
-       subps  xmm2, xmm3
-       mulps  xmm0, xmm2
-       
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0               
-
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOO]
-       mulps xmm1, [esp + .dyOO]
-       mulps xmm2, [esp + .dzOO]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H1 interaction
-       movaps xmm0, [esp + .rinvOH1]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqOH1] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-       
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH1]
-       mulps xmm1, [esp + .dyOH1]
-       mulps xmm2, [esp + .dzOH1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H2 interaction
-       movaps xmm0, [esp + .rinvOH2]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqOH2] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH2]
-       mulps xmm1, [esp + .dyOH2]
-       mulps xmm2, [esp + .dzOH2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; H1-O interaction
-       movaps xmm0, [esp + .rinvH1O]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH1O] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH1O]
-       mulps xmm1, [esp + .dyH1O]
-       mulps xmm2, [esp + .dzH1O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H1 interaction
-       movaps xmm0, [esp + .rinvH1H1]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH1H1] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH1H1]
-       mulps xmm1, [esp + .dyH1H1]
-       mulps xmm2, [esp + .dzH1H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H2 interaction
-       movaps xmm0, [esp + .rinvH1H2]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH1H2] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH1H2]
-       mulps xmm1, [esp + .dyH1H2]
-       mulps xmm2, [esp + .dzH1H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H2-O interaction
-       movaps xmm0, [esp + .rinvH2O]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH2O] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH2O]
-       mulps xmm1, [esp + .dyH2O]
-       mulps xmm2, [esp + .dzH2O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H1 interaction
-       movaps xmm0, [esp + .rinvH2H1]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH2H1] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH2H1]
-       mulps xmm1, [esp + .dyH2H1]
-       mulps xmm2, [esp + .dzH2H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H2 interaction
-       movaps xmm0, [esp + .rinvH2H2]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH2H2] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH2H2]
-       mulps xmm1, [esp + .dyH2H2]
-       mulps xmm2, [esp + .dzH2H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       mov edi, [ebp +%$faction]
-
-       movd eax, mm0
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-       
-       ; Did all interactions - now update j forces.
-       ; 4 j waters with three atoms each - first do a & b j particles
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd 
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpcklps xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjxOb  fjyOb
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOb  fjyOb
-       unpcklps xmm1, xmm2        ; xmm1= fjzOa  fjxH1a fjzOb  fjxH1b
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpcklps xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1b fjzH1b
-       unpcklps xmm5, xmm6        ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
-       movlhps  xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjzOa  fjxH1a  
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOb  fjyOb  fjzOb  fjxH1b
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
-       movups   xmm1, [edi + eax*4]
-       movups   xmm2, [edi + eax*4 + 16]
-       movups   xmm5, [edi + ebx*4]
-       movups   xmm6, [edi + ebx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + eax*4 + 32]
-       movss    xmm3, [edi + ebx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm7, xmm7, 1b
-       
-       movups   [edi + eax*4],     xmm1
-       movups   [edi + eax*4 + 16],xmm2
-       movups   [edi + ebx*4],     xmm5
-       movups   [edi + ebx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + eax*4 + 32], xmm0
-       movss    [edi + ebx*4 + 32], xmm3       
-
-       ;; then do the second pair (c & d)
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpckhps xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjxOd  fjyOd
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOd  fjyOd
-       unpckhps xmm1, xmm2        ; xmm1= fjzOc  fjxH1c fjzOd  fjxH1d
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpckhps xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d  
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1d fjzH1d         
-       unpckhps xmm5, xmm6        ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
-       movlhps  xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjzOc  fjxH1c 
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOd  fjyOd  fjzOd  fjxH1d
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c 
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
-       movups   xmm1, [edi + ecx*4]
-       movups   xmm2, [edi + ecx*4 + 16]
-       movups   xmm5, [edi + edx*4]
-       movups   xmm6, [edi + edx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + ecx*4 + 32]
-       movss    xmm3, [edi + edx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm4, xmm4, 10b
-       shufps   xmm7, xmm7, 11b
-       movups   [edi + ecx*4],     xmm1
-       movups   [edi + ecx*4 + 16],xmm2
-       movups   [edi + edx*4],     xmm5
-       movups   [edi + edx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + ecx*4 + 32], xmm0
-       movss    [edi + edx*4 + 32], xmm3       
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .single_check
-       jmp   .unroll_loop
-.single_check:
-       add   [esp + .innerk], dword 4
-       jnz   .single_loop
-       jmp   .updateouterdata
-.single_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-
-       ; fetch j coordinates
-       xorps xmm3, xmm3
-       xorps xmm4, xmm4
-       xorps xmm5, xmm5
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + eax*4 + 4]
-       movss xmm5, [esi + eax*4 + 8]
-
-       movlps xmm6, [esi + eax*4 + 12]
-       movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
-       ;;  fetch both z coords in one go, to positions 0 and 3 in xmm7
-       movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
-       shufps xmm6, xmm6, 11011000b    ;  xmm6=jxH1 jxH2 jyH1 jyH2
-       movlhps xmm3, xmm6              ; xmm3= jxO   0  jxH1 jxH2 
-       movaps  xmm0, [esp + .ixO]     
-       movaps  xmm1, [esp + .iyO]
-       movaps  xmm2, [esp + .izO]      
-       shufps  xmm4, xmm6, 11100100b ;  xmm4= jyO   0   jyH1 jyH2
-       shufps xmm5, xmm7, 11000100b  ;  xmm5= jzO   0   jzH1 jzH2  
-       ;;  store all j coordinates in jO
-       movaps [esp + .jxO], xmm3
-       movaps [esp + .jyO], xmm4
-       movaps [esp + .jzO], xmm5
-       subps  xmm0, xmm3
-       subps  xmm1, xmm4
-       subps  xmm2, xmm5
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2        ;  have rsq in xmm0.
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       movaps  xmm2, xmm1      
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, xmm0
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2                                                      
-       mulps   xmm3, [esp + .half] ; rinv iO - j water
-
-       movaps  xmm1, xmm3
-       mulps   xmm1, xmm0      ; xmm1=r
-       movaps  xmm0, xmm3      ; xmm0=rinv
-       mulps  xmm1, [esp + .tabscale]
-       
-       movhlps xmm2, xmm1      
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ; xmm2=eps2
-       pslld   mm6, 2
-       pslld   mm7, 2
-       
-        movd ebx, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7          ; table indices in ebx,ecx,edx
-
-       mov esi, [ebp + %$VFtab]
-       
-        movlps xmm5, [esi + ebx*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + ebx*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-
-       xorps  xmm3, xmm3
-       ;; fetch charges to xmm3 (temporary)
-       movss   xmm3, [esp + .qqOO]
-       movhps  xmm3, [esp + .qqOH]
-               
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-       
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm2, xmm2
-       mulps  xmm3, [esp + .tabscale]
-
-       subps  xmm2, xmm3
-       mulps  xmm0, xmm2
-       
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0                       
-
-       mulps   xmm0, [esp + .dxOO]
-       mulps   xmm1, [esp + .dyOO]
-       mulps   xmm2, [esp + .dzOO]
-       ;; initial update for j forces
-       xorps   xmm3, xmm3
-       xorps   xmm4, xmm4
-       xorps   xmm5, xmm5
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixO]
-       addps   xmm1, [esp + .fiyO]
-       addps   xmm2, [esp + .fizO]
-       movaps  [esp + .fixO], xmm0
-       movaps  [esp + .fiyO], xmm1
-       movaps  [esp + .fizO], xmm2
-
-       
-       ;;  done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
-       movaps  xmm0, [esp + .ixH1]
-       movaps  xmm1, [esp + .iyH1]
-       movaps  xmm2, [esp + .izH1]     
-       movaps  xmm3, [esp + .ixH2] 
-       movaps  xmm4, [esp + .iyH2] 
-       movaps  xmm5, [esp + .izH2] 
-       subps   xmm0, [esp + .jxO]
-       subps   xmm1, [esp + .jyO]
-       subps   xmm2, [esp + .jzO]
-       subps   xmm3, [esp + .jxO]
-       subps   xmm4, [esp + .jyO]
-       subps   xmm5, [esp + .jzO]
-       movaps [esp + .dxH1O], xmm0
-       movaps [esp + .dyH1O], xmm1
-       movaps [esp + .dzH1O], xmm2
-       movaps [esp + .dxH2O], xmm3
-       movaps [esp + .dyH2O], xmm4
-       movaps [esp + .dzH2O], xmm5
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       mulps xmm3, xmm3
-       mulps xmm4, xmm4
-       mulps xmm5, xmm5
-       addps xmm0, xmm1
-       addps xmm4, xmm3
-       addps xmm0, xmm2        ;  have rsqH1 in xmm0.
-       addps xmm4, xmm5        ;  have rsqH2 in xmm4.
-
-       ;;  start with H1, save H2 data
-       movaps [esp + .rsqH2O], xmm4
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinv H1 - j water
-       mulps   xmm7, [esp + .half] ; rinv H2 - j water
-
-       ;;  start with H1, save H2 data
-       movaps [esp + .rinvH2O], xmm7
-
-       movaps xmm1, xmm3
-       mulps  xmm1, xmm0       ;  xmm1=r
-       movaps xmm0, xmm3       ;  xmm0=rinv
-       mulps  xmm1, [esp + .tabscale]
-       
-       movhlps xmm2, xmm1      
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ; xmm2=eps2
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd ebx, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7          ; table indices in ebx,ecx,edx
-
-        movlps xmm5, [esi + ebx*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + ebx*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-
-       xorps  xmm3, xmm3
-       ;; fetch charges to xmm3 (temporary)
-       movss   xmm3, [esp + .qqOH]
-       movhps  xmm3, [esp + .qqHH]
-               
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5    
-
-        xorps  xmm1, xmm1
-
-        mulps xmm3, [esp + .tabscale]
-        mulps xmm3, xmm0
-        subps  xmm1, xmm3
-       
-       movaps  xmm0, xmm1
-       movaps  xmm2, xmm1
-       mulps   xmm0, [esp + .dxH1O]
-       mulps   xmm1, [esp + .dyH1O]
-       mulps   xmm2, [esp + .dzH1O]
-       ;;  update forces H1 - j water
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH1]
-       addps   xmm1, [esp + .fiyH1]
-       addps   xmm2, [esp + .fizH1]
-       movaps  [esp + .fixH1], xmm0
-       movaps  [esp + .fiyH1], xmm1
-       movaps  [esp + .fizH1], xmm2
-       ;; do table for H2 - j water interaction
-       movaps xmm0, [esp + .rinvH2O]
-       movaps xmm1, [esp + .rsqH2O]
-       mulps  xmm1, xmm0       ; xmm0=rinv, xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-       
-       movhlps xmm2, xmm1      
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ; xmm2=eps2
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd ebx, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7          ; table indices in ebx,ecx,edx
-
-        movlps xmm5, [esi + ebx*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + ebx*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-
-       xorps  xmm3, xmm3
-       ;; fetch charges to xmm3 (temporary)
-       movss   xmm3, [esp + .qqOH]
-       movhps  xmm3, [esp + .qqHH]
-               
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5    
-
-        xorps  xmm1, xmm1
-
-        mulps xmm3, [esp + .tabscale]
-        mulps xmm3, xmm0
-        subps  xmm1, xmm3
-       
-       movaps  xmm0, xmm1
-       movaps  xmm2, xmm1
-       
-       mulps   xmm0, [esp + .dxH2O]
-       mulps   xmm1, [esp + .dyH2O]
-       mulps   xmm2, [esp + .dzH2O]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       mov     esi, [ebp + %$faction]
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH2]
-       addps   xmm1, [esp + .fiyH2]
-       addps   xmm2, [esp + .fizH2]
-       movaps  [esp + .fixH2], xmm0
-       movaps  [esp + .fiyH2], xmm1
-       movaps  [esp + .fizH2], xmm2
-
-       ;; update j water forces from local variables
-       movlps  xmm0, [esi + eax*4]
-       movlps  xmm1, [esi + eax*4 + 12]
-       movhps  xmm1, [esi + eax*4 + 24]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       movaps  xmm6, xmm5
-       movaps  xmm7, xmm5
-       shufps  xmm6, xmm6, 10b
-       shufps  xmm7, xmm7, 11b
-       addss   xmm5, [esi + eax*4 + 8]
-       addss   xmm6, [esi + eax*4 + 20]
-       addss   xmm7, [esi + eax*4 + 32]
-       movss   [esi + eax*4 + 8], xmm5
-       movss   [esi + eax*4 + 20], xmm6
-       movss   [esi + eax*4 + 32], xmm7
-       movaps   xmm5, xmm3
-       unpcklps xmm3, xmm4
-       unpckhps xmm5, xmm4
-       addps    xmm0, xmm3
-       addps    xmm1, xmm5
-       movlps  [esi + eax*4], xmm0 
-       movlps  [esi + eax*4 + 12], xmm1 
-       movhps  [esi + eax*4 + 24], xmm1 
-       
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .single_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO] 
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 1444
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-       
-
-
-
-proc inl3100_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-%$tabscale      arg
-%$VFtab         arg
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.iq          equ    48
-.dx          equ    64
-.dy          equ    80
-.dz          equ    96
-.two        equ   112
-.six        equ   128
-.twelve             equ   144
-.tabscale    equ   160
-.qq          equ   176 
-.c6          equ   192
-.c12         equ   208
-.fs          equ   224
-.vctot       equ   240
-.vnbtot      equ   256
-.fix         equ   272
-.fiy         equ   288
-.fiz         equ   304
-.half        equ   320
-.three       equ   336
-.is3         equ   352
-.ii3         equ   356
-.ntia       equ   360  
-.innerjjnr   equ   364
-.innerk      equ   368
-.salign             equ   372
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 376            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movups xmm3, [sse_six]
-       movups xmm4, [sse_twelve]
-       movss xmm5, [ebp + %$tabscale]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two], xmm1
-       movaps [esp + .three],  xmm2
-       movaps [esp + .six],  xmm3
-       movaps [esp + .twelve],  xmm4
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .tabscale], xmm5
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       mulss xmm3, [ebp + %$facel]
-       shufps xmm3, xmm3, 0b
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .finish_inner
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       mulps  xmm3, xmm2
-       movd  mm2, ecx
-       movd  mm3, edx
-
-       movaps [esp + .qq], xmm3
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       movhlps xmm5, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 2
-       pslld mm7, 2
-
-       movd mm0, eax   
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6
-       psrlq mm6, 32
-       movd ecx, mm7
-       psrlq mm7, 32
-       movd ebx, mm6
-       movd edx, mm7
-
-       movlps xmm5, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm5, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; coulomb table ready, in xmm4-xmm7     
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; L-J
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm0       ; xmm4=rinvsq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-
-       movaps xmm6, xmm4
-       mulps  xmm6, xmm4
-
-       movaps [esp + .vctot], xmm5 
-
-       mulps  xmm6, xmm4       ; xmm6=rinvsix
-       movaps xmm4, xmm6
-       mulps  xmm4, xmm4       ; xmm4=rinvtwelve
-       mulps  xmm6, [esp + .c6]
-       mulps  xmm4, [esp + .c12]
-       movaps xmm7, [esp + .vnbtot]
-       addps  xmm7, xmm4
-       mulps  xmm4, [esp + .twelve]
-       subps  xmm7, xmm6
-       mulps  xmm3, [esp + .tabscale]
-       mulps  xmm6, [esp + .six]
-       movaps [esp + .vnbtot], xmm7
-       subps  xmm4, xmm6
-       mulps  xmm4, xmm0
-       subps  xmm4, xmm3
-       mulps  xmm4, xmm0
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0   
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair
-       jmp   .checksingle
-.dopair:       
-       mov esi, [ebp + %$charge]
-        mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       
-       xorps xmm7, xmm7
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       mulps  xmm3, [esp + .iq]
-       movlhps xmm3, xmm7
-       movaps [esp + .qq], xmm3
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6
-       psrlq mm6, 32
-       movd edx, mm6
-
-       movlps xmm5, [esi + ecx*4]
-       movhps xmm5, [esi + edx*4] ; got half coulomb table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8]
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b 
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; L-J
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm0       ; xmm4=rinvsq
-
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-
-       movaps xmm6, xmm4
-       mulps  xmm6, xmm4
-
-       movaps [esp + .vctot], xmm5 
-
-       mulps  xmm6, xmm4       ; xmm6=rinvsix
-       movaps xmm4, xmm6
-       mulps  xmm4, xmm4       ; xmm4=rinvtwelve
-       mulps  xmm6, [esp + .c6]
-       mulps  xmm4, [esp + .c12]
-       movaps xmm7, [esp + .vnbtot]
-       addps  xmm7, xmm4
-       mulps  xmm4, [esp + .twelve]
-       subps  xmm7, xmm6
-       mulps  xmm3, [esp + .tabscale]
-       mulps  xmm6, [esp + .six]
-       movaps [esp + .vnbtot], xmm7
-       subps  xmm4, xmm6
-       mulps  xmm4, xmm0
-       subps  xmm4, xmm3
-       mulps  xmm4, xmm0
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle:                          ; handle one odd leftover j particle, if any
-       mov   edx, [esp + .innerk]
-       and   edx, 1b                   ; test bit 0 of .innerk
-       jnz    .dosingle
-       jmp    .updateouterdata
-.dosingle:                             ; single j particle: its data is loaded into lane 0 only
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]                ; eax = jnr
-       xorps  xmm6, xmm6
-       movss xmm6, [esi + eax*4]       ; xmm6(0) has the charge, other lanes are zero
-       mulps  xmm6, [esp + .iq]        ; lane 0 = iq*charge[jnr]
-       movaps [esp + .qq], xmm6
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]          ; ecx = type[jnr]
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1                      ; two consecutive floats (c6, c12) per entry
-       add ecx, [esp + .ntia]
-       movlps xmm6, [esi + ecx*4]      ; low half of xmm6 = {c6, c12}
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    ; lane 0 = element 0 (c6)
-       shufps xmm6, xmm6, 11111101b    ; lane 0 = element 1 (c12)
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]        ; eax = 3*jnr, index of x in pos[] / faction[]
-       
-       ; move the j coordinates to lane 0 of xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr = i - j
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr for the force projection further down
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4              ; approximate 1/sqrt(rsq), refined below
-       ; lookup seed (lu) in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        ; lu*(3.0-rsq*lu*lu)
-       mulps xmm0, xmm1        ; xmm0=rinv = half*lu*(3.0-rsq*lu*lu)
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]   ; r*tabscale = table coordinate
-
-       cvttps2pi mm6, xmm4     ; mm6 = truncated table indices of the two low lanes
-       cvtpi2ps xmm6, mm6      ; back to float in the low lanes of xmm6
-       subps xmm4, xmm6        ; fractional part
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2            ; index*4; with the *4 scale below = 16 bytes (4 floats) per table point
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6           ; ebx = 4*index for lane 0
-       
-       movlps xmm4, [esi + ebx*4]      ; first two table floats of this point
-       movlps xmm6, [esi + ebx*4 + 8]  ; last two table floats of this point
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b           ; lane 0 = second float of the point
-       shufps xmm7, xmm7, 1b           ; lane 0 = fourth float of the point
-       ; table ready: lane 0 of xmm4-xmm7 holds the four floats in order
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; L-J
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm0       ; xmm4=rinvsq
-
-       ; at this point xmm5 contains vcoul and xmm3 fijC.
-       ; add vcoul to the running total now - then xmm5 is free for reuse.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-
-       movaps xmm6, xmm4
-       mulps  xmm6, xmm4
-
-       movaps [esp + .vctot], xmm5 
-
-       mulps  xmm6, xmm4       ; xmm6=rinvsix
-       movaps xmm4, xmm6
-       mulps  xmm4, xmm4       ; xmm4=rinvtwelve
-       mulps  xmm6, [esp + .c6]        ; c6*rinvsix
-       mulps  xmm4, [esp + .c12]       ; c12*rinvtwelve
-       movaps xmm7, [esp + .vnbtot]
-       addps  xmm7, xmm4
-       mulps  xmm4, [esp + .twelve]
-       subps  xmm7, xmm6               ; vnbtot += c12*rinvtwelve - c6*rinvsix
-       mulps  xmm3, [esp + .tabscale]  ; fijC*tabscale
-       mulps  xmm6, [esp + .six]
-       movaps [esp + .vnbtot], xmm7
-       subps  xmm4, xmm6               ; 12*c12*rinvtwelve - 6*c6*rinvsix
-       mulps  xmm4, xmm0
-       subps  xmm4, xmm3
-       mulps  xmm4, xmm0               ; xmm4 = scalar force: ((12*V12 - 6*V6)*rinv - fijC*tabscale)*rinv
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj: subtract lane 0 of tx-tz from faction[3*jnr .. 3*jnr+2]
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata:                      ; reduce the per-lane i accumulators and write them back to memory
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; horizontally sum the four lanes of the i forces into xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0              ; high two lanes -> low two lanes of xmm3
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; the partial sums are now in the two low lanes of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b           ; lane 0 = old lane 1
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force: faction[ii3 .. ii3+2] += summed force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force: fshift[is3 .. is3+2] += the same summed force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; edx = current pointer into gid[]
-       mov   edx, [edx]              ; edx = group index, used to index Vc[] and Vnb[] below
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; horizontal sum of the four lanes
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6       ; lane 0 = total of all four lanes
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; horizontal sum of the four lanes
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6       ; lane 0 = total of all four lanes
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last (the nri argument slot is used as a countdown)
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end              ; jecxz tests ecx directly, not the flags from dec
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms                    ; leave MMX state before returning
-       mov eax, [esp + .salign]
-       add esp, eax            ; undo the alignment adjustment recorded in .salign
-       add esp, 376            ; release the locals; NOTE(review): must match this routine's "sub esp" (not visible here) - confirm
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-proc inl3110_sse               ; SSE inner loop: table-based Coulomb (VFtab) plus Lennard-Jones (c6/c12)
-%$nri          arg             ; number of outer (i) iterations
-%$iinr         arg             ; pointer to the i particle indices
-%$jindex       arg             ; pointer to the start/end positions in jjnr[] per i entry
-%$jjnr         arg             ; pointer to the j particle indices
-%$shift                arg             ; pointer to the shift index per i entry
-%$shiftvec     arg             ; base of the shift vectors (3 floats each)
-%$fshift       arg
-%$gid          arg
-%$pos          arg             ; base of the coordinates (3 floats per particle)
-%$faction      arg             ; base of the forces (3 floats per particle)
-%$charge       arg             ; base of the per-particle charges
-%$facel                arg             ; float passed by value; multiplies the i charge
-%$Vc           arg                     
-%$type         arg             ; base of the per-particle type indices
-%$ntype        arg             ; integer passed by value; multiplies the i type
-%$nbfp         arg             ; base of the (c6, c12) parameter pairs
-%$Vnb          arg
-%$tabscale      arg            ; float passed by value; converts r to a table coordinate
-%$VFtab         arg            ; base of the interaction table (4 floats per point)
-%$nsatoms       arg            ; pointer to three integers per i entry
-       ;; stack offsets for local variables (bytes from the aligned esp)
-       ;; bottom of stack is 16-byte aligned for sse use; 16-byte vector slots come first, dword scalars follow
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.iq          equ    48
-.dx          equ    64
-.dy          equ    80
-.dz          equ    96
-.two        equ   112
-.tabscale    equ   128
-.qq          equ   144 
-.c6          equ   160
-.c12         equ   176
-.six        equ   192
-.twelve      equ   208
-.fs          equ   224
-.vctot       equ   240
-.vnbtot      equ   256
-.fix         equ   272
-.fiy         equ   288
-.fiz         equ   304
-.half        equ   320
-.three       equ   336
-.is3         equ   352
-.ii3         equ   356
-.shX        equ   360
-.shY         equ   364
-.shZ         equ   368
-.ntia       equ   372  
-.innerjjnr0  equ   376
-.innerk0     equ   380 
-.innerjjnr   equ   384
-.innerk      equ   388
-.salign             equ   392                                                  
-.nsvdwc      equ   396
-.nscoul      equ   400
-.nsvdw       equ   404
-.solnr      equ   408          
-       push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 412            ; local stack space = last offset (.solnr, 408) + 4
-       mov  eax, esp
-       and  eax, 0xf           ; eax = esp mod 16
-       sub esp, eax            ; round esp down to a 16-byte boundary so movaps on the locals is legal
-       mov [esp + .salign], eax        ; remember the adjustment so it can be undone on exit
-
-       emms
-
-       movups xmm0, [sse_half]         ; sse_* constants are defined outside this routine
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movups xmm3, [sse_six]
-       movups xmm4, [sse_twelve]
-       movss xmm5, [ebp + %$tabscale]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two], xmm1
-       movaps [esp + .three], xmm2
-       movaps [esp + .six],  xmm3
-       movaps [esp + .twelve], xmm4
-       shufps xmm5, xmm5, 0b           ; broadcast tabscale to all four lanes
-       movaps [esp + .tabscale], xmm5
-       ;; assume we have at least one i particle - start directly      
-.outer:                                ; per-i-entry setup
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movlps xmm0, [eax + ebx*4]      ;  low half = two consecutive floats (x, y)
-       movss xmm1, [eax + ebx*4 + 8]   ;  third float (z)
-       movlps [esp + .shX], xmm0       ;  fills both .shX and .shY (adjacent slots)
-       movss [esp + .shZ], xmm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       add   [ebp + %$nsatoms], dword 12       ;  advance past the three integers read below
-       mov   ecx, [eax]                ;  first integer
-       mov   edx, [eax + 4]            ;  second integer
-       mov   eax, [eax + 8]            ;  third integer
-       sub   ecx, eax                  ;  ecx = first - third
-       sub   eax, edx                  ;  eax = third - second
-       
-       mov   [esp + .nsvdwc], edx      ;  second integer
-       mov   [esp + .nscoul], eax      ;  third - second
-       mov   [esp + .nsvdw], ecx       ;  first - third
-               
-       ;; clear potential
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       mov   [esp + .solnr],  ebx      ;  .solnr starts at ii and is incremented per atom
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2                     ;  byte offset of jjnr[nj0]
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-
-       mov   ecx, [esp + .nsvdwc]
-       cmp   ecx, dword 0
-       jnz   .mno_vdwc                  ;  at least one atom with both vdw and coulomb
-       jmp   .testcoul
-.mno_vdwc:                             ; set up one i atom for the combined vdw + coulomb loop
-       mov   ebx,  [esp + .solnr]       ;  ebx = index of the current i atom
-       inc   dword [esp + .solnr]
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       mulss xmm3, [ebp + %$facel]      ;  iq = facel*charge[ii]
-       shufps xmm3, xmm3, 0b            ;  broadcast to all four lanes
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1                    ;  ntia = 2*ntype*type[ii]
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-       addss xmm0, [eax + ebx*4]        ;  shifted i coordinates
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b            ;  broadcast each coordinate to all four lanes
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear i forces
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx    ;  restart the j list for this i atom
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4              ;  pre-subtract the unroll width
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms minus 4 (mov leaves the flags from sub intact)
-       jge   .unroll_vdwc_loop          ;  at least four j atoms available
-       jmp   .finish_vdwc_inner
-.unroll_vdwc_loop:                     ; four j particles per pass, one per SSE lane
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]        ; q1
-       movss xmm4, [esi + ecx*4]        ; q3
-       movss xmm6, [esi + ebx*4]        ; q2
-       movss xmm7, [esi + edx*4]        ; q4
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b     ; {q1,q1,q2,q2}
-       shufps xmm4, xmm7, 00000000b     ; {q3,q3,q4,q4}
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3 = {q1,q2,q3,q4}
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       mulps  xmm3, xmm2       ;  qq = iq*qj per lane
-       movd  mm2, ecx
-       movd  mm3, edx
-
-       movaps [esp + .qq], xmm3
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]          ; eax-edx = type[jnr1-4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1                      ; two floats (c6, c12) per entry
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]      ; xmm6 = {c6_1,c12_1,c6_2,c12_2} after the movhps
-       movlps xmm7, [esi + ecx*4]      ; xmm7 = {c6_3,c12_3,c6_4,c12_4} after the movhps
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b    ; even elements: the four c6 values
-       shufps xmm6, xmm7, 11011101b    ; odd elements: the four c12 values
-       
-       movd  eax, mm0                  ; restore jnr1-4
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]      ; x1,y1
-       movlps xmm5, [esi + ecx*4]      ; x3,y3
-       movss xmm2, [esi + eax*4 + 8]   ; z1
-       movss xmm6, [esi + ecx*4 + 8]   ; z3
-
-       movhps xmm4, [esi + ebx*4]      ; xmm4 = {x1,y1,x2,y2}
-       movhps xmm5, [esi + edx*4]      ; xmm5 = {x3,y3,x4,y4}
-
-       movss xmm0, [esi + ebx*4 + 8]   ; z2
-       movss xmm1, [esi + edx*4 + 8]   ; z4
-
-       shufps xmm2, xmm0, 0b           ; {z1,z1,z2,z2}
-       shufps xmm6, xmm1, 0b           ; {z3,z3,z4,z4}
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b    ; xmm2 = {z1,z2,z3,z4}
-       
-       shufps xmm0, xmm5, 10001000b    ; xmm0 = {x1,x2,x3,x4}
-       shufps xmm1, xmm5, 11011101b    ; xmm1 = {y1,y2,y3,y4}
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr = i - j
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr for the force projection further down
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4              ; approximate 1/sqrt(rsq), refined below
-       ; lookup seed (lu) in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        ; lu*(3.0-rsq*lu*lu)
-       mulps xmm0, xmm1        ; xmm0=rinv = half*lu*(3.0-rsq*lu*lu)
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]   ; r*tabscale = table coordinate
-
-       movhlps xmm5, xmm4      ; lanes 2-3 moved down so they can be converted as well
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices (lanes 0-1 / lanes 2-3)
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5      ; xmm6 = the four truncated indices as floats
-       subps xmm4, xmm6        ; fractional part
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 2            ; index*4; with the *4 scale below = 16 bytes (4 floats) per table point
-       pslld mm7, 2
-
-       movd mm0, eax           ; park j3 indices in mm0-mm3 while eax-edx hold table offsets
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6           ; table offset for lane 0
-       psrlq mm6, 32
-       movd ecx, mm7           ; table offset for lane 2
-       psrlq mm7, 32
-       movd ebx, mm6           ; table offset for lane 1
-       movd edx, mm7           ; table offset for lane 3
-
-       movlps xmm5, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm5, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b    ; xmm4 = first table float of each point
-       shufps xmm5, xmm7, 11011101b    ; xmm5 = second table float of each point
-
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b    ; xmm6 = third table float of each point
-       shufps xmm7, xmm3, 11011101b    ; xmm7 = fourth table float of each point
-       ; coulomb table ready, in xmm4-xmm7     
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; L-J
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm0       ; xmm4=rinvsq
-
-       ; at this point xmm5 contains vcoul and xmm3 fijC.
-       ; add vcoul to the running total now - then xmm5 is free for reuse.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-
-       movaps xmm6, xmm4
-       mulps  xmm6, xmm4
-
-       movaps [esp + .vctot], xmm5 
-
-       mulps  xmm6, xmm4       ; xmm6=rinvsix
-       movaps xmm4, xmm6
-       mulps  xmm4, xmm4       ; xmm4=rinvtwelve
-       mulps  xmm6, [esp + .c6]        ; c6*rinvsix
-       mulps  xmm4, [esp + .c12]       ; c12*rinvtwelve
-       movaps xmm7, [esp + .vnbtot]
-       addps  xmm7, xmm4
-       mulps  xmm4, [esp + .twelve]
-       subps  xmm7, xmm6               ; vnbtot += c12*rinvtwelve - c6*rinvsix
-       mulps  xmm3, [esp + .tabscale]  ; fijC*tabscale
-       mulps  xmm6, [esp + .six]
-       movaps [esp + .vnbtot], xmm7
-       subps  xmm4, xmm6               ; 12*c12*rinvtwelve - 6*c6*rinvsix
-       mulps  xmm4, xmm0
-       subps  xmm4, xmm3
-       mulps  xmm4, xmm0               ; xmm4 = scalar force: ((12*V12 - 6*V6)*rinv - fijC*tabscale)*rinv
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0           ; restore the j3 indices parked above
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b    ; xmm3 = fjx of j1-j4
-       shufps xmm4, xmm6, 11011101b    ; xmm4 = fjy of j1-j4
-
-       ; now xmm3/xmm4 contain fjx, fjy (the z forces are handled separately below)
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b    ; lane 0 = tz of j2
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b    ; lane 0 = tz of j3
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b    ; lane 0 = tz of j4
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_vdwc_inner
-       jmp   .unroll_vdwc_loop
-.finish_vdwc_inner:                    ; fewer than four j particles remain
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4  ; undo the pre-subtraction of 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b                  ; bit 1 set -> two or three remain
-       jnz   .dopair_vdwc
-       jmp   .checksingle_vdwc
-.dopair_vdwc:                          ; two j particles, in lanes 0 and 1
-       mov esi, [ebp + %$charge]
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]                ; jnr1
-       mov   ebx, [ecx + 4]            ; jnr2
-       add   [esp + .innerjjnr], dword 8       
-       xorps xmm7, xmm7                ; zero, used to clear upper lanes below
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b    ; {q1,q1,q2,q2}
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       mulps  xmm3, [esp + .iq]
-       movlhps xmm3, xmm7              ; zero the two upper lanes of qq
-       movaps [esp + .qq], xmm3
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]          ; type[jnr1]
-       mov edx, [esi + edx*4]          ; type[jnr2]
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1                      ; two floats (c6, c12) per entry
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]      ; xmm6 = {c6_1,c12_1,c6_2,c12_2}
-       mov edi, [ebp + %$pos]  
-       
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        ; lanes 0,1 = c6_1, c6_2
-       shufps xmm6, xmm6, 1101b        ; lanes 0,1 = c12_1, c12_2
-       movlhps xmm4, xmm7              ; zero the two upper lanes
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]        ; eax/ebx = 3*jnr, index of x in pos[] / faction[]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   ; z1
-       movhps xmm1, [edi + ebx*4]      ; xmm1 = {x1,y1,x2,y2}
-       movss xmm0, [edi + ebx*4 + 8]   ; z2
-
-       movlhps xmm3, xmm7              ; xmm3 is reloaded from .qq before its next use
-       
-       shufps xmm2, xmm0, 0b           ; {z1,z1,z2,z2}
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b    ; {z1,z2,z1,z2}
-       
-       shufps xmm0, xmm0, 10001000b    ; {x1,x2,x1,x2}
-       shufps xmm1, xmm1, 11011101b    ; {y1,y2,y1,y2}
-                       
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7              ; xmm7 must still be zero for the table shuffle below
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr = i - j
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr for the force projection further down
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4              ; approximate 1/sqrt(rsq), refined below
-       ; lookup seed (lu) in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        ; lu*(3.0-rsq*lu*lu)
-       mulps xmm0, xmm1        ; xmm0=rinv = half*lu*(3.0-rsq*lu*lu)
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]   ; r*tabscale = table coordinate
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6      ; back to float in the low lanes of xmm6
-       subps xmm4, xmm6        ; fractional part
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2            ; index*4; with the *4 scale below = 16 bytes (4 floats) per table point
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6           ; table offset for lane 0
-       psrlq mm6, 32
-       movd edx, mm6           ; table offset for lane 1
-
-       movlps xmm5, [esi + ecx*4]
-       movhps xmm5, [esi + edx*4] ; got half coulomb table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b    ; first table float of each point
-       shufps xmm5, xmm7, 11011101b    ; second table float in lanes 0-1; lanes 2-3 come from the zeroed xmm7
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8]
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b    ; third table float of each point
-       shufps xmm7, xmm7, 11011101b    ; fourth table float of each point
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; L-J
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm0       ; xmm4=rinvsq
-
-       ; at this point xmm5 contains vcoul and xmm3 fijC.
-       ; add vcoul to the running total now - then xmm5 is free for reuse.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-
-       movaps xmm6, xmm4
-       mulps  xmm6, xmm4
-
-       movaps [esp + .vctot], xmm5 
-
-       mulps  xmm6, xmm4       ; xmm6=rinvsix
-       movaps xmm4, xmm6
-       mulps  xmm4, xmm4       ; xmm4=rinvtwelve
-       mulps  xmm6, [esp + .c6]        ; c6*rinvsix
-       mulps  xmm4, [esp + .c12]       ; c12*rinvtwelve
-       movaps xmm7, [esp + .vnbtot]
-       addps  xmm7, xmm4
-       mulps  xmm4, [esp + .twelve]
-       subps  xmm7, xmm6               ; vnbtot += c12*rinvtwelve - c6*rinvsix
-       mulps  xmm3, [esp + .tabscale]  ; fijC*tabscale
-       mulps  xmm6, [esp + .six]
-       movaps [esp + .vnbtot], xmm7
-       subps  xmm4, xmm6               ; 12*c12*rinvtwelve - 6*c6*rinvsix
-       mulps  xmm4, xmm0
-       subps  xmm4, xmm3
-       mulps  xmm4, xmm0               ; xmm4 = scalar force: ((12*V12 - 6*V6)*rinv - fijC*tabscale)*rinv
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       mov    edi, [ebp + %$faction]
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's: first j1 from lane 0
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b   ; swap lanes 0 and 1 so j2's force is in lane 0
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-.checksingle_vdwc:                             
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_vdwc
-       jmp    .updateouterdata_vdwc
-.dosingle_vdwc:
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-       movss xmm6, [esi + eax*4]       ; xmm6(0) has the charge        
-       mulps  xmm6, [esp + .iq]
-       movaps [esp + .qq], xmm6
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6
-                                               
-       movlps xmm4, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 8]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; L-J
-       movaps xmm4, xmm0
-       mulps  xmm4, xmm0       ; xmm4=rinvsq
-
-       ; at this point xmm5 contains vcoul and xmm3 fijC.
-       ; add vcoul into vctot - then we can get rid of xmm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-
-       movaps xmm6, xmm4
-       mulps  xmm6, xmm4
-
-       movaps [esp + .vctot], xmm5 
-
-       mulps  xmm6, xmm4       ; xmm6=rinvsix
-       movaps xmm4, xmm6
-       mulps  xmm4, xmm4       ; xmm4=rinvtwelve
-       mulps  xmm6, [esp + .c6]
-       mulps  xmm4, [esp + .c12]
-       movaps xmm7, [esp + .vnbtot]
-       addps  xmm7, xmm4
-       mulps  xmm4, [esp + .twelve]
-       subps  xmm7, xmm6
-       mulps  xmm3, [esp + .tabscale]
-       mulps  xmm6, [esp + .six]
-       movaps [esp + .vnbtot], xmm7
-       subps  xmm4, xmm6
-       mulps  xmm4, xmm0
-       subps  xmm4, xmm3
-       mulps  xmm4, xmm0
-
-       mov edi, [ebp +%$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_vdwc:         ; reduce the 4-lane i-force accumulators to scalars and add them to faction and fshift
-       mov   ecx, [esp + .ii3]         ; ecx = ii3, used below to index faction
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]         ; edx = is3, used below to index fshift
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0              ; copy the upper two lanes down so they can be added to the lower two
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums are now in the two low lanes of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b           ; bring lane 1 into lane 0
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdwc]       ; decrement the remaining .nsvdwc count
-       jz  .testcoul                   ; count reached zero: continue with the .testcoul check
-       jmp .mno_vdwc
-.testcoul:                     ; enter the .mno_coul section only when .nscoul is non-zero
-       mov  ecx, [esp + .nscoul]
-       cmp  ecx, byte 0
-       jnz  .mno_coul
-       jmp  .testvdw                   ; .nscoul == 0: go straight to the .testvdw check
-.mno_coul:                     ; per-i-atom setup for the coulomb-only inner loops (.unroll_coul_loop, pair and single tails)
-       mov   ebx,  [esp + .solnr]      ; ebx = current .solnr value ...
-       inc   dword [esp + .solnr]      ; ... and advance .solnr for the next pass
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       ; charge[ebx]
-       mulss xmm3, [ebp + %$facel]     ; multiplied by the facel argument
-       shufps xmm3, xmm3, 0b           ; broadcast lane 0 to all four lanes
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]       ; shX/shY/shZ plus pos[ii3 .. ii3+2]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b           ; broadcast each shifted i coordinate to all four lanes
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear  i forces
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       mov   ecx, [esp + .innerjjnr0]  ; restart the jjnr pointer from its saved start value
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_coul_loop         ; mov leaves the flags alone, so this tests innerk0-4 >= 0
-       jmp   .finish_coul_inner
-
-.unroll_coul_loop:             ; tabulated coulomb interaction of the current i atom with four j atoms per pass
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       mulps  xmm3, xmm2               ; qq = iq * charge[jnr1..4]
-
-       movaps [esp + .qq], xmm3        
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-       ; (x,y pairs are loaded 8 bytes at a time, z separately, then transposed with shufps)
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b    ; xmm2 = z of j1..j4
-       
-       shufps xmm0, xmm5, 10001000b    ; xmm0 = x of j1..j4
-       shufps xmm1, xmm5, 11011101b    ; xmm1 = y of j1..j4
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-       ; rinv = .half * lu * (.three - rsq*lu*lu), refining the rsqrtps estimate lu
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-       ; split r*tabscale into a truncated integer table index and the fractional part eps
-       movhlps xmm5, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 2            ; index*4: with the *4 address scale below this steps 16 bytes per table point
-       pslld mm7, 2
-
-       movd mm0, eax           ; park the j3 indices in mm0-mm3 while eax-edx hold table offsets
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6
-       psrlq mm6, 32
-       movd ecx, mm7
-       psrlq mm7, 32
-       movd ebx, mm6
-       movd edx, mm7
-
-       movlps xmm5, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm5, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b    ; xmm4 = first float of each of the four table points
-       shufps xmm5, xmm7, 11011101b    ; xmm5 = second float of each table point
-
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b    ; xmm6 = third float of each table point
-       shufps xmm7, xmm3, 11011101b    ; xmm7 = fourth float of each table point
-       ; coulomb table ready, in xmm4-xmm7     
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point xmm5 contains vcoul and xmm3 fijC.
-       ; add vcoul into vctot - then we can get rid of xmm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps  xmm4, xmm4
-       ; xmm4 = 0 - fijC*tabscale*rinv, the scalar applied to dx/dy/dz below
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0           ; restore the j3 indices saved in mm0-mm3
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3/xmm4 contain fjx and fjy for the four j atoms (fjz is handled separately below)
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b    ; lane 1 (tz for j2) -> lane 0
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b    ; lane 2 (tz for j3) -> lane 0
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b    ; lane 3 (tz for j4) -> lane 0
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_coul_inner        ; fewer than four j atoms left: handle the remainder
-       jmp   .unroll_coul_loop
-.finish_coul_inner:            ; undo the loop's 4-bias on innerk and dispatch on the 0..3 leftover j atoms
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b                  ; bit 1 set: a pair remains
-       jnz   .dopair_coul
-       jmp   .checksingle_coul
-.dopair_coul:                  ; same computation as .unroll_coul_loop, for two leftover j atoms (lanes 0 and 1)
-       mov esi, [ebp + %$charge]
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       ; advance the jjnr pointer past the two entries just read
-       xorps xmm7, xmm7
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       mulps  xmm3, [esp + .iq]
-       movlhps xmm3, xmm7              ; zero the upper two lanes of qq (xmm7 is zero)
-       movaps [esp + .qq], xmm3
-
-       mov edi, [ebp + %$pos]  
-       
-       lea   eax, [eax + eax*2]        ; eax, ebx = 3*jnr
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       movlhps xmm3, xmm7              ; NOTE(review): xmm3 is not read again before it is reloaded from .qq below - looks redundant
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b    ; lanes 0,1 = z of j1, j2 (repeated in lanes 2,3)
-       
-       shufps xmm0, xmm0, 10001000b    ; lanes 0,1 = x of j1, j2 (repeated in lanes 2,3)
-       shufps xmm1, xmm1, 11011101b    ; lanes 0,1 = y of j1, j2 (repeated in lanes 2,3)
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-       ; rinv = .half * lu * (.three - rsq*lu*lu), refining the rsqrtps estimate lu
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-       ; only the two low lanes are converted to table indices here
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2            ; index*4: with the *4 address scale below this steps 16 bytes per table point
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6
-       psrlq mm6, 32
-       movd edx, mm6
-
-       movlps xmm5, [esi + ecx*4]
-       movhps xmm5, [esi + edx*4] ; got half coulomb table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm7, 11011101b    ; second source is xmm7 (zeroed above), so lanes 2,3 become 0
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8]
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b 
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point xmm5 contains vcoul and xmm3 fijC.
-       ; add vcoul into vctot - then we can get rid of xmm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps  xmm4, xmm4
-       ; xmm4 = 0 - fijC*tabscale*rinv, the scalar applied to dx/dy/dz below
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]     ; first j atom: subtract lane 0 of tx/ty/tz
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b   ; swap lanes 0 and 1 so the second j atom's force is in lane 0
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-.checksingle_coul:             ; bit 0 of innerk set means one last j atom remains
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_coul
-       jmp    .updateouterdata_coul    ; nothing left: write back the accumulated i forces
-.dosingle_coul:                ; same computation as .unroll_coul_loop, for one leftover j atom (lane 0)
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-       movss xmm6, [esi + eax*4]       ; xmm6(0) has the charge        
-       mulps  xmm6, [esp + .iq]
-       movaps [esp + .qq], xmm6
-               
-       lea   eax, [eax + eax*2]        ; eax = 3*jnr
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-       ; rinv = .half * lu * (.three - rsq*lu*lu), refining the rsqrtps estimate lu
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2            ; index*4: with the *4 address scale below this steps 16 bytes per table point
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6           ; only the lane-0 index is used
-       
-       movlps xmm4, [esi + ebx*4]      ; first and second float of the table point
-       movlps xmm6, [esi + ebx*4 + 8]  ; third and fourth float of the table point
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b           ; lane 0 = second float
-       shufps xmm7, xmm7, 1b           ; lane 0 = fourth float
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point xmm5 contains vcoul and xmm3 fijC.
-       ; add vcoul into vctot - then we can get rid of xmm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps xmm4, xmm4
-       ; xmm4 = 0 - fijC*tabscale*rinv, the scalar applied to dx/dy/dz below
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_coul:         ; reduce the 4-lane i-force accumulators to scalars and add them to faction and fshift
-       mov   ecx, [esp + .ii3]         ; ecx = ii3, used below to index faction
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]         ; edx = is3, used below to index fshift
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0              ; copy the upper two lanes down so they can be added to the lower two
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums are now in the two low lanes of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b           ; bring lane 1 into lane 0
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ;;  loop back to mno.
-       dec dword [esp + .nscoul]       ; decrement the remaining .nscoul count
-       jz  .testvdw                    ; count reached zero: continue with the .testvdw check
-       jmp .mno_coul
-.testvdw:                      ; enter the .mno_vdw section only when .nsvdw is non-zero
-       mov  ecx, [esp + .nsvdw]
-       cmp  ecx, byte 0
-       jnz  .mno_vdw
-       jmp  .last_mno                  ; .nsvdw == 0: skip to .last_mno
-.mno_vdw:                      ; per-i-atom setup for the L-J-only inner loops (.unroll_vdw_loop, pair and single tails)
-       mov   ebx,  [esp + .solnr]      ; ebx = current .solnr value ...
-       inc   dword [esp + .solnr]      ; ... and advance .solnr for the next pass
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx        ; ntia = 2 * ntype * type[ebx]
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       addss xmm0, [eax + ebx*4]       ; shX/shY/shZ plus pos[ii3 .. ii3+2]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-       
-       xorps xmm4, xmm4                ; clear the i-force accumulators
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       shufps xmm0, xmm0, 0b           ; broadcast each shifted i coordinate to all four lanes
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   ecx, [esp + .innerjjnr0]  ; restart the jjnr pointer from its saved start value
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdw_loop          ; mov leaves the flags alone, so this tests innerk0-4 >= 0
-       jmp   .finish_vdw_inner
-.unroll_vdw_loop:              ; L-J (c6/c12) interaction of the current i atom with four j atoms per pass
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]          ; eax-edx = type[jnr1..4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi                    ; eax-edx = ntia + 2*type[j], the index into nbfp
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]      ; each load fetches two adjacent floats from nbfp
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b    ; first float of each pair (stored to .c6 below)
-       shufps xmm6, xmm7, 11011101b    ; second float of each pair (stored to .c12 below)
-       
-       movd  eax, mm0                  ; restore jnr1-4 from the mmx registers
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-       ; (x,y pairs are loaded 8 bytes at a time, z separately, then transposed with shufps)
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b    ; xmm2 = z of j1..j4
-       
-       shufps xmm0, xmm5, 10001000b    ; xmm0 = x of j1..j4
-       shufps xmm1, xmm5, 11011101b    ; xmm1 = y of j1..j4
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-       ; rinvsq = lu * (.two - rsq*lu), refining the rcpps estimate lu
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5
-       subps xmm0, xmm4
-       mulps xmm0, xmm5        ;  xmm0=rinvsq
-       movaps xmm4, xmm0
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]        ; xmm1 = vnb6  = c6  * rinvsix
-       mulps  xmm2, [esp + .c12]       ; xmm2 = vnb12 = c12 * rinvtwelve
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1               ; .twelve*vnb12 - .six*vnb6
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3/xmm4 contain fjx and fjy for the four j atoms (fjz is handled separately below)
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b    ; lane 1 (tz for j2) -> lane 0
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b    ; lane 2 (tz for j3) -> lane 0
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b    ; lane 3 (tz for j4) -> lane 0
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_vdw_inner         ; fewer than four j atoms left: handle the remainder
-       jmp   .unroll_vdw_loop
-.finish_vdw_inner:             ; undo the loop's 4-bias on innerk and dispatch on the 0..3 leftover j atoms
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b                  ; bit 1 set: a pair remains
-       jnz   .dopair_vdw
-       jmp   .checksingle_vdw
-.dopair_vdw:                   ; same computation as .unroll_vdw_loop, for two leftover j atoms (lanes 0 and 1)
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       ; advance the jjnr pointer past the two entries just read
-       xorps xmm7, xmm7
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]          ; ecx, edx = type[jnr1], type[jnr2]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi                    ; ecx, edx = ntia + 2*type[j], the index into nbfp
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7              ; zero the upper two lanes (xmm7 is zero)
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]        ; eax, ebx = 3*jnr
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       movlhps xmm3, xmm7              ; NOTE(review): xmm3 is not read before it is overwritten from .fix below - looks like dead code
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b    ; lanes 0,1 = z of j1, j2 (repeated in lanes 2,3)
-       
-       shufps xmm0, xmm0, 10001000b    ; lanes 0,1 = x of j1, j2 (repeated in lanes 2,3)
-       shufps xmm1, xmm1, 11011101b    ; lanes 0,1 = y of j1, j2 (repeated in lanes 2,3)
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-       ; rinvsq = lu * (.two - rsq*lu), refining the rcpps estimate lu
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5
-       subps xmm0, xmm4
-       mulps xmm0, xmm5        ;  xmm0=rinvsq
-       movaps xmm4, xmm0
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]        ; xmm1 = vnb6  = c6  * rinvsix
-       mulps  xmm2, [esp + .c12]       ; xmm2 = vnb12 = c12 * rinvtwelve
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1               ; .twelve*vnb12 - .six*vnb6
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]     ; first j atom: subtract lane 0 of tx/ty/tz
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b   ; swap lanes 0 and 1 so the second j atom's force is in lane 0
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-.checksingle_vdw:                              
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_vdw
-       jmp    .updateouterdata_vdw
-.dosingle_vdw:
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rcpps xmm5, xmm4
-       ; 1/x lookup seed in xmm5
-       movaps xmm0, [esp + .two]
-       mulps xmm4, xmm5
-       subps xmm0, xmm4
-       mulps xmm0, xmm5        ;  xmm0=rinvsq
-       movaps xmm4, xmm0
-       
-       movaps xmm1, xmm0
-       mulps  xmm1, xmm0
-       mulps  xmm1, xmm0       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm5, xmm2
-       subps  xmm5, xmm1       ;  vnb=vnb12-vnb6
-       addps  xmm5, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       subps  xmm2, xmm1
-       mulps  xmm4, xmm2       ; xmm4=total fscal
-       
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movaps [esp + .vnbtot], xmm5
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       mov edi, [ebp +%$faction]
-
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5
-.updateouterdata_vdw:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-       
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdw]
-       jz  .last_mno
-       jmp .mno_vdw
-.last_mno:     
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 412
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-proc inl3120_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg     
-%$tabscale     arg     
-%$VFtab                arg     
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.iqO         equ   144 
-.iqH         equ   160 
-.dxO         equ   176
-.dyO         equ   192
-.dzO         equ   208 
-.dxH1        equ   224
-.dyH1        equ   240
-.dzH1        equ   256 
-.dxH2        equ   272
-.dyH2        equ   288
-.dzH2        equ   304 
-.qqO         equ   320
-.qqH         equ   336
-.rinvO       equ   352
-.rinvH1      equ   368
-.rinvH2             equ   384          
-.rO          equ   400
-.rH1         equ   416
-.rH2         equ   432
-.tabscale    equ   448 
-.two         equ   464
-.c6          equ   480
-.c12         equ   496
-.six         equ   512
-.twelve      equ   528
-.vctot       equ   544
-.vnbtot      equ   560
-.fixO        equ   576
-.fiyO        equ   592
-.fizO        equ   608
-.fixH1       equ   624
-.fiyH1       equ   640
-.fizH1       equ   656
-.fixH2       equ   672
-.fiyH2       equ   688
-.fizH2       equ   704
-.fjx        equ   720
-.fjy         equ   736
-.fjz         equ   752
-.half        equ   768
-.three       equ   784
-.is3         equ   800
-.ii3         equ   804
-.ntia       equ   808  
-.innerjjnr   equ   812
-.innerk      equ   816
-.salign             equ   820                                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 824            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movups xmm3, [sse_six]
-       movups xmm4, [sse_twelve]
-       movss xmm5, [ebp +%$tabscale]
-       
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two],  xmm1
-       movaps [esp + .three],  xmm2
-       movaps [esp + .six],  xmm3
-       movaps [esp + .twelve],  xmm4
-       shufps xmm5, xmm5, 0b 
-       movaps [esp + .tabscale], xmm5
-       
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, [edx + ebx*4 + 4]   
-       movss xmm5, [ebp + %$facel]
-       mulss  xmm3, xmm5
-       mulss  xmm4, xmm5
-
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       movaps [esp + .iqO], xmm3
-       movaps [esp + .iqH], xmm4
-       
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       imul  ecx, [ebp + %$ntype]      ; ecx = ntia = 2*ntype*type[ii0]
-       mov   [esp + .ntia], ecx                
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .odd_inner
-.unroll_loop:
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movaps xmm4, xmm3            ;  and in xmm4
-       mulps  xmm3, [esp + .iqO]
-       mulps  xmm4, [esp + .iqH]
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-
-       movaps  [esp + .qqO], xmm3
-       movaps  [esp + .qqH], xmm4
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ixO-izO to xmm4-xmm6
-       movaps xmm4, [esp + .ixO]
-       movaps xmm5, [esp + .iyO]
-       movaps xmm6, [esp + .izO]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxO], xmm4
-       movaps [esp + .dyO], xmm5
-       movaps [esp + .dzO], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       movaps xmm7, xmm4
-       ; rsqO in xmm7
-
-       ; move ixH1-izH1 to xmm4-xmm6
-       movaps xmm4, [esp + .ixH1]
-       movaps xmm5, [esp + .iyH1]
-       movaps xmm6, [esp + .izH1]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxH1], xmm4
-       movaps [esp + .dyH1], xmm5
-       movaps [esp + .dzH1], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm6, xmm5
-       addps xmm6, xmm4
-       ; rsqH1 in xmm6
-
-       ; move ixH2-izH2 to xmm3-xmm5
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-
-       ; calc dr
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-
-       ; store dr
-       movaps [esp + .dxH2], xmm3
-       movaps [esp + .dyH2], xmm4
-       movaps [esp + .dzH2], xmm5
-       ; square it
-       mulps xmm3,xmm3
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       addps xmm5, xmm4
-       addps xmm5, xmm3
-       ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
-       ; start with rsqO - seed to xmm2        
-       rsqrtps xmm2, xmm7
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm7      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  [esp + .rinvO], xmm4    ; rinvO in xmm4
-       mulps   xmm7, xmm4
-       movaps  [esp + .rO], xmm7       
-
-       ; rsqH1 - seed in xmm2  
-       rsqrtps xmm2, xmm6
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm6      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  [esp + .rinvH1], xmm4   ; rinvH1 in xmm4
-       mulps   xmm6, xmm4
-       movaps  [esp + .rH1], xmm6
-
-       ; rsqH2 - seed to xmm2  
-       rsqrtps xmm2, xmm5
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm5      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  [esp + .rinvH2], xmm4   ; rinvH2 in xmm4
-       mulps   xmm5, xmm4
-       movaps  [esp + .rH2], xmm5
-
-       ; do O interactions
-       ;; rO is still in xmm7.
-       mulps   xmm7, [esp + .tabscale]
-       movhlps xmm4, xmm7
-       cvttps2pi mm6, xmm7
-       cvttps2pi mm7, xmm4    ; mm6/mm7 contain lu indices
-       
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm4, mm7
-        movlhps xmm3, xmm4
-       
-        subps xmm7, xmm3
-
-       movaps xmm1, xmm7       ;  xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-               
-        movd mm0, eax   
-        movd mm1, ebx
-        movd mm2, ecx
-        movd mm3, edx
-
-        mov  esi, [ebp + %$VFtab]
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqO]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm0, xmm7 ; fijC=FF*qq
-
-       ; do nontable L-J
-       movaps xmm2, [esp + .rinvO]
-       mulps  xmm2, xmm2
-
-        ; at this point mm5 contains vcoul and xmm0 fijC.
-        ; increment vcoul - then we can get rid of mm5.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5 
-
-       movaps xmm1, xmm2
-       mulps  xmm1, xmm1
-       mulps  xmm1, xmm2       ; xmm1=rinvsix
-       movaps xmm4, xmm1
-       mulps  xmm4, xmm4       ; xmm4=rinvtwelve
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm4, [esp + .c12]
-       movaps xmm3, xmm4
-       subps  xmm3, xmm1       ; xmm3=vnb12-vnb6
-       mulps  xmm1, [esp + .six]
-       mulps  xmm4, [esp + .twelve]
-       subps  xmm4, xmm1
-       addps  xmm3, [esp + .vnbtot]
-       mulps  xmm4, [esp + .rinvO]
-       mulps  xmm0, [esp + .tabscale]
-       subps  xmm4, xmm0
-       movaps [esp + .vnbtot], xmm3
-       mulps  xmm4, [esp + .rinvO]     
-
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4       ;  tx in xmm0-xmm2
-
-       ; update O forces
-       movaps xmm3, [esp + .fixO]
-       movaps xmm4, [esp + .fiyO]
-       movaps xmm7, [esp + .fizO]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixO], xmm3
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm7
-       ; update j forces with water O
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ;;  Done with O interactions - now H1!
-       movaps xmm7, [esp + .rH1]
-       mulps   xmm7, [esp + .tabscale]
-       movhlps xmm4, xmm7
-       cvttps2pi mm6, xmm7
-       cvttps2pi mm7, xmm4    ; mm6/mm7 contain lu indices
-       
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm4, mm7
-        movlhps xmm3, xmm4
-       
-        subps xmm7, xmm3
-       movaps xmm1, xmm7       ;  xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-               
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm7, xmm0 ; fijC=FF*qq
-        ; at this point mm5 contains vcoul and xmm7 fijC.
-        ; increment vcoul
-       xorps  xmm4, xmm4
-        addps  xmm5, [esp + .vctot]
-       mulps  xmm7, [esp + .rinvH1]
-        movaps [esp + .vctot], xmm5 
-       mulps  xmm7, [esp + .tabscale]
-       subps xmm4, xmm7
-
-       movaps xmm0, [esp + .dxH1]
-       movaps xmm1, [esp + .dyH1]
-       movaps xmm2, [esp + .dzH1]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update H1 forces
-       movaps xmm3, [esp + .fixH1]
-       movaps xmm4, [esp + .fiyH1]
-       movaps xmm7, [esp + .fizH1]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH1], xmm3
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm7
-       ; update j forces with water H1
-       addps  xmm0, [esp + .fjx]
-       addps  xmm1, [esp + .fjy]
-       addps  xmm2, [esp + .fjz]
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ; Done with H1, finally we do H2 interactions
-       movaps xmm7, [esp + .rH2]
-       mulps   xmm7, [esp + .tabscale]
-       movhlps xmm4, xmm7
-       cvttps2pi mm6, xmm7
-       cvttps2pi mm7, xmm4    ; mm6/mm7 contain lu indices
-       
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm4, mm7
-        movlhps xmm3, xmm4
-       
-        subps xmm7, xmm3
-       movaps xmm1, xmm7       ;  xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-               
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm7, xmm0 ; fijC=FF*qq
-        ; at this point mm5 contains vcoul and xmm0 fijC.
-        ; increment vcoul
-       xorps  xmm4, xmm4
-        addps  xmm5, [esp + .vctot]
-       mulps  xmm7, [esp + .rinvH2]
-        movaps [esp + .vctot], xmm5 
-       mulps  xmm7, [esp + .tabscale]
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dxH2]
-       movaps xmm1, [esp + .dyH2]
-       movaps xmm2, [esp + .dzH2]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-        movd eax, mm0   
-        movd ebx, mm1
-        movd ecx, mm2
-        movd edx, mm3
-       
-       ; update H2 forces
-       movaps xmm3, [esp + .fixH2]
-       movaps xmm4, [esp + .fiyH2]
-       movaps xmm7, [esp + .fizH2]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH2], xmm3
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm7
-
-       mov edi, [ebp +%$faction]
-       ; update j forces
-       addps xmm0, [esp + .fjx]
-       addps xmm1, [esp + .fjy]
-       addps xmm2, [esp + .fjz]
-
-       movlps xmm4, [edi + eax*4]
-       movlps xmm7, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm7, [edi + edx*4]
-       
-       movaps xmm3, xmm4
-       shufps xmm3, xmm7, 10001000b
-       shufps xmm4, xmm7, 11011101b                          
-       ; xmm3 has fjx, xmm4 has fjy.
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       ; unpack the back for storing.
-       movaps xmm7, xmm3
-       unpcklps xmm7, xmm4
-       unpckhps xmm3, xmm4     
-       movlps [edi + eax*4], xmm7
-       movlps [edi + ecx*4], xmm3
-       movhps [edi + ebx*4], xmm7
-       movhps [edi + edx*4], xmm3
-       ; finally z forces 
-       movss  xmm0, [edi + eax*4 + 8]
-       movss  xmm1, [edi + ebx*4 + 8]
-       movss  xmm3, [edi + ecx*4 + 8]
-       movss  xmm4, [edi + edx*4 + 8]
-       subss  xmm0, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm1, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm3, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm4, xmm2
-       movss  [edi + eax*4 + 8], xmm0
-       movss  [edi + ebx*4 + 8], xmm1
-       movss  [edi + ecx*4 + 8], xmm3
-       movss  [edi + edx*4 + 8], xmm4
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .odd_inner
-       jmp   .unroll_loop
-.odd_inner:    
-       add   [esp + .innerk], dword 4
-       jnz   .odd_loop
-       jmp   .updateouterdata
-.odd_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       xorps xmm4, xmm4
-       movss xmm4, [esp + .iqO]
-       mov esi, [ebp + %$charge] 
-       movhps xmm4, [esp + .iqH]     
-       movss xmm3, [esi + eax*4]       ; charge in xmm3
-       shufps xmm3, xmm3, 0b
-       mulps xmm3, xmm4
-       movaps [esp + .qqO], xmm3       ; use oxygen qq for storage.
-
-       xorps xmm6, xmm6
-       mov esi, [ebp + %$type]
-       mov ebx, [esi + eax*4]
-       mov esi, [ebp + %$nbfp]
-       shl ebx, 1      
-       add ebx, [esp + .ntia]
-       movlps xmm6, [esi + ebx*4]
-       movaps xmm7, xmm6
-       shufps xmm6, xmm6, 11111100b
-       shufps xmm7, xmm7, 11111101b
-       movaps [esp + .c6], xmm6
-       movaps [esp + .c12], xmm7
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-       
-       ; move j coords to xmm0-xmm2
-       movss xmm0, [esi + eax*4]
-       movss xmm1, [esi + eax*4 + 4]
-       movss xmm2, [esi + eax*4 + 8]
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       
-       movss xmm3, [esp + .ixO]
-       movss xmm4, [esp + .iyO]
-       movss xmm5, [esp + .izO]
-               
-       movlps xmm6, [esp + .ixH1]
-       movlps xmm7, [esp + .ixH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm3, xmm6
-       movlps xmm6, [esp + .iyH1]
-       movlps xmm7, [esp + .iyH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm4, xmm6
-       movlps xmm6, [esp + .izH1]
-       movlps xmm7, [esp + .izH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm5, xmm6
-
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       
-       movaps [esp + .dxO], xmm3
-       movaps [esp + .dyO], xmm4
-       movaps [esp + .dzO], xmm5
-
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       ; rsq in xmm4.
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       movaps [esp + .rinvO], xmm0
-       
-       mulps xmm4, [esp + .tabscale]
-       movhlps xmm7, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm7    ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm7, mm7
-        movlhps xmm3, xmm7
-
-       subps   xmm4, xmm3      
-       movaps xmm1, xmm4       ; xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-       
-        movd mm0, eax   
-        movd mm1, ecx
-        movd mm2, edx
-
-        mov  esi, [ebp + %$VFtab]
-        movd eax, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqO]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm0, xmm7 ; fijC=FF*qq
-        ; at this point mm5 contains vcoul and xmm0 fijC.
-        ; increment vcoul - then we can get rid of mm5.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-
-       ; do nontable L-J
-       movaps xmm2, [esp + .rinvO]
-       mulps  xmm2, xmm2
-       movaps xmm1, xmm2
-       mulps  xmm1, xmm1
-       mulps  xmm1, xmm2       ; xmm1=rinvsix
-       movaps xmm4, xmm1
-       mulps  xmm4, xmm4       ; xmm4=rinvtwelve
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm4, [esp + .c12]
-       movaps xmm3, xmm4
-       subps  xmm3, xmm1       ; xmm3=vnb12-vnb6
-       mulps  xmm1, [esp + .six]
-       mulps  xmm4, [esp + .twelve]
-       subps  xmm4, xmm1
-       addps  xmm3, [esp + .vnbtot]
-       mulps  xmm4, [esp + .rinvO]
-       mulps  xmm0, [esp + .tabscale]
-       subps  xmm4, xmm0
-       movaps [esp + .vnbtot], xmm3
-       mulps  xmm4, [esp + .rinvO]     
-               
-        movd eax, mm0   
-        movd ecx, mm1
-        movd edx, mm2  
-               
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4 ; xmm0-xmm2 now contains tx-tz (partial force)
-       movss  xmm3, [esp + .fixO]      
-       movss  xmm4, [esp + .fiyO]      
-       movss  xmm5, [esp + .fizO]      
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esp + .fixO], xmm3      
-       movss  [esp + .fiyO], xmm4      
-       movss  [esp + .fizO], xmm5      ; updated the O force. now do the H's
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       shufps xmm3, xmm3, 11100110b    ; shift right
-       shufps xmm4, xmm4, 11100110b
-       shufps xmm5, xmm5, 11100110b
-       addss  xmm3, [esp + .fixH1]
-       addss  xmm4, [esp + .fiyH1]
-       addss  xmm5, [esp + .fizH1]
-       movss  [esp + .fixH1], xmm3     
-       movss  [esp + .fiyH1], xmm4     
-       movss  [esp + .fizH1], xmm5     ; updated the H1 force. 
-
-       mov edi, [ebp + %$faction]
-       shufps xmm3, xmm3, 11100111b    ; shift right
-       shufps xmm4, xmm4, 11100111b
-       shufps xmm5, xmm5, 11100111b
-       addss  xmm3, [esp + .fixH2]
-       addss  xmm4, [esp + .fiyH2]
-       addss  xmm5, [esp + .fizH2]
-       movss  [esp + .fixH2], xmm3     
-       movss  [esp + .fiyH2], xmm4     
-       movss  [esp + .fizH2], xmm5     ; updated the H2 force. 
-
-       ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
-       xorps  xmm5, xmm5
-       movaps xmm3, xmm0
-       movlps xmm6, [edi + eax*4]
-       movss  xmm7, [edi + eax*4 + 8]
-       unpcklps xmm3, xmm1
-       movlhps  xmm3, xmm5     
-       unpckhps xmm0, xmm1             
-       addps    xmm0, xmm3
-       movhlps  xmm3, xmm0     
-       addps    xmm0, xmm3     ; x,y sum in xmm0
-
-       movhlps  xmm1, xmm2
-       addss    xmm2, xmm1
-       shufps   xmm1, xmm1, 1b 
-       addss    xmm2, xmm1    ; z sum in xmm2
-       subps    xmm6, xmm0
-       subss    xmm7, xmm2
-       
-       movlps [edi + eax*4],     xmm6
-       movss  [edi + eax*4 + 8], xmm7
-
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .odd_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO]
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       mov   edx, [ebp + %$gid]  
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4    
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-        
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 824
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-       
-
-       
-proc inl3130_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-%$tabscale     arg     
-%$VFtab                arg
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.jxO        equ   144
-.jyO        equ   160
-.jzO         equ   176
-.jxH1       equ   192
-.jyH1       equ   208
-.jzH1        equ   224
-.jxH2       equ   240
-.jyH2       equ   256
-.jzH2        equ   272
-.dxOO        equ   288
-.dyOO        equ   304
-.dzOO        equ   320 
-.dxOH1       equ   336
-.dyOH1       equ   352
-.dzOH1       equ   368 
-.dxOH2       equ   384
-.dyOH2       equ   400
-.dzOH2       equ   416 
-.dxH1O       equ   432
-.dyH1O       equ   448
-.dzH1O       equ   464 
-.dxH1H1      equ   480
-.dyH1H1      equ   496
-.dzH1H1      equ   512 
-.dxH1H2      equ   528
-.dyH1H2      equ   544
-.dzH1H2      equ   560 
-.dxH2O       equ   576
-.dyH2O       equ   592
-.dzH2O       equ   608 
-.dxH2H1      equ   624
-.dyH2H1      equ   640
-.dzH2H1      equ   656 
-.dxH2H2      equ   672
-.dyH2H2      equ   688
-.dzH2H2      equ   704
-.qqOO        equ   720
-.qqOH        equ   736
-.qqHH        equ   752
-.two         equ   768
-.tabscale    equ   784
-.c6          equ   800
-.c12        equ   816           
-.six         equ   832
-.twelve      equ   848          
-.vctot       equ   864
-.vnbtot      equ   880
-.fixO        equ   896
-.fiyO        equ   912
-.fizO        equ   928
-.fixH1       equ   944
-.fiyH1       equ   960
-.fizH1       equ   976
-.fixH2       equ   992
-.fiyH2       equ  1008
-.fizH2       equ  1024
-.fjxO       equ  1040
-.fjyO        equ  1056
-.fjzO        equ  1072
-.fjxH1      equ  1088
-.fjyH1       equ  1104
-.fjzH1       equ  1120
-.fjxH2      equ  1136
-.fjyH2       equ  1152
-.fjzH2       equ  1168
-.half        equ  1184
-.three       equ  1200
-.rsqOO       equ  1216
-.rsqOH1      equ  1232
-.rsqOH2      equ  1248
-.rsqH1O      equ  1264
-.rsqH1H1     equ  1280
-.rsqH1H2     equ  1296
-.rsqH2O      equ  1312
-.rsqH2H1     equ  1328
-.rsqH2H2     equ  1344
-.rinvOO      equ  1360
-.rinvOH1     equ  1376
-.rinvOH2     equ  1392
-.rinvH1O     equ  1408
-.rinvH1H1    equ  1424
-.rinvH1H2    equ  1440
-.rinvH2O     equ  1456
-.rinvH2H1    equ  1472
-.rinvH2H2    equ  1488
-.fstmp      equ  1504  
-.is3         equ  1520
-.ii3         equ  1524
-.innerjjnr   equ  1528
-.innerk      equ  1532
-.salign             equ  1536                                                  
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 1540           ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movups xmm3, [sse_six]
-       movups xmm4, [sse_twelve]
-       movss xmm5, [ebp +%$tabscale]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two],  xmm1
-       movaps [esp + .three], xmm2
-       movaps [esp + .six], xmm3
-       movaps [esp + .twelve], xmm4
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .tabscale],  xmm5
-
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, xmm3        
-       movss xmm5, [edx + ebx*4 + 4]   
-       movss xmm6, [ebp + %$facel]
-       mulss  xmm3, xmm3
-       mulss  xmm4, xmm5
-       mulss  xmm5, xmm5
-       mulss  xmm3, xmm6
-       mulss  xmm4, xmm6
-       mulss  xmm5, xmm6
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .qqOO], xmm3
-       movaps [esp + .qqOH], xmm4
-       movaps [esp + .qqHH], xmm5
-               
-       xorps xmm0, xmm0
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       mov   edx, ecx
-       imul  ecx, [ebp + %$ntype]      ; ecx = ntia = 2*ntype*type[ii0]
-       add   edx, ecx
-       mov   eax, [ebp + %$nbfp]
-       movlps xmm0, [eax + edx*4] 
-       movaps xmm1, xmm0
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 01010101b
-       movaps [esp + .c6], xmm0
-       movaps [esp + .c12], xmm1
-
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx 
-       
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .single_check
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4] 
-       mov   ecx, [edx + 8]
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-       
-       ; move j coordinates to local temp. variables
-       movlps xmm2, [esi + eax*4]
-       movlps xmm3, [esi + eax*4 + 12]
-       movlps xmm4, [esi + eax*4 + 24]
-
-       movlps xmm5, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 12]
-       movlps xmm7, [esi + ebx*4 + 24]
-
-       movhps xmm2, [esi + ecx*4]
-       movhps xmm3, [esi + ecx*4 + 12]
-       movhps xmm4, [esi + ecx*4 + 24]
-
-       movhps xmm5, [esi + edx*4]
-       movhps xmm6, [esi + edx*4 + 12]
-       movhps xmm7, [esi + edx*4 + 24]
-
-       ;; current state:       
-       ;;  xmm2= jxOa  jyOa  jxOc  jyOc
-       ;;  xmm3= jxH1a jyH1a jxH1c jyH1c
-       ;;  xmm4= jxH2a jyH2a jxH2c jyH2c
-       ;;  xmm5= jxOb  jyOb  jxOd  jyOd
-       ;;  xmm6= jxH1b jyH1b jxH1d jyH1d
-       ;;  xmm7= jxH2b jyH2b jxH2d jyH2d
-       
-       movaps xmm0, xmm2
-       movaps xmm1, xmm3
-       unpcklps xmm0, xmm5     ; xmm0= jxOa  jxOb  jyOa  jyOb
-       unpcklps xmm1, xmm6     ; xmm1= jxH1a jxH1b jyH1a jyH1b
-       unpckhps xmm2, xmm5     ; xmm2= jxOc  jxOd  jyOc  jyOd
-       unpckhps xmm3, xmm6     ; xmm3= jxH1c jxH1d jyH1c jyH1d 
-       movaps xmm5, xmm4
-       movaps   xmm6, xmm0
-       unpcklps xmm4, xmm7     ; xmm4= jxH2a jxH2b jyH2a jyH2b         
-       unpckhps xmm5, xmm7     ; xmm5= jxH2c jxH2d jyH2c jyH2d
-       movaps   xmm7, xmm1
-       movlhps  xmm0, xmm2     ; xmm0= jxOa  jxOb  jxOc  jxOd 
-       movaps [esp + .jxO], xmm0
-       movhlps  xmm2, xmm6     ; xmm2= jyOa  jyOb  jyOc  jyOd
-       movaps [esp + .jyO], xmm2
-       movlhps  xmm1, xmm3
-       movaps [esp + .jxH1], xmm1
-       movhlps  xmm3, xmm7
-       movaps   xmm6, xmm4
-       movaps [esp + .jyH1], xmm3
-       movlhps  xmm4, xmm5
-       movaps [esp + .jxH2], xmm4
-       movhlps  xmm5, xmm6
-       movaps [esp + .jyH2], xmm5
-
-       movss  xmm0, [esi + eax*4 + 8]
-       movss  xmm1, [esi + eax*4 + 20]
-       movss  xmm2, [esi + eax*4 + 32]
-
-       movss  xmm3, [esi + ecx*4 + 8]
-       movss  xmm4, [esi + ecx*4 + 20]
-       movss  xmm5, [esi + ecx*4 + 32]
-
-       movhps xmm0, [esi + ebx*4 + 4]
-       movhps xmm1, [esi + ebx*4 + 16]
-       movhps xmm2, [esi + ebx*4 + 28]
-       
-       movhps xmm3, [esi + edx*4 + 4]
-       movhps xmm4, [esi + edx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 28]
-       
-       shufps xmm0, xmm3, 11001100b
-       shufps xmm1, xmm4, 11001100b
-       shufps xmm2, xmm5, 11001100b
-       movaps [esp + .jzO],  xmm0
-       movaps [esp + .jzH1],  xmm1
-       movaps [esp + .jzH2],  xmm2
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixO]
-       movaps xmm4, [esp + .iyO]
-       movaps xmm5, [esp + .izO]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxOH1], xmm3
-       movaps [esp + .dyOH1], xmm4
-       movaps [esp + .dzOH1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOO], xmm0
-       movaps [esp + .rsqOH1], xmm3
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       subps  xmm3, [esp + .jxO]
-       subps  xmm4, [esp + .jyO]
-       subps  xmm5, [esp + .jzO]
-       movaps [esp + .dxOH2], xmm0
-       movaps [esp + .dyOH2], xmm1
-       movaps [esp + .dzOH2], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1O], xmm3
-       movaps [esp + .dyH1O], xmm4
-       movaps [esp + .dzH1O], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOH2], xmm0
-       movaps [esp + .rsqH1O], xmm3
-
-       movaps xmm0, [esp + .ixH1]
-       movaps xmm1, [esp + .iyH1]
-       movaps xmm2, [esp + .izH1]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH1]
-       subps  xmm1, [esp + .jyH1]
-       subps  xmm2, [esp + .jzH1]
-       subps  xmm3, [esp + .jxH2]
-       subps  xmm4, [esp + .jyH2]
-       subps  xmm5, [esp + .jzH2]
-       movaps [esp + .dxH1H1], xmm0
-       movaps [esp + .dyH1H1], xmm1
-       movaps [esp + .dzH1H1], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1H2], xmm3
-       movaps [esp + .dyH1H2], xmm4
-       movaps [esp + .dzH1H2], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqH1H1], xmm0
-       movaps [esp + .rsqH1H2], xmm3
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxH2O], xmm0
-       movaps [esp + .dyH2O], xmm1
-       movaps [esp + .dzH2O], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH2H1], xmm3
-       movaps [esp + .dyH2H1], xmm4
-       movaps [esp + .dzH2H1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       movaps [esp + .rsqH2O], xmm0
-       movaps [esp + .rsqH2H1], xmm4
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       movaps [esp + .dxH2H2], xmm0
-       movaps [esp + .dyH2H2], xmm1
-       movaps [esp + .dzH2H2], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2
-       movaps [esp + .rsqH2H2], xmm0
-               
-       ; start doing invsqrt. use rsq values in xmm0, xmm4
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinvH2H2
-       mulps   xmm7, [esp + .half] ; rinvH2H1
-       movaps  [esp + .rinvH2H2], xmm3
-       movaps  [esp + .rinvH2H1], xmm7
-               
-       rsqrtps xmm1, [esp + .rsqOO]
-       rsqrtps xmm5, [esp + .rsqOH1]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOO]
-       mulps   xmm5, [esp + .rsqOH1]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOO], xmm3
-       movaps  [esp + .rinvOH1], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqOH2]
-       rsqrtps xmm5, [esp + .rsqH1O]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOH2]
-       mulps   xmm5, [esp + .rsqH1O]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOH2], xmm3
-       movaps  [esp + .rinvH1O], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH1H1]
-       rsqrtps xmm5, [esp + .rsqH1H2]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqH1H1]
-       mulps   xmm5, [esp + .rsqH1H2]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvH1H1], xmm3
-       movaps  [esp + .rinvH1H2], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH2O]
-       movaps  xmm2, xmm1
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, [esp + .rsqH2O]
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2
-       mulps   xmm3, [esp + .half] 
-       movaps  [esp + .rinvH2O], xmm3
-
-       ;; start with OO interaction.
-       movaps xmm0, [esp + .rinvOO]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqOO] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-               
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld   mm6, 2
-       pslld   mm7, 2
-       
-        movd mm0, eax
-        movd mm1, ebx
-        movd mm2, ecx
-        movd mm3, edx
-
-        mov  esi, [ebp + %$VFtab]
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOO]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-        ; increment vcoul - then we can get rid of xmm5.
-        ;; update vctot
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       mulps  xmm3, [esp + .tabscale]
-       
-       ;;  start doing lj
-       movaps xmm2, xmm0
-       mulps  xmm2, xmm2
-       movaps xmm1, xmm2
-       mulps  xmm1, xmm2
-       mulps  xmm1, xmm2       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulps  xmm1, [esp + .c6]
-       mulps  xmm2, [esp + .c12]
-       movaps xmm4, xmm2
-       subps  xmm4, xmm1
-       addps  xmm4, [esp + .vnbtot]
-       mulps  xmm1, [esp + .six]
-       mulps  xmm2, [esp + .twelve]
-       movaps [esp + .vnbtot], xmm4
-       subps  xmm2, xmm1
-       mulps  xmm2, xmm0
-
-       subps  xmm2, xmm3
-       mulps  xmm0, xmm2
-       
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0               
-
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOO]
-       mulps xmm1, [esp + .dyOO]
-       mulps xmm2, [esp + .dzOO]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H1 interaction
-       movaps xmm0, [esp + .rinvOH1]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqOH1] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-       
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH1]
-       mulps xmm1, [esp + .dyOH1]
-       mulps xmm2, [esp + .dzOH1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H2 interaction
-       movaps xmm0, [esp + .rinvOH2]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqOH2] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH2]
-       mulps xmm1, [esp + .dyOH2]
-       mulps xmm2, [esp + .dzOH2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; H1-O interaction
-       movaps xmm0, [esp + .rinvH1O]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH1O] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH1O]
-       mulps xmm1, [esp + .dyH1O]
-       mulps xmm2, [esp + .dzH1O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H1 interaction
-       movaps xmm0, [esp + .rinvH1H1]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH1H1] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH1H1]
-       mulps xmm1, [esp + .dyH1H1]
-       mulps xmm2, [esp + .dzH1H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H2 interaction
-       movaps xmm0, [esp + .rinvH1H2]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH1H2] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH1H2]
-       mulps xmm1, [esp + .dyH1H2]
-       mulps xmm2, [esp + .dzH1H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H2-O interaction
-       movaps xmm0, [esp + .rinvH2O]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH2O] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH2O]
-       mulps xmm1, [esp + .dyH2O]
-       mulps xmm2, [esp + .dzH2O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H1 interaction
-       movaps xmm0, [esp + .rinvH2H1]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH2H1] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH2H1]
-       mulps xmm1, [esp + .dyH2H1]
-       mulps xmm2, [esp + .dzH2H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H2 interaction
-       movaps xmm0, [esp + .rinvH2H2]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH2H2] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH2H2]
-       mulps xmm1, [esp + .dyH2H2]
-       mulps xmm2, [esp + .dzH2H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       mov edi, [ebp +%$faction]
-
-       movd eax, mm0
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-       
-       ; Did all interactions - now update j forces.
-       ; 4 j waters with three atoms each - first do a & b j particles
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd 
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpcklps xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjxOb  fjyOb
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOb  fjyOb
-       unpcklps xmm1, xmm2        ; xmm1= fjzOa  fjxH1a fjzOb  fjxH1b
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpcklps xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1b fjzH1b
-       unpcklps xmm5, xmm6        ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
-       movlhps  xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjzOa  fjxH1a  
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOb  fjyOb  fjzOb  fjxH1b
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
-       movups   xmm1, [edi + eax*4]
-       movups   xmm2, [edi + eax*4 + 16]
-       movups   xmm5, [edi + ebx*4]
-       movups   xmm6, [edi + ebx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + eax*4 + 32]
-       movss    xmm3, [edi + ebx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm7, xmm7, 1b
-       
-       movups   [edi + eax*4],     xmm1
-       movups   [edi + eax*4 + 16],xmm2
-       movups   [edi + ebx*4],     xmm5
-       movups   [edi + ebx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + eax*4 + 32], xmm0
-       movss    [edi + ebx*4 + 32], xmm3       
-
-       ;; then do the second pair (c & d)
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpckhps xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjxOd  fjyOd
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOd  fjyOd
-       unpckhps xmm1, xmm2        ; xmm1= fjzOc  fjxH1c fjzOd  fjxH1d
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpckhps xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d  
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1d fjzH1d         
-       unpckhps xmm5, xmm6        ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
-       movlhps  xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjzOc  fjxH1c 
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOd  fjyOd  fjzOd  fjxH1d
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c 
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
-       movups   xmm1, [edi + ecx*4]
-       movups   xmm2, [edi + ecx*4 + 16]
-       movups   xmm5, [edi + edx*4]
-       movups   xmm6, [edi + edx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + ecx*4 + 32]
-       movss    xmm3, [edi + edx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm4, xmm4, 10b
-       shufps   xmm7, xmm7, 11b
-       movups   [edi + ecx*4],     xmm1
-       movups   [edi + ecx*4 + 16],xmm2
-       movups   [edi + edx*4],     xmm5
-       movups   [edi + edx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + ecx*4 + 32], xmm0
-       movss    [edi + edx*4 + 32], xmm3       
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .single_check
-       jmp   .unroll_loop
-.single_check:
-       add   [esp + .innerk], dword 4
-       jnz   .single_loop
-       jmp   .updateouterdata
-.single_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-
-       ; fetch j coordinates
-       xorps xmm3, xmm3
-       xorps xmm4, xmm4
-       xorps xmm5, xmm5
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + eax*4 + 4]
-       movss xmm5, [esi + eax*4 + 8]
-
-       movlps xmm6, [esi + eax*4 + 12]
-       movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
-       ;;  fetch both z coords in one go, to positions 0 and 3 in xmm7
-       movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
-       shufps xmm6, xmm6, 11011000b    ;  xmm6=jxH1 jxH2 jyH1 jyH2
-       movlhps xmm3, xmm6              ; xmm3= jxO   0  jxH1 jxH2 
-       movaps  xmm0, [esp + .ixO]     
-       movaps  xmm1, [esp + .iyO]
-       movaps  xmm2, [esp + .izO]      
-       shufps  xmm4, xmm6, 11100100b ;  xmm4= jyO   0   jyH1 jyH2
-       shufps xmm5, xmm7, 11000100b  ;  xmm5= jzO   0   jzH1 jzH2  
-       ;;  store all j coordinates in jO
-       movaps [esp + .jxO], xmm3
-       movaps [esp + .jyO], xmm4
-       movaps [esp + .jzO], xmm5
-       subps  xmm0, xmm3
-       subps  xmm1, xmm4
-       subps  xmm2, xmm5
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2        ;  have rsq in xmm0.
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       movaps  xmm2, xmm1      
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, xmm0
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2                                                      
-       mulps   xmm3, [esp + .half] ; rinv iO - j water
-
-       movaps  xmm1, xmm3
-       mulps   xmm1, xmm0      ; xmm1=r
-       movaps  xmm0, xmm3      ; xmm0=rinv
-       mulps  xmm1, [esp + .tabscale]
-       
-       movhlps xmm2, xmm1      
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ; xmm2=eps2
-       pslld   mm6, 2
-       pslld   mm7, 2
-       
-        movd ebx, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7          ; table indices in ebx,ecx,edx
-
-       mov esi, [ebp + %$VFtab]
-       
-        movlps xmm5, [esi + ebx*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + ebx*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-
-       xorps  xmm3, xmm3
-       ;; fetch charges to xmm3 (temporary)
-       movss   xmm3, [esp + .qqOO]
-       movhps  xmm3, [esp + .qqOH]
-               
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-       
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-
-       mulps  xmm3, [esp + .tabscale]
-       
-       ;;  start doing lj
-       xorps  xmm2, xmm2
-       movss  xmm2, xmm0
-       mulss  xmm2, xmm2
-       movaps xmm1, xmm2
-       mulss  xmm1, xmm2
-       mulss  xmm1, xmm2       ;  xmm1=rinvsix
-       movaps xmm2, xmm1
-       mulss  xmm2, xmm2       ;  xmm2=rinvtwelve
-       mulss  xmm1, [esp + .c6]
-       mulss  xmm2, [esp + .c12]
-       movaps xmm4, xmm2
-       subss  xmm4, xmm1
-       addps  xmm4, [esp + .vnbtot]
-       mulss  xmm1, [esp + .six]
-       mulss  xmm2, [esp + .twelve]
-       movaps [esp + .vnbtot], xmm4
-       subss  xmm2, xmm1
-       mulss  xmm2, xmm0
-
-       subps  xmm2, xmm3
-       mulps  xmm0, xmm2
-       
-       movaps xmm1, xmm0
-       movaps xmm2, xmm0                       
-
-       mulps   xmm0, [esp + .dxOO]
-       mulps   xmm1, [esp + .dyOO]
-       mulps   xmm2, [esp + .dzOO]
-       ;; initial update for j forces
-       xorps   xmm3, xmm3
-       xorps   xmm4, xmm4
-       xorps   xmm5, xmm5
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixO]
-       addps   xmm1, [esp + .fiyO]
-       addps   xmm2, [esp + .fizO]
-       movaps  [esp + .fixO], xmm0
-       movaps  [esp + .fiyO], xmm1
-       movaps  [esp + .fizO], xmm2
-
-       
-       ;;  done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
-       movaps  xmm0, [esp + .ixH1]
-       movaps  xmm1, [esp + .iyH1]
-       movaps  xmm2, [esp + .izH1]     
-       movaps  xmm3, [esp + .ixH2] 
-       movaps  xmm4, [esp + .iyH2] 
-       movaps  xmm5, [esp + .izH2] 
-       subps   xmm0, [esp + .jxO]
-       subps   xmm1, [esp + .jyO]
-       subps   xmm2, [esp + .jzO]
-       subps   xmm3, [esp + .jxO]
-       subps   xmm4, [esp + .jyO]
-       subps   xmm5, [esp + .jzO]
-       movaps [esp + .dxH1O], xmm0
-       movaps [esp + .dyH1O], xmm1
-       movaps [esp + .dzH1O], xmm2
-       movaps [esp + .dxH2O], xmm3
-       movaps [esp + .dyH2O], xmm4
-       movaps [esp + .dzH2O], xmm5
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       mulps xmm3, xmm3
-       mulps xmm4, xmm4
-       mulps xmm5, xmm5
-       addps xmm0, xmm1
-       addps xmm4, xmm3
-       addps xmm0, xmm2        ;  have rsqH1 in xmm0.
-       addps xmm4, xmm5        ;  have rsqH2 in xmm4.
-
-       ;;  start with H1, save H2 data
-       movaps [esp + .rsqH2O], xmm4
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinv H1 - j water
-       mulps   xmm7, [esp + .half] ; rinv H2 - j water
-
-       ;;  start with H1, save H2 data
-       movaps [esp + .rinvH2O], xmm7
-
-       movaps xmm1, xmm3
-       mulps  xmm1, xmm0       ;  xmm1=r
-       movaps xmm0, xmm3       ;  xmm0=rinv
-       mulps  xmm1, [esp + .tabscale]
-       
-       movhlps xmm2, xmm1      
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ; xmm2=eps2
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd ebx, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7          ; table indices in ebx,ecx,edx
-
-        movlps xmm5, [esi + ebx*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + ebx*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-
-       xorps  xmm3, xmm3
-       ;; fetch charges to xmm3 (temporary)
-       movss   xmm3, [esp + .qqOH]
-       movhps  xmm3, [esp + .qqHH]
-               
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5    
-
-        xorps  xmm1, xmm1
-
-        mulps xmm3, [esp + .tabscale]
-        mulps xmm3, xmm0
-        subps  xmm1, xmm3
-       
-       movaps  xmm0, xmm1
-       movaps  xmm2, xmm1
-       mulps   xmm0, [esp + .dxH1O]
-       mulps   xmm1, [esp + .dyH1O]
-       mulps   xmm2, [esp + .dzH1O]
-       ;;  update forces H1 - j water
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH1]
-       addps   xmm1, [esp + .fiyH1]
-       addps   xmm2, [esp + .fizH1]
-       movaps  [esp + .fixH1], xmm0
-       movaps  [esp + .fiyH1], xmm1
-       movaps  [esp + .fizH1], xmm2
-       ;; do table for H2 - j water interaction
-       movaps xmm0, [esp + .rinvH2O]
-       movaps xmm1, [esp + .rsqH2O]
-       mulps  xmm1, xmm0       ; xmm0=rinv, xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-       
-       movhlps xmm2, xmm1      
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ; xmm2=eps2
-       pslld   mm6, 2
-       pslld   mm7, 2
-
-        movd ebx, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7          ; table indices in ebx,ecx,edx
-
-        movlps xmm5, [esi + ebx*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + ebx*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-
-       xorps  xmm3, xmm3
-       ;; fetch charges to xmm3 (temporary)
-       movss   xmm3, [esp + .qqOH]
-       movhps  xmm3, [esp + .qqHH]
-               
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5    
-
-        xorps  xmm1, xmm1
-
-        mulps xmm3, [esp + .tabscale]
-        mulps xmm3, xmm0
-        subps  xmm1, xmm3
-       
-       movaps  xmm0, xmm1
-       movaps  xmm2, xmm1
-       
-       mulps   xmm0, [esp + .dxH2O]
-       mulps   xmm1, [esp + .dyH2O]
-       mulps   xmm2, [esp + .dzH2O]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       mov     esi, [ebp + %$faction]
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH2]
-       addps   xmm1, [esp + .fiyH2]
-       addps   xmm2, [esp + .fizH2]
-       movaps  [esp + .fixH2], xmm0
-       movaps  [esp + .fiyH2], xmm1
-       movaps  [esp + .fizH2], xmm2
-
-       ;; update j water forces from local variables
-       movlps  xmm0, [esi + eax*4]
-       movlps  xmm1, [esi + eax*4 + 12]
-       movhps  xmm1, [esi + eax*4 + 24]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       movaps  xmm6, xmm5
-       movaps  xmm7, xmm5
-       shufps  xmm6, xmm6, 10b
-       shufps  xmm7, xmm7, 11b
-       addss   xmm5, [esi + eax*4 + 8]
-       addss   xmm6, [esi + eax*4 + 20]
-       addss   xmm7, [esi + eax*4 + 32]
-       movss   [esi + eax*4 + 8], xmm5
-       movss   [esi + eax*4 + 20], xmm6
-       movss   [esi + eax*4 + 32], xmm7
-       movaps   xmm5, xmm3
-       unpcklps xmm3, xmm4
-       unpckhps xmm5, xmm4
-       addps    xmm0, xmm3
-       addps    xmm1, xmm5
-       movlps  [esi + eax*4], xmm0 
-       movlps  [esi + eax*4 + 12], xmm1 
-       movhps  [esi + eax*4 + 24], xmm1 
-       
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .single_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO] 
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 1540
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-       
-
-
-proc inl3300_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-%$tabscale      arg
-%$VFtab         arg
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.iq          equ    48
-.dx          equ    64
-.dy          equ    80
-.dz          equ    96
-.two        equ   112
-.tabscale    equ   128
-.qq          equ   144 
-.c6          equ   160
-.c12         equ   176
-.fs          equ   192
-.vctot       equ   208
-.vnbtot      equ   224
-.fix         equ   240
-.fiy         equ   256
-.fiz         equ   272
-.half        equ   288
-.three       equ   304
-.is3         equ   320
-.ii3         equ   324
-.ntia       equ   328  
-.innerjjnr   equ   332
-.innerk      equ   336
-.salign             equ   340                                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 344            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movss xmm3, [ebp + %$tabscale]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two], xmm1
-       movaps [esp + .three],  xmm2
-       shufps xmm3, xmm3, 0b
-       movaps [esp + .tabscale], xmm3
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       mulss xmm3, [ebp + %$facel]
-       shufps xmm3, xmm3, 0b
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .finish_inner
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       mulps  xmm3, xmm2
-       movd  mm2, ecx
-       movd  mm3, edx
-
-       movaps [esp + .qq], xmm3
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       movhlps xmm5, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 2
-       pslld mm7, 2
-
-       movd mm0, eax   
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6
-       psrlq mm6, 32
-       movd ecx, mm7
-       psrlq mm7, 32
-       movd ebx, mm6
-       movd edx, mm7
-
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       lea   ecx, [ecx + ecx*2]
-       lea   edx, [edx + edx*2]
-               
-       movlps xmm5, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm5, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; coulomb table ready, in xmm4-xmm7     
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       ; put scalar force on stack temporarily...
-       movaps [esp + .fs], xmm3
-
-       ; dispersion
-       movlps xmm5, [esi + eax*4 + 16]
-       movlps xmm7, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + ebx*4 + 16]
-       movhps xmm7, [esi + edx*4 + 16] ; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-       
-       movlps xmm7, [esi + eax*4 + 24]
-       movlps xmm3, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + ebx*4 + 24]
-       movhps xmm3, [esi + edx*4 + 24] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-       addps  xmm7, [esp + .fs] ; add to fscal
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + eax*4 + 32]
-       movlps xmm7, [esi + ecx*4 + 32]
-       movhps xmm5, [esi + ebx*4 + 32]
-       movhps xmm7, [esi + edx*4 + 32] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 40]
-       movlps xmm3, [esi + ecx*4 + 40]
-       movhps xmm7, [esi + ebx*4 + 40]
-       movhps xmm3, [esi + edx*4 + 40] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0   
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_inner
-       jmp   .unroll_loop
-.finish_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair
-       jmp   .checksingle
-.dopair:       
-       mov esi, [ebp + %$charge]
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       
-       xorps xmm7, xmm7
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       mulps  xmm3, [esp + .iq]
-       movlhps xmm3, xmm7
-       movaps [esp + .qq], xmm3
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6
-       psrlq mm6, 32
-       movd edx, mm6
-       lea   ecx, [ecx + ecx*2]
-       lea   edx, [edx + edx*2]
-
-       movlps xmm5, [esi + ecx*4]
-       movhps xmm5, [esi + edx*4] ; got half coulomb table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8]
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b 
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point xmm5 contains vcoul and xmm3 fijC.
-       ; increment vcoul - then we can get rid of xmm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       ; put scalar force on stack temporarily...
-       movaps [esp + .fs], xmm3
-
-       ; dispersion
-       movlps xmm5, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 16]; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm5, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + edx*4 + 24] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-       addps  xmm7, [esp + .fs] ; add to fscal
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + ecx*4 + 32]
-       movhps xmm5, [esi + edx*4 + 32] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + ecx*4 + 40]
-       movhps xmm7, [esi + edx*4 + 40] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle:                          
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle
-       jmp    .updateouterdata
-.dosingle:
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-       movss xmm6, [esi + eax*4]       ; xmm6(0) has the charge        
-       mulps  xmm6, [esp + .iq]
-       movaps [esp + .qq], xmm6
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6
-       
-       lea  ebx, [ebx + ebx*2]
-                                               
-       movlps xmm4, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 8]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point xmm5 contains vcoul and xmm3 fijC.
-       ; increment vcoul - then we can get rid of xmm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       ; put scalar force on stack temporarily...
-       movaps [esp + .fs], xmm3
-
-       ; dispersion
-       movlps xmm4, [esi + ebx*4 + 16]
-       movlps xmm6, [esi + ebx*4 + 24]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-       addps  xmm7, [esp + .fs] ; add to fscal
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm4, [esi + ebx*4 + 32]
-       movlps xmm6, [esi + ebx*4 + 40]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; the partial sums are now in the two low elements of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 344
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-
-
-proc inl3310_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-%$tabscale      arg
-%$VFtab         arg
-%$nsatoms       arg                    
-       ;; stack offsets for local variables
-       ;; bottom of stack is 16-byte aligned for sse use (required by movaps)
-.ix         equ     0
-.iy         equ    16
-.iz          equ    32
-.iq          equ    48
-.dx          equ    64
-.dy          equ    80
-.dz          equ    96
-.two        equ   112
-.tabscale    equ   128
-.qq          equ   144 
-.c6          equ   160
-.c12         equ   176
-.fs          equ   192
-.vctot       equ   208
-.vnbtot      equ   224
-.fix         equ   240
-.fiy         equ   256
-.fiz         equ   272
-.half        equ   288
-.three       equ   304
-.is3         equ   320
-.ii3         equ   324
-.shX        equ   328
-.shY         equ   332
-.shZ         equ   336
-.ntia       equ   340  
-.innerjjnr0  equ   344
-.innerk0     equ   348 
-.innerjjnr   equ   352
-.innerk      equ   356
-.salign             equ   360                                                  
-.nsvdwc      equ   364
-.nscoul      equ   368
-.nsvdw       equ   372
-.solnr      equ   376          
-       push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 380            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movss xmm3, [ebp + %$tabscale]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two], xmm1
-       movaps [esp + .three], xmm2
-       shufps xmm3, xmm3, 0b
-       movaps [esp + .tabscale], xmm3
-
-       ;; assume we have at least one i particle - start directly      
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movlps xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 8] 
-       movlps [esp + .shX], xmm0
-       movss [esp + .shZ], xmm1
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   eax, [ebp + %$nsatoms]
-       add   [ebp + %$nsatoms], dword 12
-       mov   ecx, [eax]        
-       mov   edx, [eax + 4]
-       mov   eax, [eax + 8]    
-       sub   ecx, eax
-       sub   eax, edx
-       
-       mov   [esp + .nsvdwc], edx
-       mov   [esp + .nscoul], eax
-       mov   [esp + .nsvdw], ecx
-               
-       ;; clear potential
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       mov   [esp + .solnr],  ebx
-
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr0], eax     ;  pointer to jjnr[nj0]
-       mov   [esp + .innerk0], edx        ;  number of innerloop atoms
-
-       mov   ecx, [esp + .nsvdwc]
-       cmp   ecx, dword 0
-       jnz   .mno_vdwc
-       jmp   .testcoul
-.mno_vdwc:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       mulss xmm3, [ebp + %$facel]
-       shufps xmm3, xmm3, 0b
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear i forces
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-       
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdwc_loop
-       jmp   .finish_vdwc_inner
-.unroll_vdwc_loop:     
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       mulps  xmm3, xmm2
-       movd  mm2, ecx
-       movd  mm3, edx
-
-       movaps [esp + .qq], xmm3
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       movhlps xmm5, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 2
-       pslld mm7, 2
-
-       movd mm0, eax   
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6
-       psrlq mm6, 32
-       movd ecx, mm7
-       psrlq mm7, 32
-       movd ebx, mm6
-       movd edx, mm7
-
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       lea   ecx, [ecx + ecx*2]
-       lea   edx, [edx + edx*2]
-               
-       movlps xmm5, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm5, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; coulomb table ready, in xmm4-xmm7     
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point xmm5 contains vcoul and xmm3 fijC.
-       ; increment vcoul - then we can get rid of xmm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       ; put scalar force on stack temporarily...
-       movaps [esp + .fs], xmm3
-
-       ; dispersion
-       movlps xmm5, [esi + eax*4 + 16]
-       movlps xmm7, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + ebx*4 + 16]
-       movhps xmm7, [esi + edx*4 + 16] ; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-       
-       movlps xmm7, [esi + eax*4 + 24]
-       movlps xmm3, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + ebx*4 + 24]
-       movhps xmm3, [esi + edx*4 + 24] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-       addps  xmm7, [esp + .fs] ; add to fscal
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + eax*4 + 32]
-       movlps xmm7, [esi + ecx*4 + 32]
-       movhps xmm5, [esi + ebx*4 + 32]
-       movhps xmm7, [esi + edx*4 + 32] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 40]
-       movlps xmm3, [esi + ecx*4 + 40]
-       movhps xmm7, [esi + ebx*4 + 40]
-       movhps xmm3, [esi + edx*4 + 40] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0   
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3 and xmm4 contain fjx and fjy (the z forces are updated separately below)
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_vdwc_inner
-       jmp   .unroll_vdwc_loop
-.finish_vdwc_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair_vdwc
-       jmp   .checksingle_vdwc
-.dopair_vdwc:  
-       mov esi, [ebp + %$charge]
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       
-       xorps xmm7, xmm7
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       mulps  xmm3, [esp + .iq]
-       movlhps xmm3, xmm7
-       movaps [esp + .qq], xmm3
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6
-       psrlq mm6, 32
-       movd edx, mm6
-       lea   ecx, [ecx + ecx*2]
-       lea   edx, [edx + edx*2]
-
-       movlps xmm5, [esi + ecx*4]
-       movhps xmm5, [esi + edx*4] ; got half coulomb table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm5, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8]
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b 
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       ; put scalar force on stack temporarily...
-       movaps [esp + .fs], xmm3
-
-       ; dispersion
-       movlps xmm5, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 16]; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm5, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + edx*4 + 24] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-       addps  xmm7, [esp + .fs] ; add to fscal
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + ecx*4 + 32]
-       movhps xmm5, [esi + edx*4 + 32] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm5, 11011101b 
-
-       movlps xmm7, [esi + ecx*4 + 40]
-       movhps xmm7, [esi + edx*4 + 40] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle_vdwc:                             
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_vdwc
-       jmp    .updateouterdata_vdwc
-.dosingle_vdwc:
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-       movss xmm6, [esi + eax*4]       ; xmm6(0) has the charge        
-       mulps  xmm6, [esp + .iq]
-       movaps [esp + .qq], xmm6
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6
-       
-       lea  ebx, [ebx + ebx*2]
-                                               
-       movlps xmm4, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 8]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       ; put scalar force on stack temporarily...
-       movaps [esp + .fs], xmm3
-
-       ; dispersion
-       movlps xmm4, [esi + ebx*4 + 16]
-       movlps xmm6, [esi + ebx*4 + 24]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-       addps  xmm7, [esp + .fs] ; add to fscal
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm4, [esi + ebx*4 + 32]
-       movlps xmm6, [esi + ebx*4 + 40]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_vdwc:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdwc]
-       jz  .testcoul
-       jmp .mno_vdwc
-.testcoul:
-       mov  ecx, [esp + .nscoul]
-       cmp  ecx, byte 0
-       jnz  .mno_coul
-       jmp  .testvdw
-.mno_coul:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       mulss xmm3, [ebp + %$facel]
-       shufps xmm3, xmm3, 0b
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-
-       movaps [esp + .iq], xmm3
-       
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   [esp + .ii3], ebx
-       
-       ; clear  i forces
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_coul_loop
-       jmp   .finish_coul_inner
-
-.unroll_coul_loop:     
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       movaps xmm2, [esp + .iq]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       mulps  xmm3, xmm2
-
-       movaps [esp + .qq], xmm3        
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       movhlps xmm5, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 2
-       pslld mm7, 2
-
-       movd mm0, eax   
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6
-       psrlq mm6, 32
-       movd ecx, mm7
-       psrlq mm7, 32
-       movd ebx, mm6
-       movd edx, mm7
-
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       lea   ecx, [ecx + ecx*2]
-       lea   edx, [edx + edx*2]
-               
-       movlps xmm5, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm5, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; coulomb table ready, in xmm4-xmm7     
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps  xmm4, xmm4
-
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0   
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_coul_inner
-       jmp   .unroll_coul_loop
-.finish_coul_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair_coul
-       jmp   .checksingle_coul
-.dopair_coul:  
-       mov esi, [ebp + %$charge]
-
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       
-       xorps xmm7, xmm7
-       movss xmm3, [esi + eax*4]               
-       movss xmm6, [esi + ebx*4]
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm3, xmm3, 00001000b ; xmm3(0,1) has the charges.
-
-       mulps  xmm3, [esp + .iq]
-       movlhps xmm3, xmm7
-       movaps [esp + .qq], xmm3
-
-       mov edi, [ebp + %$pos]  
-       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6
-       psrlq mm6, 32
-       movd edx, mm6
-
-       lea   ecx, [ecx + ecx*2]
-       lea   edx, [edx + edx*2]
-
-       movlps xmm5, [esi + ecx*4]
-       movhps xmm5, [esi + edx*4] ; got half coulomb table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8]
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b 
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps  xmm4, xmm4
-
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle_coul:                             
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_coul
-       jmp    .updateouterdata_coul
-.dosingle_coul:
-       mov esi, [ebp + %$charge]
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-       movss xmm6, [esi + eax*4]       ; xmm6(0) has the charge        
-       mulps  xmm6, [esp + .iq]
-       movaps [esp + .qq], xmm6
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6
-       
-       lea   ebx, [ebx + ebx*2]
-
-       movlps xmm4, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 8]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       movaps xmm3, [esp + .qq]
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       mulps  xmm5, xmm3 ; vcoul=qq*VV
-       mulps  xmm3, xmm7 ; fijC=FF*qq
-       ; at this point mm5 contains vcoul and mm3 fijC.
-       ; increment vcoul - then we can get rid of mm5.
-       ;; update vctot
-       addps  xmm5, [esp + .vctot]
-       movaps [esp + .vctot], xmm5 
-
-       xorps xmm4, xmm4
-
-       mulps xmm3, [esp + .tabscale]
-       mulps xmm3, xmm0
-       subps  xmm4, xmm3
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_coul:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-
-       ;;  loop back to mno.
-       dec dword [esp + .nscoul]
-       jz  .testvdw
-       jmp .mno_coul
-.testvdw:
-       mov  ecx, [esp + .nsvdw]
-       cmp  ecx, byte 0
-       jnz  .mno_vdw
-       jmp  .last_mno
-.mno_vdw:
-       mov   ebx,  [esp + .solnr]
-       inc   dword [esp + .solnr]
-
-        mov   edx, [ebp + %$type] 
-        mov   edx, [edx + ebx*4]
-        imul  edx, [ebp + %$ntype]
-        shl   edx, 1
-        mov   [esp + .ntia], edx
-               
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       movss xmm0, [esp + .shX]
-       movss xmm1, [esp + .shY]
-       movss xmm2, [esp + .shZ]
-
-       addss xmm0, [eax + ebx*4]
-       addss xmm1, [eax + ebx*4 + 4]
-       addss xmm2, [eax + ebx*4 + 8]
-       
-       xorps xmm4, xmm4
-       movaps [esp + .fix], xmm4
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm4
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-
-       movaps [esp + .ix], xmm0
-       movaps [esp + .iy], xmm1
-       movaps [esp + .iz], xmm2
-
-       mov   ecx, [esp + .innerjjnr0]
-       mov   [esp + .innerjjnr], ecx
-       mov   edx, [esp + .innerk0]
-        sub   edx, dword 4
-        mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_vdw_loop
-       jmp   .finish_vdw_inner
-.unroll_vdw_loop:      
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-       
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ix-iz to xmm4-xmm6
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       movhlps xmm5, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm5     ; mm6/mm7 contain lu indices
-       cvtpi2ps xmm6, mm6
-       cvtpi2ps xmm5, mm7
-       movlhps xmm6, xmm5
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-       pslld mm6, 2
-       pslld mm7, 2
-
-       movd mm0, eax   
-       movd mm1, ebx
-       movd mm2, ecx
-       movd mm3, edx
-
-       mov  esi, [ebp + %$VFtab]
-       movd eax, mm6
-       psrlq mm6, 32
-       movd ecx, mm7
-       psrlq mm7, 32
-       movd ebx, mm6
-       movd edx, mm7
-
-       lea   eax, [eax + eax*2] 
-       lea   ebx, [ebx + ebx*2] 
-       lea   ecx, [ecx + ecx*2] 
-       lea   edx, [edx + edx*2] 
-
-       ; dispersion
-       movlps xmm5, [esi + eax*4 + 0]
-       movlps xmm7, [esi + ecx*4 + 0]
-       movhps xmm5, [esi + ebx*4 + 0]
-       movhps xmm7, [esi + edx*4 + 0] ; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-       
-       movlps xmm7, [esi + eax*4 + 8]
-       movlps xmm3, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + ebx*4 + 8]
-       movhps xmm3, [esi + edx*4 + 8] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + eax*4 + 16]
-       movlps xmm7, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + ebx*4 + 16]
-       movhps xmm7, [esi + edx*4 + 16] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + eax*4 + 24]
-       movlps xmm3, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + ebx*4 + 24]
-       movhps xmm3, [esi + edx*4 + 24] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       movd eax, mm0   
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-
-       mov    edi, [ebp + %$faction]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; the fj's - start by accumulating x & y forces from memory
-       movlps xmm4, [edi + eax*4]
-       movlps xmm6, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm6, [edi + edx*4]
-
-       movaps xmm3, xmm4
-       shufps xmm3, xmm6, 10001000b
-       shufps xmm4, xmm6, 11011101b                          
-
-       ; now xmm3-xmm5 contains fjx, fjy, fjz
-       subps  xmm3, xmm0
-       subps  xmm4, xmm1
-       
-       ; unpack them back so we can store them - first x & y in xmm3/xmm4
-
-       movaps xmm6, xmm3
-       unpcklps xmm6, xmm4
-       unpckhps xmm3, xmm4     
-       ; xmm6(l)=x & y for j1, (h) for j2
-       ; xmm3(l)=x & y for j3, (h) for j4
-       movlps [edi + eax*4], xmm6
-       movlps [edi + ecx*4], xmm3
-       
-       movhps [edi + ebx*4], xmm6
-       movhps [edi + edx*4], xmm3
-
-       ;;  and the z forces
-       movss  xmm4, [edi + eax*4 + 8]
-       movss  xmm5, [edi + ebx*4 + 8]
-       movss  xmm6, [edi + ecx*4 + 8]
-       movss  xmm7, [edi + edx*4 + 8]
-       subss  xmm4, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm5, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm6, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm7, xmm2
-       movss  [edi + eax*4 + 8], xmm4
-       movss  [edi + ebx*4 + 8], xmm5
-       movss  [edi + ecx*4 + 8], xmm6
-       movss  [edi + edx*4 + 8], xmm7
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .finish_vdw_inner
-       jmp   .unroll_vdw_loop
-.finish_vdw_inner:
-       ;;  check if at least two particles remain
-       add   [esp + .innerk], dword 4
-       mov   edx, [esp + .innerk]
-       and   edx, 10b
-       jnz   .dopair_vdw
-       jmp   .checksingle_vdw
-.dopair_vdw:   
-        mov   ecx, [esp + .innerjjnr]
-       
-       mov   eax, [ecx]        
-       mov   ebx, [ecx + 4]              
-       add   [esp + .innerjjnr], dword 8       
-       xorps xmm7, xmm7
-
-       mov esi, [ebp + %$type]
-       mov   ecx, eax
-       mov   edx, ebx
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add ecx, edi
-       add edx, edi
-       movlps xmm6, [esi + ecx*4]
-       movhps xmm6, [esi + edx*4]
-       mov edi, [ebp + %$pos]  
-       
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 1000b        
-       shufps xmm6, xmm6, 1101b
-       movlhps xmm4, xmm7
-       movlhps xmm6, xmm7
-       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-                       
-       lea   eax, [eax + eax*2]
-       lea   ebx, [ebx + ebx*2]
-       ; move coordinates to xmm0-xmm2
-       movlps xmm1, [edi + eax*4]
-       movss xmm2, [edi + eax*4 + 8]   
-       movhps xmm1, [edi + ebx*4]
-       movss xmm0, [edi + ebx*4 + 8]   
-
-       movlhps xmm3, xmm7
-       
-       shufps xmm2, xmm0, 0b
-       
-       movaps xmm0, xmm1
-
-       shufps xmm2, xmm2, 10001000b
-       
-       shufps xmm0, xmm0, 10001000b
-       shufps xmm1, xmm1, 11011101b
-                       
-       mov    edi, [ebp + %$faction]
-       ; move ix-iz to xmm4-xmm6
-       xorps   xmm7, xmm7
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ecx, mm6
-       psrlq mm6, 32
-       movd edx, mm6
-
-       lea   ecx, [ecx + ecx*2] 
-       lea   edx, [edx + edx*2] 
-
-       ; dispersion
-       movlps xmm5, [esi + ecx*4 + 0]
-       movhps xmm5, [esi + edx*4 + 0]; got half dispersion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm4, 10001000b
-       shufps xmm5, xmm5, 11011101b 
-       
-       movlps xmm7, [esi + ecx*4 + 8]
-       movhps xmm7, [esi + edx*4 + 8] ; other half of dispersion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 10001000b
-       shufps xmm7, xmm7, 11011101b
-       ; dispersion table ready, in xmm4-xmm7  
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm5, [esi + ecx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 16] ; got half repulsion table
-       movaps xmm4, xmm5
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm5, xmm7, 11011101b 
-
-       movlps xmm7, [esi + ecx*4 + 24]
-       movhps xmm7, [esi + edx*4 + 24] ; other half of repulsion table
-       movaps xmm6, xmm7
-       shufps xmm6, xmm3, 10001000b
-       shufps xmm7, xmm3, 11011101b 
-       ; table ready, in xmm4-xmm7     
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update the fj's
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-
-       shufps  xmm0, xmm0, 11100001b
-       shufps  xmm1, xmm1, 11100001b
-       shufps  xmm2, xmm2, 11100001b
-
-       movss   xmm3, [edi + ebx*4]
-       movss   xmm4, [edi + ebx*4 + 4]
-       movss   xmm5, [edi + ebx*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + ebx*4], xmm3
-       movss   [edi + ebx*4 + 4], xmm4
-       movss   [edi + ebx*4 + 8], xmm5 
-
-.checksingle_vdw:                              
-       mov   edx, [esp + .innerk]
-       and   edx, 1b
-       jnz    .dosingle_vdw
-       jmp    .updateouterdata_vdw
-.dosingle_vdw:
-       mov edi, [ebp + %$pos]
-       mov   ecx, [esp + .innerjjnr]
-       mov   eax, [ecx]        
-       xorps  xmm6, xmm6
-
-       mov esi, [ebp + %$type]
-       mov ecx, eax
-       mov ecx, [esi + ecx*4]  
-       mov esi, [ebp + %$nbfp]
-       shl ecx, 1
-       add ecx, [esp + .ntia]
-       movlps xmm6, [esi + ecx*4]
-       movaps xmm4, xmm6
-       shufps xmm4, xmm4, 11111100b    
-       shufps xmm6, xmm6, 11111101b    
-                       
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6       
-               
-       lea   eax, [eax + eax*2]
-       
-       ; move coordinates to xmm0-xmm2
-       movss xmm0, [edi + eax*4]       
-       movss xmm1, [edi + eax*4 + 4]   
-       movss xmm2, [edi + eax*4 + 8]    
-       
-       movaps xmm4, [esp + .ix]
-       movaps xmm5, [esp + .iy]
-       movaps xmm6, [esp + .iz]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dx], xmm4
-       movaps [esp + .dy], xmm5
-       movaps [esp + .dz], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       ; rsq in xmm4
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-
-       mulps xmm4, xmm0        ; xmm4=r
-       mulps xmm4, [esp + .tabscale]
-
-       cvttps2pi mm6, xmm4     ; mm6 contain lu indices
-       cvtpi2ps xmm6, mm6
-       subps xmm4, xmm6        
-       movaps xmm1, xmm4       ;xmm1=eps
-       movaps xmm2, xmm1       
-       mulps  xmm2, xmm2       ;xmm2=eps2
-
-       pslld mm6, 2
-
-       mov  esi, [ebp + %$VFtab]
-       movd ebx, mm6
-
-       lea   ebx, [ebx + ebx*2]        
-
-       ; dispersion
-       movlps xmm4, [esi + ebx*4 + 0]
-       movlps xmm6, [esi + ebx*4 + 8]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-
-       movaps xmm4, [esp + .c6]
-       mulps  xmm7, xmm4        ; fijD
-       mulps  xmm5, xmm4        ; vnb6
-
-       ; put scalar force on stack. Update vnbtot directly.
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .fs], xmm7
-       movaps [esp + .vnbtot], xmm5
-
-       ; repulsion
-       movlps xmm4, [esi + ebx*4 + 16]
-       movlps xmm6, [esi + ebx*4 + 24]
-       movaps xmm5, xmm4
-       movaps xmm7, xmm6
-       shufps xmm5, xmm5, 1b
-       shufps xmm7, xmm7, 1b
-       ; table ready in xmm4-xmm7
-       
-       mulps  xmm6, xmm1       ; xmm6=Geps
-       mulps  xmm7, xmm2       ; xmm7=Heps2
-       addps  xmm5, xmm6
-       addps  xmm5, xmm7       ; xmm5=Fp       
-       mulps  xmm7, [esp + .two]       ; two*Heps2
-       addps  xmm7, xmm6
-       addps  xmm7, xmm5 ; xmm7=FF
-       mulps  xmm5, xmm1 ; xmm5=eps*Fp
-       addps  xmm5, xmm4 ; xmm5=VV
-       
-       movaps xmm4, [esp + .c12]
-       mulps  xmm7, xmm4 ; fijR
-       mulps  xmm5, xmm4 ; vnb12
-       addps  xmm7, [esp + .fs] 
-       
-       addps  xmm5, [esp + .vnbtot]
-       movaps [esp + .vnbtot], xmm5
-       xorps  xmm4, xmm4
-
-       mulps xmm7, [esp + .tabscale]
-       mulps xmm7, xmm0
-       subps  xmm4, xmm7
-       mov    edi, [ebp + %$faction]
-
-       movaps xmm0, [esp + .dx]
-       movaps xmm1, [esp + .dy]
-       movaps xmm2, [esp + .dz]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-       ; xmm0-xmm2 contains tx-tz (partial force)
-       ; now update f_i 
-       movaps xmm3, [esp + .fix]
-       movaps xmm4, [esp + .fiy]
-       movaps xmm5, [esp + .fiz]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm5, xmm2
-       movaps [esp + .fix], xmm3
-       movaps [esp + .fiy], xmm4
-       movaps [esp + .fiz], xmm5
-       ; update fj
-       
-       movss   xmm3, [edi + eax*4]
-       movss   xmm4, [edi + eax*4 + 4]
-       movss   xmm5, [edi + eax*4 + 8]
-       subss   xmm3, xmm0
-       subss   xmm4, xmm1
-       subss   xmm5, xmm2      
-       movss   [edi + eax*4], xmm3
-       movss   [edi + eax*4 + 4], xmm4
-       movss   [edi + eax*4 + 8], xmm5 
-.updateouterdata_vdw:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fix]
-       movaps xmm1, [esp + .fiy]
-       movaps xmm2, [esp + .fiz]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; increment fshift force
-       movss  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 4]
-       movss  xmm5, [esi + edx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esi + edx*4],     xmm3
-       movss  [esi + edx*4 + 4], xmm4
-       movss  [esi + edx*4 + 8], xmm5
-       
-       ;;  loop back to mno.
-       dec dword [esp + .nsvdw]
-       jz  .last_mno
-       jmp .mno_vdw
-.last_mno:     
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 380
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
-
-proc inl3320_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg     
-%$tabscale     arg     
-%$VFtab                arg     
-       ;; stack offsets for local variables
-       ;; bottom of stack is cache-aligned for sse use
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.iqO         equ   144 
-.iqH         equ   160 
-.dxO         equ   176
-.dyO         equ   192
-.dzO         equ   208 
-.dxH1        equ   224
-.dyH1        equ   240
-.dzH1        equ   256 
-.dxH2        equ   272
-.dyH2        equ   288
-.dzH2        equ   304 
-.qqO         equ   320
-.qqH         equ   336
-.rinvO       equ   352
-.rinvH1      equ   368
-.rinvH2             equ   384          
-.rO          equ   400
-.rH1         equ   416
-.rH2         equ   432
-.tabscale    equ   448 
-.two         equ   464
-.c6          equ   480
-.c12         equ   496
-.vctot       equ   512
-.vnbtot      equ   528
-.fixO        equ   544
-.fiyO        equ   560
-.fizO        equ   576
-.fixH1       equ   592
-.fiyH1       equ   608
-.fizH1       equ   624
-.fixH2       equ   640
-.fiyH2       equ   656
-.fizH2       equ   672
-.fjx        equ   688
-.fjy         equ   704
-.fjz         equ   720
-.half        equ   736
-.three       equ   752
-.is3         equ   768
-.ii3         equ   772
-.ntia       equ   776  
-.innerjjnr   equ   780
-.innerk      equ   784
-.salign             equ   788                                                          
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 792            ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movss xmm3, [ebp +%$tabscale]
-       
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two],  xmm1
-       movaps [esp + .three],  xmm2
-       shufps xmm3, xmm3, 0b 
-       movaps [esp + .tabscale], xmm3
-       
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, [edx + ebx*4 + 4]   
-       movss xmm5, [ebp + %$facel]
-       mulss  xmm3, xmm5
-       mulss  xmm4, xmm5
-
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       movaps [esp + .iqO], xmm3
-       movaps [esp + .iqH], xmm4
-       
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       imul  ecx, [ebp + %$ntype]      ; ecx = ntia = 2*ntype*type[ii0]
-       mov   [esp + .ntia], ecx                
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx
-
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-       
-       ; clear vctot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .odd_inner
-.unroll_loop:
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4]              
-       mov   ecx, [edx + 8]            
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$charge]        ; base of charge[]
-       
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + ecx*4]
-       movss xmm6, [esi + ebx*4]
-       movss xmm7, [esi + edx*4]
-
-       shufps xmm3, xmm6, 00000000b 
-       shufps xmm4, xmm7, 00000000b 
-       shufps xmm3, xmm4, 10001000b ;  all charges in xmm3
-       movaps xmm4, xmm3            ;  and in xmm4
-       mulps  xmm3, [esp + .iqO]
-       mulps  xmm4, [esp + .iqH]
-
-       movd  mm0, eax          ;  use mmx registers as temp. storage
-       movd  mm1, ebx
-       movd  mm2, ecx
-       movd  mm3, edx
-
-       movaps  [esp + .qqO], xmm3
-       movaps  [esp + .qqH], xmm4
-       
-       mov esi, [ebp + %$type]
-       mov eax, [esi + eax*4]
-       mov ebx, [esi + ebx*4]
-       mov ecx, [esi + ecx*4]
-       mov edx, [esi + edx*4]
-       mov esi, [ebp + %$nbfp]
-       shl eax, 1      
-       shl ebx, 1      
-       shl ecx, 1      
-       shl edx, 1      
-       mov edi, [esp + .ntia]
-       add eax, edi
-       add ebx, edi
-       add ecx, edi
-       add edx, edi
-
-       movlps xmm6, [esi + eax*4]
-       movlps xmm7, [esi + ecx*4]
-       movhps xmm6, [esi + ebx*4]
-       movhps xmm7, [esi + edx*4]
-
-       movaps xmm4, xmm6
-       shufps xmm4, xmm7, 10001000b
-       shufps xmm6, xmm7, 11011101b
-       
-       movd  eax, mm0          
-       movd  ebx, mm1
-       movd  ecx, mm2
-       movd  edx, mm3
-
-       movaps [esp + .c6], xmm4
-       movaps [esp + .c12], xmm6
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-
-       ; move four coordinates to xmm0-xmm2    
-       movlps xmm4, [esi + eax*4]
-       movlps xmm5, [esi + ecx*4]
-       movss xmm2, [esi + eax*4 + 8]
-       movss xmm6, [esi + ecx*4 + 8]
-
-       movhps xmm4, [esi + ebx*4]
-       movhps xmm5, [esi + edx*4]
-
-       movss xmm0, [esi + ebx*4 + 8]
-       movss xmm1, [esi + edx*4 + 8]
-
-       shufps xmm2, xmm0, 0b
-       shufps xmm6, xmm1, 0b
-       
-       movaps xmm0, xmm4
-       movaps xmm1, xmm4
-
-       shufps xmm2, xmm6, 10001000b
-       
-       shufps xmm0, xmm5, 10001000b
-       shufps xmm1, xmm5, 11011101b            
-
-       ; move ixO-izO to xmm4-xmm6
-       movaps xmm4, [esp + .ixO]
-       movaps xmm5, [esp + .iyO]
-       movaps xmm6, [esp + .izO]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxO], xmm4
-       movaps [esp + .dyO], xmm5
-       movaps [esp + .dzO], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm4, xmm5
-       addps xmm4, xmm6
-       movaps xmm7, xmm4
-       ; rsqO in xmm7
-
-       ; move ixH1-izH1 to xmm4-xmm6
-       movaps xmm4, [esp + .ixH1]
-       movaps xmm5, [esp + .iyH1]
-       movaps xmm6, [esp + .izH1]
-
-       ; calc dr
-       subps xmm4, xmm0
-       subps xmm5, xmm1
-       subps xmm6, xmm2
-
-       ; store dr
-       movaps [esp + .dxH1], xmm4
-       movaps [esp + .dyH1], xmm5
-       movaps [esp + .dzH1], xmm6
-       ; square it
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       mulps xmm6,xmm6
-       addps xmm6, xmm5
-       addps xmm6, xmm4
-       ; rsqH1 in xmm6
-
-       ; move ixH2-izH2 to xmm3-xmm5
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-
-       ; calc dr
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-
-       ; store dr
-       movaps [esp + .dxH2], xmm3
-       movaps [esp + .dyH2], xmm4
-       movaps [esp + .dzH2], xmm5
-       ; square it
-       mulps xmm3,xmm3
-       mulps xmm4,xmm4
-       mulps xmm5,xmm5
-       addps xmm5, xmm4
-       addps xmm5, xmm3
-       ; rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
-
-       ; start with rsqO - seed to xmm2        
-       rsqrtps xmm2, xmm7
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm7      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  [esp + .rinvO], xmm4    ; rinvO in xmm4
-       mulps   xmm7, xmm4
-       movaps  [esp + .rO], xmm7       
-
-       ; rsqH1 - seed in xmm2  
-       rsqrtps xmm2, xmm6
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm6      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  [esp + .rinvH1], xmm4   ; rinvH1 in xmm4
-       mulps   xmm6, xmm4
-       movaps  [esp + .rH1], xmm6
-
-       ; rsqH2 - seed to xmm2  
-       rsqrtps xmm2, xmm5
-       movaps  xmm3, xmm2
-       mulps   xmm2, xmm2
-       movaps  xmm4, [esp + .three]
-       mulps   xmm2, xmm5      ; rsq*lu*lu
-       subps   xmm4, xmm2      ; 3.0-rsq*lu*lu
-       mulps   xmm4, xmm3      ; lu*(3-rsq*lu*lu)
-       mulps   xmm4, [esp + .half]
-       movaps  [esp + .rinvH2], xmm4   ; rinvH2 in xmm4
-       mulps   xmm5, xmm4
-       movaps  [esp + .rH2], xmm5
-
-       ; do O interactions
-       ;; rO is still in xmm7.
-       mulps   xmm7, [esp + .tabscale]
-       movhlps xmm4, xmm7
-       cvttps2pi mm6, xmm7
-       cvttps2pi mm7, xmm4    ; mm6/mm7 contain lu indices
-       
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm4, mm7
-        movlhps xmm3, xmm4
-       
-        subps xmm7, xmm3
-
-       movaps xmm1, xmm7       ;  xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-               
-        movd mm0, eax   
-        movd mm1, ebx
-        movd mm2, ecx
-        movd mm3, edx
-
-        mov  esi, [ebp + %$VFtab]
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqO]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm0, xmm7 ; fijC=FF*qq
-        ; at this point mm5 contains vcoul and xmm0 fijC.
-        ; increment vcoul - then we can get rid of mm5.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5 
-
-        ; dispersion
-        movlps xmm5, [esi + eax*4 + 16]
-        movlps xmm7, [esi + ecx*4 + 16]
-        movhps xmm5, [esi + ebx*4 + 16]
-        movhps xmm7, [esi + edx*4 + 16] ; got half dispersion table
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-        
-        movlps xmm7, [esi + eax*4 + 24]
-        movlps xmm3, [esi + ecx*4 + 24]
-        movhps xmm7, [esi + ebx*4 + 24]
-        movhps xmm3, [esi + edx*4 + 24] ; other half of dispersion table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; dispersion table ready, in xmm4-xmm7  
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-
-        movaps xmm4, [esp + .c6]
-        mulps  xmm7, xmm4        ; fijD
-        mulps  xmm5, xmm4        ; vnb6
-        addps  xmm0, xmm7 ; add to fscal
-
-        ; Update vnbtot directly.
-        addps  xmm5, [esp + .vnbtot]
-        movaps [esp + .vnbtot], xmm5
-
-        ; repulsion
-        movlps xmm5, [esi + eax*4 + 32]
-        movlps xmm7, [esi + ecx*4 + 32]
-        movhps xmm5, [esi + ebx*4 + 32]
-        movhps xmm7, [esi + edx*4 + 32] ; got half repulsion table
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 40]
-        movlps xmm3, [esi + ecx*4 + 40]
-        movhps xmm7, [esi + ebx*4 + 40]
-        movhps xmm3, [esi + edx*4 + 40] ; other half of repulsion table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; repulsion table ready, in xmm4-xmm7          
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-
-        movaps xmm4, [esp + .c12]
-        mulps  xmm7, xmm4        ; fijD
-        mulps  xmm5, xmm4        ; vnb12
-        addps  xmm7, xmm0 ; add to fscal
-        addps  xmm5, [esp + .vnbtot] ;  total nonbonded potential in xmm5.
-       xorps xmm4, xmm4
-       
-       mulps  xmm7, [esp + .rinvO] ;  total fscal now in xmm7
-
-       mulps  xmm7, [esp + .tabscale]
-        movaps [esp + .vnbtot], xmm5
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4       ;  tx in xmm0-xmm2
-
-       ; update O forces
-       movaps xmm3, [esp + .fixO]
-       movaps xmm4, [esp + .fiyO]
-       movaps xmm7, [esp + .fizO]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixO], xmm3
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm7
-       ; update j forces with water O
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ;;  Done with O interactions - now H1!
-       movaps xmm7, [esp + .rH1]
-       mulps   xmm7, [esp + .tabscale]
-       movhlps xmm4, xmm7
-       cvttps2pi mm6, xmm7
-       cvttps2pi mm7, xmm4    ; mm6/mm7 contain lu indices
-       
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm4, mm7
-        movlhps xmm3, xmm4
-       
-        subps xmm7, xmm3
-       movaps xmm1, xmm7       ;  xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-               
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm7, xmm0 ; fijC=FF*qq
-        ; at this point mm5 contains vcoul and xmm7 fijC.
-        ; increment vcoul
-       xorps  xmm4, xmm4
-        addps  xmm5, [esp + .vctot]
-       mulps  xmm7, [esp + .rinvH1]
-        movaps [esp + .vctot], xmm5 
-       mulps  xmm7, [esp + .tabscale]
-       subps xmm4, xmm7
-
-       movaps xmm0, [esp + .dxH1]
-       movaps xmm1, [esp + .dyH1]
-       movaps xmm2, [esp + .dzH1]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-       ; update H1 forces
-       movaps xmm3, [esp + .fixH1]
-       movaps xmm4, [esp + .fiyH1]
-       movaps xmm7, [esp + .fizH1]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH1], xmm3
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm7
-       ; update j forces with water H1
-       addps  xmm0, [esp + .fjx]
-       addps  xmm1, [esp + .fjy]
-       addps  xmm2, [esp + .fjz]
-       movaps [esp + .fjx], xmm0
-       movaps [esp + .fjy], xmm1
-       movaps [esp + .fjz], xmm2
-
-       ; Done with H1, finally we do H2 interactions
-       movaps xmm7, [esp + .rH2]
-       mulps   xmm7, [esp + .tabscale]
-       movhlps xmm4, xmm7
-       cvttps2pi mm6, xmm7
-       cvttps2pi mm7, xmm4    ; mm6/mm7 contain lu indices
-       
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm4, mm7
-        movlhps xmm3, xmm4
-       
-        subps xmm7, xmm3
-       movaps xmm1, xmm7       ;  xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-               
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm7, xmm0 ; fijC=FF*qq
-        ; at this point mm5 contains vcoul and xmm0 fijC.
-        ; increment vcoul
-       xorps  xmm4, xmm4
-        addps  xmm5, [esp + .vctot]
-       mulps  xmm7, [esp + .rinvH2]
-        movaps [esp + .vctot], xmm5 
-       mulps  xmm7, [esp + .tabscale]
-       subps  xmm4, xmm7
-
-       movaps xmm0, [esp + .dxH2]
-       movaps xmm1, [esp + .dyH2]
-       movaps xmm2, [esp + .dzH2]
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4
-
-        movd eax, mm0   
-        movd ebx, mm1
-        movd ecx, mm2
-        movd edx, mm3
-       
-       ; update H2 forces
-       movaps xmm3, [esp + .fixH2]
-       movaps xmm4, [esp + .fiyH2]
-       movaps xmm7, [esp + .fizH2]
-       addps  xmm3, xmm0
-       addps  xmm4, xmm1
-       addps  xmm7, xmm2
-       movaps [esp + .fixH2], xmm3
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm7
-
-       mov edi, [ebp +%$faction]
-       ; update j forces
-       addps xmm0, [esp + .fjx]
-       addps xmm1, [esp + .fjy]
-       addps xmm2, [esp + .fjz]
-
-       movlps xmm4, [edi + eax*4]
-       movlps xmm7, [edi + ecx*4]
-       movhps xmm4, [edi + ebx*4]
-       movhps xmm7, [edi + edx*4]
-       
-       movaps xmm3, xmm4
-       shufps xmm3, xmm7, 10001000b
-       shufps xmm4, xmm7, 11011101b                          
-       ; xmm3 has fjx, xmm4 has fjy.
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       ; unpack the back for storing.
-       movaps xmm7, xmm3
-       unpcklps xmm7, xmm4
-       unpckhps xmm3, xmm4     
-       movlps [edi + eax*4], xmm7
-       movlps [edi + ecx*4], xmm3
-       movhps [edi + ebx*4], xmm7
-       movhps [edi + edx*4], xmm3
-       ; finally z forces 
-       movss  xmm0, [edi + eax*4 + 8]
-       movss  xmm1, [edi + ebx*4 + 8]
-       movss  xmm3, [edi + ecx*4 + 8]
-       movss  xmm4, [edi + edx*4 + 8]
-       subss  xmm0, xmm2
-       shufps xmm2, xmm2, 11100101b
-       subss  xmm1, xmm2
-       shufps xmm2, xmm2, 11101010b
-       subss  xmm3, xmm2
-       shufps xmm2, xmm2, 11111111b
-       subss  xmm4, xmm2
-       movss  [edi + eax*4 + 8], xmm0
-       movss  [edi + ebx*4 + 8], xmm1
-       movss  [edi + ecx*4 + 8], xmm3
-       movss  [edi + edx*4 + 8], xmm4
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4
-       jl    .odd_inner
-       jmp   .unroll_loop
-.odd_inner:    
-       add   [esp + .innerk], dword 4
-       jnz   .odd_loop
-       jmp   .updateouterdata
-.odd_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       xorps xmm4, xmm4
-       movss xmm4, [esp + .iqO]
-       mov esi, [ebp + %$charge] 
-       movhps xmm4, [esp + .iqH]     
-       movss xmm3, [esi + eax*4]       ; charge in xmm3
-       shufps xmm3, xmm3, 0b
-       mulps xmm3, xmm4
-       movaps [esp + .qqO], xmm3       ; use oxygen qq for storage.
-
-       xorps xmm6, xmm6
-       mov esi, [ebp + %$type]
-       mov ebx, [esi + eax*4]
-       mov esi, [ebp + %$nbfp]
-       shl ebx, 1      
-       add ebx, [esp + .ntia]
-       movlps xmm6, [esi + ebx*4]
-       movaps xmm7, xmm6
-       shufps xmm6, xmm6, 11111100b
-       shufps xmm7, xmm7, 11111101b
-       movaps [esp + .c6], xmm6
-       movaps [esp + .c12], xmm7
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-       
-       ; move j coords to xmm0-xmm2
-       movss xmm0, [esi + eax*4]
-       movss xmm1, [esi + eax*4 + 4]
-       movss xmm2, [esi + eax*4 + 8]
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       
-       movss xmm3, [esp + .ixO]
-       movss xmm4, [esp + .iyO]
-       movss xmm5, [esp + .izO]
-               
-       movlps xmm6, [esp + .ixH1]
-       movlps xmm7, [esp + .ixH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm3, xmm6
-       movlps xmm6, [esp + .iyH1]
-       movlps xmm7, [esp + .iyH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm4, xmm6
-       movlps xmm6, [esp + .izH1]
-       movlps xmm7, [esp + .izH2]
-       unpcklps xmm6, xmm7
-       movlhps xmm5, xmm6
-
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       
-       movaps [esp + .dxO], xmm3
-       movaps [esp + .dyO], xmm4
-       movaps [esp + .dzO], xmm5
-
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       ; rsq in xmm4.
-
-       rsqrtps xmm5, xmm4
-       ; lookup seed in xmm5
-       movaps xmm2, xmm5
-       mulps xmm5, xmm5
-       movaps xmm1, [esp + .three]
-       mulps xmm5, xmm4        ;rsq*lu*lu                      
-       movaps xmm0, [esp + .half]
-       subps xmm1, xmm5        ; 3.0-rsq*lu*lu
-       mulps xmm1, xmm2        
-       mulps xmm0, xmm1        ; xmm0=rinv
-       mulps xmm4, xmm0        ; xmm4=r
-       movaps [esp + .rinvO], xmm0
-       
-       mulps xmm4, [esp + .tabscale]
-       movhlps xmm7, xmm4
-       cvttps2pi mm6, xmm4
-       cvttps2pi mm7, xmm7    ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm7, mm7
-        movlhps xmm3, xmm7
-
-       subps   xmm4, xmm3      
-       movaps xmm1, xmm4       ; xmm1=eps
-       movaps xmm2, xmm1
-       mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-       
-        movd mm0, eax   
-        movd mm1, ecx
-        movd mm2, edx
-
-        mov  esi, [ebp + %$VFtab]
-        movd eax, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-       
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7     
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp       
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm0, [esp + .qqO]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm0 ; vcoul=qq*VV
-        mulps  xmm0, xmm7 ; fijC=FF*qq
-        ; at this point mm5 contains vcoul and xmm0 fijC.
-        ; increment vcoul - then we can get rid of mm5.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       
-        ; dispersion
-        movlps xmm5, [esi + eax*4 + 16]        ;  half table 
-        movaps xmm4, xmm5
-        shufps xmm4, xmm4, 11111100b
-        shufps xmm5, xmm5, 11111101b 
-        
-        movlps xmm7, [esi + eax*4 + 24] ; other half of dispersion table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm6, 11111100b
-        shufps xmm7, xmm7, 11111101b 
-        ; dispersion table ready, in xmm4-xmm7  
-        mulss  xmm6, xmm1       ; xmm6=Geps
-        mulss  xmm7, xmm2       ; xmm7=Heps2
-        addss  xmm5, xmm6
-        addss  xmm5, xmm7       ; xmm5=Fp       
-        mulss  xmm7, [esp + .two]       ; two*Heps2
-        addss  xmm7, xmm6
-        addss  xmm7, xmm5 ; xmm7=FF
-        mulss  xmm5, xmm1 ; xmm5=eps*Fp
-        addss  xmm5, xmm4 ; xmm5=VV
-
-        movaps xmm4, [esp + .c6]
-        mulps  xmm7, xmm4        ; fijD
-        mulps  xmm5, xmm4        ; vnb6
-        addps  xmm0, xmm7 ; add to fscal
-
-        ; Update vnbtot directly.
-        addps  xmm5, [esp + .vnbtot]
-        movaps [esp + .vnbtot], xmm5
-
-        ; repulsion
-        movlps xmm5, [esi + eax*4 + 32] ; got half repulsion table
-        movaps xmm4, xmm5
-        shufps xmm4, xmm4, 10001000b
-        shufps xmm5, xmm5, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 40] ; other half of repulsion table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm6, 10001000b
-        shufps xmm7, xmm7, 11011101b 
-        ; repulsion table ready, in xmm4-xmm7          
-        mulss  xmm6, xmm1       ; xmm6=Geps
-        mulss  xmm7, xmm2       ; xmm7=Heps2
-        addss  xmm5, xmm6
-        addss  xmm5, xmm7       ; xmm5=Fp       
-        mulss  xmm7, [esp + .two]       ; two*Heps2
-        addss  xmm7, xmm6
-        addss  xmm7, xmm5 ; xmm7=FF
-        mulss  xmm5, xmm1 ; xmm5=eps*Fp
-        addss  xmm5, xmm4 ; xmm5=VV
-
-        movaps xmm4, [esp + .c12]
-        mulps  xmm7, xmm4        ; fijD
-        mulps  xmm5, xmm4        ; vnb12
-        addps  xmm7, xmm0 ; add to fscal
-        addps  xmm5, [esp + .vnbtot] ;  total nonbonded potential in xmm5.
-
-       xorps  xmm4, xmm4
-        movd eax, mm0   
-        movd ecx, mm1
-        movd edx, mm2  
-               
-       mulps  xmm7, [esp + .rinvO] ;  total fscal now in xmm7
-        movaps [esp + .vnbtot], xmm5
-       mulps  xmm7, [esp + .tabscale]
-       subps xmm4, xmm7
-
-       movaps xmm0, [esp + .dxO]
-       movaps xmm1, [esp + .dyO]
-       movaps xmm2, [esp + .dzO]
-
-       mulps  xmm0, xmm4
-       mulps  xmm1, xmm4
-       mulps  xmm2, xmm4 ; xmm0-xmm2 now contains tx-tz (partial force)
-       movss  xmm3, [esp + .fixO]      
-       movss  xmm4, [esp + .fiyO]      
-       movss  xmm5, [esp + .fizO]      
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [esp + .fixO], xmm3      
-       movss  [esp + .fiyO], xmm4      
-       movss  [esp + .fizO], xmm5      ; updated the O force. now do the H's
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       shufps xmm3, xmm3, 11100110b    ; shift right
-       shufps xmm4, xmm4, 11100110b
-       shufps xmm5, xmm5, 11100110b
-       addss  xmm3, [esp + .fixH1]
-       addss  xmm4, [esp + .fiyH1]
-       addss  xmm5, [esp + .fizH1]
-       movss  [esp + .fixH1], xmm3     
-       movss  [esp + .fiyH1], xmm4     
-       movss  [esp + .fizH1], xmm5     ; updated the H1 force. 
-
-       mov edi, [ebp + %$faction]
-       shufps xmm3, xmm3, 11100111b    ; shift right
-       shufps xmm4, xmm4, 11100111b
-       shufps xmm5, xmm5, 11100111b
-       addss  xmm3, [esp + .fixH2]
-       addss  xmm4, [esp + .fiyH2]
-       addss  xmm5, [esp + .fizH2]
-       movss  [esp + .fixH2], xmm3     
-       movss  [esp + .fiyH2], xmm4     
-       movss  [esp + .fizH2], xmm5     ; updated the H2 force. 
-
-       ; the fj's - start by accumulating the tx/ty/tz force in xmm0, xmm1.
-       xorps  xmm5, xmm5
-       movaps xmm3, xmm0
-       movlps xmm6, [edi + eax*4]
-       movss  xmm7, [edi + eax*4 + 8]
-       unpcklps xmm3, xmm1
-       movlhps  xmm3, xmm5     
-       unpckhps xmm0, xmm1             
-       addps    xmm0, xmm3
-       movhlps  xmm3, xmm0     
-       addps    xmm0, xmm3     ; x,y sum in xmm0
-
-       movhlps  xmm1, xmm2
-       addss    xmm2, xmm1
-       shufps   xmm1, xmm1, 1b 
-       addss    xmm2, xmm1    ; z sum in xmm2
-       subps    xmm6, xmm0
-       subss    xmm7, xmm2
-       
-       movlps [edi + eax*4],     xmm6
-       movss  [edi + eax*4 + 8], xmm7
-
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .odd_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO]
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums are now in pos 0-1 of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; partial sums are now in pos 0-1 of xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       mov   edx, [ebp + %$gid]  
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4    
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-        
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 792
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-       
-
-       
-proc inl3330_sse
-%$nri          arg
-%$iinr         arg
-%$jindex       arg
-%$jjnr         arg
-%$shift                arg
-%$shiftvec     arg
-%$fshift       arg
-%$gid          arg
-%$pos          arg             
-%$faction      arg
-%$charge       arg
-%$facel                arg
-%$Vc           arg                     
-%$type         arg
-%$ntype        arg
-%$nbfp         arg     
-%$Vnb          arg
-%$tabscale     arg     
-%$VFtab                arg
-       ;; stack offsets for local variables
-       ;; bottom of stack is 16-byte aligned for sse use (movaps requires it)
-.ixO        equ     0
-.iyO        equ    16
-.izO         equ    32
-.ixH1       equ    48
-.iyH1       equ    64
-.izH1        equ    80
-.ixH2       equ    96
-.iyH2       equ   112
-.izH2        equ   128
-.jxO        equ   144
-.jyO        equ   160
-.jzO         equ   176
-.jxH1       equ   192
-.jyH1       equ   208
-.jzH1        equ   224
-.jxH2       equ   240
-.jyH2       equ   256
-.jzH2        equ   272
-.dxOO        equ   288
-.dyOO        equ   304
-.dzOO        equ   320 
-.dxOH1       equ   336
-.dyOH1       equ   352
-.dzOH1       equ   368 
-.dxOH2       equ   384
-.dyOH2       equ   400
-.dzOH2       equ   416 
-.dxH1O       equ   432
-.dyH1O       equ   448
-.dzH1O       equ   464 
-.dxH1H1      equ   480
-.dyH1H1      equ   496
-.dzH1H1      equ   512 
-.dxH1H2      equ   528
-.dyH1H2      equ   544
-.dzH1H2      equ   560 
-.dxH2O       equ   576
-.dyH2O       equ   592
-.dzH2O       equ   608 
-.dxH2H1      equ   624
-.dyH2H1      equ   640
-.dzH2H1      equ   656 
-.dxH2H2      equ   672
-.dyH2H2      equ   688
-.dzH2H2      equ   704
-.qqOO        equ   720
-.qqOH        equ   736
-.qqHH        equ   752
-.two         equ   768
-.tabscale    equ   784
-.c6          equ   800
-.c12        equ   816           
-.vctot       equ   832
-.vnbtot      equ   848
-.fixO        equ   864
-.fiyO        equ   880
-.fizO        equ   896
-.fixH1       equ   912
-.fiyH1       equ   928
-.fizH1       equ   944
-.fixH2       equ   960
-.fiyH2       equ   976
-.fizH2       equ   992
-.fjxO       equ  1008
-.fjyO        equ  1024
-.fjzO        equ  1040
-.fjxH1      equ  1056
-.fjyH1       equ  1072
-.fjzH1       equ  1088
-.fjxH2      equ  1104
-.fjyH2       equ  1120
-.fjzH2       equ  1136
-.half        equ  1152
-.three       equ  1168
-.rsqOO       equ  1184
-.rsqOH1      equ  1200
-.rsqOH2      equ  1216
-.rsqH1O      equ  1232
-.rsqH1H1     equ  1248
-.rsqH1H2     equ  1264
-.rsqH2O      equ  1280
-.rsqH2H1     equ  1296
-.rsqH2H2     equ  1312
-.rinvOO      equ  1328
-.rinvOH1     equ  1344
-.rinvOH2     equ  1360
-.rinvH1O     equ  1376
-.rinvH1H1    equ  1392
-.rinvH1H2    equ  1408
-.rinvH2O     equ  1424
-.rinvH2H1    equ  1440
-.rinvH2H2    equ  1456
-.fstmp      equ  1472  
-.is3         equ  1488
-.ii3         equ  1492
-.innerjjnr   equ  1496
-.innerk      equ  1500
-.salign             equ  1504                                                  
-        push eax
-        push ebx
-        push ecx
-        push edx
-       push esi
-       push edi
-       sub esp, 1508           ; local stack space
-       mov  eax, esp
-       and  eax, 0xf
-       sub esp, eax
-       mov [esp + .salign], eax
-
-       emms
-
-       movups xmm0, [sse_half]
-       movups xmm1, [sse_two]
-       movups xmm2, [sse_three]
-       movss xmm3, [ebp +%$tabscale]
-       movaps [esp + .half],  xmm0
-       movaps [esp + .two],  xmm1
-       movaps [esp + .three], xmm2
-       shufps xmm3, xmm3, 0b
-       movaps [esp + .tabscale],  xmm3
-
-       ;; assume we have at least one i particle - start directly
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       mov   edx, [ebp + %$charge]
-       movss xmm3, [edx + ebx*4]       
-       movss xmm4, xmm3        
-       movss xmm5, [edx + ebx*4 + 4]   
-       movss xmm6, [ebp + %$facel]
-       mulss  xmm3, xmm3
-       mulss  xmm4, xmm5
-       mulss  xmm5, xmm5
-       mulss  xmm3, xmm6
-       mulss  xmm4, xmm6
-       mulss  xmm5, xmm6
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .qqOO], xmm3
-       movaps [esp + .qqOH], xmm4
-       movaps [esp + .qqHH], xmm5
-               
-       xorps xmm0, xmm0
-       mov   edx, [ebp + %$type]
-       mov   ecx, [edx + ebx*4]
-       shl   ecx, 1
-       mov   edx, ecx
-       imul  ecx, [ebp + %$ntype]      ; ecx = ntia = 2*ntype*type[ii0]
-       add   edx, ecx
-       mov   eax, [ebp + %$nbfp]
-       movlps xmm0, [eax + edx*4] 
-       movaps xmm1, xmm0
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 01010101b
-       movaps [esp + .c6], xmm0
-       movaps [esp + .c12], xmm1
-
-.outer:
-       mov   eax, [ebp + %$shift]      ;  eax = pointer into shift[]
-       mov   ebx, [eax]                ;  ebx=shift[n]
-       add   [ebp + %$shift], dword 4  ;  advance pointer one step
-       
-       lea   ebx, [ebx + ebx*2]        ;  ebx=3*is
-       mov   [esp + .is3],ebx          ;  store is3
-
-       mov   eax, [ebp + %$shiftvec]   ;  eax = base of shiftvec[] 
-
-       movss xmm0, [eax + ebx*4]
-       movss xmm1, [eax + ebx*4 + 4]
-       movss xmm2, [eax + ebx*4 + 8] 
-
-       mov   ecx, [ebp + %$iinr]       ;  ecx = pointer into iinr[]    
-       add   [ebp + %$iinr], dword 4   ;  advance pointer
-       mov   ebx, [ecx]                ;  ebx =ii
-
-       lea   ebx, [ebx + ebx*2]        ;  ebx = 3*ii=ii3
-       mov   eax, [ebp + %$pos]        ;  eax = base of pos[]
-       mov   [esp + .ii3], ebx 
-       
-       movaps xmm3, xmm0
-       movaps xmm4, xmm1
-       movaps xmm5, xmm2
-       addss xmm3, [eax + ebx*4]
-       addss xmm4, [eax + ebx*4 + 4]
-       addss xmm5, [eax + ebx*4 + 8]           
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixO], xmm3
-       movaps [esp + .iyO], xmm4
-       movaps [esp + .izO], xmm5
-
-       movss xmm3, xmm0
-       movss xmm4, xmm1
-       movss xmm5, xmm2
-       addss xmm0, [eax + ebx*4 + 12]
-       addss xmm1, [eax + ebx*4 + 16]
-       addss xmm2, [eax + ebx*4 + 20]          
-       addss xmm3, [eax + ebx*4 + 24]
-       addss xmm4, [eax + ebx*4 + 28]
-       addss xmm5, [eax + ebx*4 + 32]          
-
-       shufps xmm0, xmm0, 0b
-       shufps xmm1, xmm1, 0b
-       shufps xmm2, xmm2, 0b
-       shufps xmm3, xmm3, 0b
-       shufps xmm4, xmm4, 0b
-       shufps xmm5, xmm5, 0b
-       movaps [esp + .ixH1], xmm0
-       movaps [esp + .iyH1], xmm1
-       movaps [esp + .izH1], xmm2
-       movaps [esp + .ixH2], xmm3
-       movaps [esp + .iyH2], xmm4
-       movaps [esp + .izH2], xmm5
-
-       ; clear vctot, vnbtot and i forces
-       xorps xmm4, xmm4
-       movaps [esp + .vctot], xmm4
-       movaps [esp + .vnbtot], xmm4
-       movaps [esp + .fixO], xmm4
-       movaps [esp + .fiyO], xmm4
-       movaps [esp + .fizO], xmm4
-       movaps [esp + .fixH1], xmm4
-       movaps [esp + .fiyH1], xmm4
-       movaps [esp + .fizH1], xmm4
-       movaps [esp + .fixH2], xmm4
-       movaps [esp + .fiyH2], xmm4
-       movaps [esp + .fizH2], xmm4
-       
-       mov   eax, [ebp + %$jindex]
-       mov   ecx, [eax]                 ;  jindex[n]
-       mov   edx, [eax + 4]             ;  jindex[n+1]
-       add   [ebp + %$jindex], dword 4
-       sub   edx, ecx                   ;  number of innerloop atoms
-
-       mov   esi, [ebp + %$pos]
-       mov   edi, [ebp + %$faction]    
-       mov   eax, [ebp + %$jjnr]
-       shl   ecx, 2
-       add   eax, ecx
-       mov   [esp + .innerjjnr], eax     ;  pointer to jjnr[nj0]
-       sub   edx, dword 4
-       mov   [esp + .innerk], edx        ;  number of innerloop atoms
-       jge   .unroll_loop
-       jmp   .single_check
-.unroll_loop:  
-       ;; quad-unroll innerloop here.
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-
-       mov   eax, [edx]        
-       mov   ebx, [edx + 4] 
-       mov   ecx, [edx + 8]
-       mov   edx, [edx + 12]             ; eax-edx=jnr1-4 
-       
-       add   [esp + .innerjjnr], dword 16 ; advance pointer (unrolled 4) 
-
-       mov esi, [ebp + %$pos]        ; base of pos[]
-
-       lea   eax, [eax + eax*2]         ;  replace jnr with j3
-       lea   ebx, [ebx + ebx*2]        
-       lea   ecx, [ecx + ecx*2]         ;  replace jnr with j3
-       lea   edx, [edx + edx*2]        
-       
-       ; move j coordinates to local temp. variables
-       movlps xmm2, [esi + eax*4]
-       movlps xmm3, [esi + eax*4 + 12]
-       movlps xmm4, [esi + eax*4 + 24]
-
-       movlps xmm5, [esi + ebx*4]
-       movlps xmm6, [esi + ebx*4 + 12]
-       movlps xmm7, [esi + ebx*4 + 24]
-
-       movhps xmm2, [esi + ecx*4]
-       movhps xmm3, [esi + ecx*4 + 12]
-       movhps xmm4, [esi + ecx*4 + 24]
-
-       movhps xmm5, [esi + edx*4]
-       movhps xmm6, [esi + edx*4 + 12]
-       movhps xmm7, [esi + edx*4 + 24]
-
-       ;; current state:       
-       ;;  xmm2= jxOa  jyOa  jxOc  jyOc
-       ;;  xmm3= jxH1a jyH1a jxH1c jyH1c
-       ;;  xmm4= jxH2a jyH2a jxH2c jyH2c
-       ;;  xmm5= jxOb  jyOb  jxOd  jyOd
-       ;;  xmm6= jxH1b jyH1b jxH1d jyH1d
-       ;;  xmm7= jxH2b jyH2b jxH2d jyH2d
-       
-       movaps xmm0, xmm2
-       movaps xmm1, xmm3
-       unpcklps xmm0, xmm5     ; xmm0= jxOa  jxOb  jyOa  jyOb
-       unpcklps xmm1, xmm6     ; xmm1= jxH1a jxH1b jyH1a jyH1b
-       unpckhps xmm2, xmm5     ; xmm2= jxOc  jxOd  jyOc  jyOd
-       unpckhps xmm3, xmm6     ; xmm3= jxH1c jxH1d jyH1c jyH1d 
-       movaps xmm5, xmm4
-       movaps   xmm6, xmm0
-       unpcklps xmm4, xmm7     ; xmm4= jxH2a jxH2b jyH2a jyH2b         
-       unpckhps xmm5, xmm7     ; xmm5= jxH2c jxH2d jyH2c jyH2d
-       movaps   xmm7, xmm1
-       movlhps  xmm0, xmm2     ; xmm0= jxOa  jxOb  jxOc  jxOd 
-       movaps [esp + .jxO], xmm0
-       movhlps  xmm2, xmm6     ; xmm2= jyOa  jyOb  jyOc  jyOd
-       movaps [esp + .jyO], xmm2
-       movlhps  xmm1, xmm3
-       movaps [esp + .jxH1], xmm1
-       movhlps  xmm3, xmm7
-       movaps   xmm6, xmm4
-       movaps [esp + .jyH1], xmm3
-       movlhps  xmm4, xmm5
-       movaps [esp + .jxH2], xmm4
-       movhlps  xmm5, xmm6
-       movaps [esp + .jyH2], xmm5
-
-       movss  xmm0, [esi + eax*4 + 8]
-       movss  xmm1, [esi + eax*4 + 20]
-       movss  xmm2, [esi + eax*4 + 32]
-
-       movss  xmm3, [esi + ecx*4 + 8]
-       movss  xmm4, [esi + ecx*4 + 20]
-       movss  xmm5, [esi + ecx*4 + 32]
-
-       movhps xmm0, [esi + ebx*4 + 4]
-       movhps xmm1, [esi + ebx*4 + 16]
-       movhps xmm2, [esi + ebx*4 + 28]
-       
-       movhps xmm3, [esi + edx*4 + 4]
-       movhps xmm4, [esi + edx*4 + 16]
-       movhps xmm5, [esi + edx*4 + 28]
-       
-       shufps xmm0, xmm3, 11001100b
-       shufps xmm1, xmm4, 11001100b
-       shufps xmm2, xmm5, 11001100b
-       movaps [esp + .jzO],  xmm0
-       movaps [esp + .jzH1],  xmm1
-       movaps [esp + .jzH2],  xmm2
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixO]
-       movaps xmm4, [esp + .iyO]
-       movaps xmm5, [esp + .izO]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxOH1], xmm3
-       movaps [esp + .dyOH1], xmm4
-       movaps [esp + .dzOH1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOO], xmm0
-       movaps [esp + .rsqOH1], xmm3
-
-       movaps xmm0, [esp + .ixO]
-       movaps xmm1, [esp + .iyO]
-       movaps xmm2, [esp + .izO]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       subps  xmm3, [esp + .jxO]
-       subps  xmm4, [esp + .jyO]
-       subps  xmm5, [esp + .jzO]
-       movaps [esp + .dxOH2], xmm0
-       movaps [esp + .dyOH2], xmm1
-       movaps [esp + .dzOH2], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1O], xmm3
-       movaps [esp + .dyH1O], xmm4
-       movaps [esp + .dzH1O], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqOH2], xmm0
-       movaps [esp + .rsqH1O], xmm3
-
-       movaps xmm0, [esp + .ixH1]
-       movaps xmm1, [esp + .iyH1]
-       movaps xmm2, [esp + .izH1]
-       movaps xmm3, [esp + .ixH1]
-       movaps xmm4, [esp + .iyH1]
-       movaps xmm5, [esp + .izH1]
-       subps  xmm0, [esp + .jxH1]
-       subps  xmm1, [esp + .jyH1]
-       subps  xmm2, [esp + .jzH1]
-       subps  xmm3, [esp + .jxH2]
-       subps  xmm4, [esp + .jyH2]
-       subps  xmm5, [esp + .jzH2]
-       movaps [esp + .dxH1H1], xmm0
-       movaps [esp + .dyH1H1], xmm1
-       movaps [esp + .dzH1H1], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH1H2], xmm3
-       movaps [esp + .dyH1H2], xmm4
-       movaps [esp + .dzH1H2], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm3, xmm4
-       addps  xmm3, xmm5
-       movaps [esp + .rsqH1H1], xmm0
-       movaps [esp + .rsqH1H2], xmm3
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       movaps xmm3, [esp + .ixH2]
-       movaps xmm4, [esp + .iyH2]
-       movaps xmm5, [esp + .izH2]
-       subps  xmm0, [esp + .jxO]
-       subps  xmm1, [esp + .jyO]
-       subps  xmm2, [esp + .jzO]
-       subps  xmm3, [esp + .jxH1]
-       subps  xmm4, [esp + .jyH1]
-       subps  xmm5, [esp + .jzH1]
-       movaps [esp + .dxH2O], xmm0
-       movaps [esp + .dyH2O], xmm1
-       movaps [esp + .dzH2O], xmm2
-       mulps  xmm0, xmm0
-       mulps  xmm1, xmm1
-       mulps  xmm2, xmm2
-       movaps [esp + .dxH2H1], xmm3
-       movaps [esp + .dyH2H1], xmm4
-       movaps [esp + .dzH2H1], xmm5
-       mulps  xmm3, xmm3
-       mulps  xmm4, xmm4
-       mulps  xmm5, xmm5
-       addps  xmm0, xmm1
-       addps  xmm0, xmm2
-       addps  xmm4, xmm3
-       addps  xmm4, xmm5
-       movaps [esp + .rsqH2O], xmm0
-       movaps [esp + .rsqH2H1], xmm4
-
-       movaps xmm0, [esp + .ixH2]
-       movaps xmm1, [esp + .iyH2]
-       movaps xmm2, [esp + .izH2]
-       subps  xmm0, [esp + .jxH2]
-       subps  xmm1, [esp + .jyH2]
-       subps  xmm2, [esp + .jzH2]
-       movaps [esp + .dxH2H2], xmm0
-       movaps [esp + .dyH2H2], xmm1
-       movaps [esp + .dzH2H2], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2
-       movaps [esp + .rsqH2H2], xmm0
-               
-       ; start doing invsqrt. use rsq values in xmm0, xmm4
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinvH2H2
-       mulps   xmm7, [esp + .half] ; rinvH2H1
-       movaps  [esp + .rinvH2H2], xmm3
-       movaps  [esp + .rinvH2H1], xmm7
-               
-       rsqrtps xmm1, [esp + .rsqOO]
-       rsqrtps xmm5, [esp + .rsqOH1]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOO]
-       mulps   xmm5, [esp + .rsqOH1]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOO], xmm3
-       movaps  [esp + .rinvOH1], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqOH2]
-       rsqrtps xmm5, [esp + .rsqH1O]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqOH2]
-       mulps   xmm5, [esp + .rsqH1O]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvOH2], xmm3
-       movaps  [esp + .rinvH1O], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH1H1]
-       rsqrtps xmm5, [esp + .rsqH1H2]
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, [esp + .rsqH1H1]
-       mulps   xmm5, [esp + .rsqH1H2]
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] 
-       mulps   xmm7, [esp + .half]
-       movaps  [esp + .rinvH1H1], xmm3
-       movaps  [esp + .rinvH1H2], xmm7
-       
-       rsqrtps xmm1, [esp + .rsqH2O]
-       movaps  xmm2, xmm1
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, [esp + .rsqH2O]
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2
-       mulps   xmm3, [esp + .half] 
-       movaps  [esp + .rinvH2O], xmm3
-
-       ;; start with OO interaction.
-       movaps xmm0, [esp + .rinvOO]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqOO] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-               
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-       
-        movd mm0, eax
-        movd mm1, ebx
-        movd mm2, ecx
-        movd mm3, edx
-
-        mov  esi, [ebp + %$VFtab]
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOO]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-        ; increment vcoul - then we can get rid of mm5.
-        ;; update vctot
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5 
-
-        ; put scalar force on stack temporarily...
-        movaps [esp + .fstmp], xmm3
-
-        ; dispersion
-        movlps xmm5, [esi + eax*4 + 16]
-        movlps xmm7, [esi + ecx*4 + 16]
-        movhps xmm5, [esi + ebx*4 + 16]
-        movhps xmm7, [esi + edx*4 + 16] ; got half dispersion table
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 24]
-        movlps xmm3, [esi + ecx*4 + 24]
-        movhps xmm7, [esi + ebx*4 + 24]
-        movhps xmm3, [esi + edx*4 + 24] ; other half of dispersion table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; dispersion table ready, in xmm4-xmm7 
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-
-        movaps xmm4, [esp + .c6]
-        mulps  xmm7, xmm4        ; fijD
-        mulps  xmm5, xmm4        ; vnb6
-        addps  xmm7, [esp + .fstmp] ; add to fscal
-
-        ; put scalar force on stack. Update vnbtot directly.
-        addps  xmm5, [esp + .vnbtot]
-        movaps [esp + .fstmp], xmm7
-        movaps [esp + .vnbtot], xmm5
-
-        ; repulsion
-        movlps xmm5, [esi + eax*4 + 32]
-        movlps xmm7, [esi + ecx*4 + 32]
-        movhps xmm5, [esi + ebx*4 + 32]
-        movhps xmm7, [esi + edx*4 + 32] ; got half repulsion table
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 40]
-        movlps xmm3, [esi + ecx*4 + 40]
-        movhps xmm7, [esi + ebx*4 + 40]
-        movhps xmm3, [esi + edx*4 + 40] ; other half of repulsion table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; table ready, in xmm4-xmm7 
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
- 
-        movaps xmm4, [esp + .c12]
-        mulps  xmm7, xmm4 ; fijR
-        mulps  xmm5, xmm4 ; vnb12
-        addps  xmm7, [esp + .fstmp] 
-
-        addps  xmm5, [esp + .vnbtot]
-        movaps [esp + .vnbtot], xmm5
-        xorps  xmm1, xmm1
-
-        mulps xmm7, [esp + .tabscale]
-        mulps xmm7, xmm0
-        subps  xmm1, xmm7
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1               
-
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOO]
-       mulps xmm1, [esp + .dyOO]
-       mulps xmm2, [esp + .dzOO]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H1 interaction
-       movaps xmm0, [esp + .rinvOH1]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqOH1] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH1]
-       mulps xmm1, [esp + .dyOH1]
-       mulps xmm2, [esp + .dzOH1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; O-H2 interaction
-       movaps xmm0, [esp + .rinvOH2]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqOH2] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       xorps xmm3, xmm3
-       movaps xmm4, xmm3
-       movaps xmm5, xmm3
-       mulps xmm0, [esp + .dxOH2]
-       mulps xmm1, [esp + .dyOH2]
-       mulps xmm2, [esp + .dzOH2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixO]
-       addps xmm1, [esp + .fiyO]
-       addps xmm2, [esp + .fizO]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixO], xmm0
-       movaps [esp + .fiyO], xmm1
-       movaps [esp + .fizO], xmm2
-
-       ; H1-O interaction
-       movaps xmm0, [esp + .rinvH1O]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH1O] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH1O]
-       mulps xmm1, [esp + .dyH1O]
-       mulps xmm2, [esp + .dzH1O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H1 interaction
-       movaps xmm0, [esp + .rinvH1H1]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH1H1] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH1H1]
-       mulps xmm1, [esp + .dyH1H1]
-       mulps xmm2, [esp + .dzH1H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H1-H2 interaction
-       movaps xmm0, [esp + .rinvH1H2]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH1H2] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH1H2]
-       mulps xmm1, [esp + .dyH1H2]
-       mulps xmm2, [esp + .dzH1H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH1]
-       addps xmm1, [esp + .fiyH1]
-       addps xmm2, [esp + .fizH1]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH1], xmm0
-       movaps [esp + .fiyH1], xmm1
-       movaps [esp + .fizH1], xmm2
-
-       ; H2-O interaction
-       movaps xmm0, [esp + .rinvH2O]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH2O] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqOH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-
-       movaps xmm3, [esp + .fjxO]
-       movaps xmm4, [esp + .fjyO]
-       movaps xmm5, [esp + .fjzO]
-       mulps xmm0, [esp + .dxH2O]
-       mulps xmm1, [esp + .dyH2O]
-       mulps xmm2, [esp + .dzH2O]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxO], xmm3
-       movaps [esp + .fjyO], xmm4
-       movaps [esp + .fjzO], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H1 interaction
-       movaps xmm0, [esp + .rinvH2H1]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH2H1] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH1]
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       mulps xmm0, [esp + .dxH2H1]
-       mulps xmm1, [esp + .dyH2H1]
-       mulps xmm2, [esp + .dzH2H1]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH1], xmm3
-       movaps [esp + .fjyH1], xmm4
-       movaps [esp + .fjzH1], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       ; H2-H2 interaction
-       movaps xmm0, [esp + .rinvH2H2]
-       movaps xmm1, xmm0
-       mulps  xmm1, [esp + .rsqH2H2] ; xmm1=r
-       mulps  xmm1, [esp + .tabscale]  
-       movhlps xmm2, xmm1
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ;xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-
-        movd eax, mm6
-        psrlq mm6, 32
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd ebx, mm6
-        movd edx, mm7
-
-        lea   eax, [eax + eax*2]
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-
-        movlps xmm5, [esi + eax*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm5, [esi + ebx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + eax*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm7, [esi + ebx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-        movaps xmm3, [esp + .qqHH]
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-       xorps  xmm1, xmm1
-       mulps  xmm3,  [esp + .tabscale]
-       mulps  xmm3, xmm0
-       subps  xmm1, xmm3
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1
-       
-       movaps xmm3, [esp + .fjxH2]
-       movaps xmm4, [esp + .fjyH2]
-       movaps xmm5, [esp + .fjzH2]
-       mulps xmm0, [esp + .dxH2H2]
-       mulps xmm1, [esp + .dyH2H2]
-       mulps xmm2, [esp + .dzH2H2]
-       subps xmm3, xmm0
-       subps xmm4, xmm1
-       subps xmm5, xmm2
-       addps xmm0, [esp + .fixH2]
-       addps xmm1, [esp + .fiyH2]
-       addps xmm2, [esp + .fizH2]
-       movaps [esp + .fjxH2], xmm3
-       movaps [esp + .fjyH2], xmm4
-       movaps [esp + .fjzH2], xmm5
-       movaps [esp + .fixH2], xmm0
-       movaps [esp + .fiyH2], xmm1
-       movaps [esp + .fizH2], xmm2
-
-       mov edi, [ebp +%$faction]
-
-       movd eax, mm0
-       movd ebx, mm1
-       movd ecx, mm2
-       movd edx, mm3
-       
-       ; Did all interactions - now update j forces.
-       ; 4 j waters with three atoms each - first do a & b j particles
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd 
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpcklps xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjxOb  fjyOb
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOb  fjyOb
-       unpcklps xmm1, xmm2        ; xmm1= fjzOa  fjxH1a fjzOb  fjxH1b
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpcklps xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjyH1b fjzH1b
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1b fjzH1b
-       unpcklps xmm5, xmm6        ; xmm5= fjxH2a fjyH2a fjxH2b fjyH2b
-       movlhps  xmm0, xmm1        ; xmm0= fjxOa  fjyOa  fjzOa  fjxH1a  
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOb  fjyOb  fjzOb  fjxH1b
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1a fjzH1a fjxH2a fjyH2a
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1b fjzH1b fjxH2b fjyH2b
-       movups   xmm1, [edi + eax*4]
-       movups   xmm2, [edi + eax*4 + 16]
-       movups   xmm5, [edi + ebx*4]
-       movups   xmm6, [edi + ebx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + eax*4 + 32]
-       movss    xmm3, [edi + ebx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm7, xmm7, 1b
-       
-       movups   [edi + eax*4],     xmm1
-       movups   [edi + eax*4 + 16],xmm2
-       movups   [edi + ebx*4],     xmm5
-       movups   [edi + ebx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + eax*4 + 32], xmm0
-       movss    [edi + ebx*4 + 32], xmm3       
-
-       ;; then do the second pair (c & d)
-       movaps xmm0, [esp + .fjxO] ; xmm0= fjxOa  fjxOb  fjxOc  fjxOd
-       movaps xmm1, [esp + .fjyO] ; xmm1= fjyOa  fjyOb  fjyOc  fjyOd 
-       unpckhps xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjxOd  fjyOd
-       movaps xmm1, [esp + .fjzO]
-       movaps xmm2, [esp + .fjxH1]
-       movhlps  xmm3, xmm0        ; xmm3= fjxOd  fjyOd
-       unpckhps xmm1, xmm2        ; xmm1= fjzOc  fjxH1c fjzOd  fjxH1d
-       movaps xmm4, [esp + .fjyH1]
-       movaps xmm5, [esp + .fjzH1]
-       unpckhps xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjyH1d fjzH1d  
-       movaps xmm5, [esp + .fjxH2]
-       movaps xmm6, [esp + .fjyH2]
-       movhlps  xmm7, xmm4        ; xmm7= fjyH1d fjzH1d         
-       unpckhps xmm5, xmm6        ; xmm5= fjxH2c fjyH2c fjxH2d fjyH2d
-       movlhps  xmm0, xmm1        ; xmm0= fjxOc  fjyOc  fjzOc  fjxH1c 
-       shufps   xmm3, xmm1, 11100100b
-                                   ; xmm3= fjxOd  fjyOd  fjzOd  fjxH1d
-       movlhps  xmm4, xmm5        ; xmm4= fjyH1c fjzH1c fjxH2c fjyH2c 
-       shufps   xmm7, xmm5, 11100100b
-                                   ; xmm7= fjyH1d fjzH1d fjxH2d fjyH2d
-       movups   xmm1, [edi + ecx*4]
-       movups   xmm2, [edi + ecx*4 + 16]
-       movups   xmm5, [edi + edx*4]
-       movups   xmm6, [edi + edx*4 + 16]
-       addps    xmm1, xmm0
-       addps    xmm2, xmm4
-       addps    xmm5, xmm3
-       addps    xmm6, xmm7
-       movss    xmm0, [edi + ecx*4 + 32]
-       movss    xmm3, [edi + edx*4 + 32]
-       
-       movaps   xmm4, [esp + .fjzH2]
-       movaps   xmm7, xmm4
-       shufps   xmm4, xmm4, 10b
-       shufps   xmm7, xmm7, 11b
-       movups   [edi + ecx*4],     xmm1
-       movups   [edi + ecx*4 + 16],xmm2
-       movups   [edi + edx*4],     xmm5
-       movups   [edi + edx*4 + 16],xmm6        
-       addss    xmm0, xmm4
-       addss    xmm3, xmm7
-       movss    [edi + ecx*4 + 32], xmm0
-       movss    [edi + edx*4 + 32], xmm3       
-       
-       ;; should we do one more iteration?
-       sub   [esp + .innerk], dword 4   ; innerk -= 4 (one unrolled pass handles the 4 j waters a-d)
-       jl    .single_check              ; signed test: innerk was below 4 before the subtraction
-       jmp   .unroll_loop               ; otherwise go back to .unroll_loop
-.single_check:
-       add   [esp + .innerk], dword 4   ; undo the subtraction above; ZF is set if innerk is now 0
-       jnz   .single_loop               ; innerk != 0: continue at .single_loop
-       jmp   .updateouterdata           ; innerk == 0: continue at .updateouterdata
-.single_loop:
-       mov   edx, [esp + .innerjjnr]     ; pointer to jjnr[k] 
-       mov   eax, [edx]        
-       add   [esp + .innerjjnr], dword 4       
-
-       mov esi, [ebp + %$pos]
-       lea   eax, [eax + eax*2]  
-
-       ; fetch j coordinates
-       xorps xmm3, xmm3
-       xorps xmm4, xmm4
-       xorps xmm5, xmm5
-       movss xmm3, [esi + eax*4]
-       movss xmm4, [esi + eax*4 + 4]
-       movss xmm5, [esi + eax*4 + 8]
-
-       movlps xmm6, [esi + eax*4 + 12]
-       movhps xmm6, [esi + eax*4 + 24] ; xmm6=jxH1 jyH1 jxH2 jyH2
-       ;;  fetch both z coords in one go, to positions 0 and 3 in xmm7
-       movups xmm7, [esi + eax*4 + 20] ; xmm7=jzH1 jxH2 jyH2 jzH2
-       shufps xmm6, xmm6, 11011000b    ;  xmm6=jxH1 jxH2 jyH1 jyH2
-       movlhps xmm3, xmm6              ; xmm3= jxO   0  jxH1 jxH2 
-       movaps  xmm0, [esp + .ixO]     
-       movaps  xmm1, [esp + .iyO]
-       movaps  xmm2, [esp + .izO]      
-       shufps  xmm4, xmm6, 11100100b ;  xmm4= jyO   0   jyH1 jyH2
-       shufps xmm5, xmm7, 11000100b  ;  xmm5= jzO   0   jzH1 jzH2  
-       ;;  store all j coordinates in jO
-       movaps [esp + .jxO], xmm3
-       movaps [esp + .jyO], xmm4
-       movaps [esp + .jzO], xmm5
-       subps  xmm0, xmm3
-       subps  xmm1, xmm4
-       subps  xmm2, xmm5
-       movaps [esp + .dxOO], xmm0
-       movaps [esp + .dyOO], xmm1
-       movaps [esp + .dzOO], xmm2
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       addps xmm0, xmm1
-       addps xmm0, xmm2        ;  have rsq in xmm0.
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       movaps  xmm2, xmm1      
-       mulps   xmm1, xmm1
-       movaps  xmm3, [esp + .three]
-       mulps   xmm1, xmm0
-       subps   xmm3, xmm1
-       mulps   xmm3, xmm2                                                      
-       mulps   xmm3, [esp + .half] ; rinv iO - j water
-
-       movaps  xmm1, xmm3
-       mulps   xmm1, xmm0      ; xmm1=r
-       movaps  xmm0, xmm3      ; xmm0=rinv
-       mulps  xmm1, [esp + .tabscale]
-       
-       movhlps xmm2, xmm1      
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-        movd ebx, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7          ; table indices in ebx,ecx,edx
-
-       mov esi, [ebp + %$VFtab]
-       
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-       
-        movlps xmm5, [esi + ebx*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + ebx*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-
-       xorps  xmm3, xmm3
-       ;; fetch charges to xmm3 (temporary)
-       movss   xmm3, [esp + .qqOO]
-       movhps  xmm3, [esp + .qqOH]
-               
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-       
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5
-        ; put scalar force on stack temporarily...
-        movaps [esp + .fstmp], xmm3
-
-        ; dispersion
-       movss  xmm4, [esi + ebx*4 + 16] 
-       movss  xmm5, [esi + ebx*4 + 20] 
-       movss  xmm6, [esi + ebx*4 + 24] 
-       movss  xmm7, [esi + ebx*4 + 28]
-        ; dispersion table ready, in xmm4-xmm7 
-        mulss  xmm6, xmm1       ; xmm6=Geps
-        mulss  xmm7, xmm2       ; xmm7=Heps2
-        addss  xmm5, xmm6
-        addss  xmm5, xmm7       ; xmm5=Fp
-        mulss  xmm7, [esp + .two]       ; two*Heps2
-        addss  xmm7, xmm6
-        addss  xmm7, xmm5 ; xmm7=FF
-        mulss  xmm5, xmm1 ; xmm5=eps*Fp
-        addss  xmm5, xmm4 ; xmm5=VV
-       xorps  xmm4, xmm4
-        movss  xmm4, [esp + .c6]
-        mulps  xmm7, xmm4        ; fijD
-        mulps  xmm5, xmm4        ; vnb6
-        addps  xmm7, [esp + .fstmp] ; add to fscal
-
-        ; put scalar force on stack. Update vnbtot directly.
-        addps  xmm5, [esp + .vnbtot]
-        movaps [esp + .fstmp], xmm7
-        movaps [esp + .vnbtot], xmm5
-
-        ; repulsion
-       movss  xmm4, [esi + ebx*4 + 32] 
-       movss  xmm5, [esi + ebx*4 + 36] 
-       movss  xmm6, [esi + ebx*4 + 40] 
-       movss  xmm7, [esi + ebx*4 + 44]
-        ; table ready, in xmm4-xmm7 
-        mulss  xmm6, xmm1       ; xmm6=Geps
-        mulss  xmm7, xmm2       ; xmm7=Heps2
-        addss  xmm5, xmm6
-        addss  xmm5, xmm7       ; xmm5=Fp
-        mulss  xmm7, [esp + .two]       ; two*Heps2
-        addss  xmm7, xmm6
-        addss  xmm7, xmm5 ; xmm7=FF
-        mulss  xmm5, xmm1 ; xmm5=eps*Fp
-        addss  xmm5, xmm4 ; xmm5=VV
-
-       xorps  xmm4, xmm4
-        movss  xmm4, [esp + .c12]
-        mulps  xmm7, xmm4 ; fijR
-        mulps  xmm5, xmm4 ; vnb12
-        addps  xmm7, [esp + .fstmp] 
-
-        addps  xmm5, [esp + .vnbtot]
-        movaps [esp + .vnbtot], xmm5
-        xorps  xmm1, xmm1
-
-        mulps xmm7, [esp + .tabscale]
-        mulps xmm7, xmm0
-        subps  xmm1, xmm7
-
-       movaps xmm0, xmm1
-       movaps xmm2, xmm1               
-
-       mulps   xmm0, [esp + .dxOO]
-       mulps   xmm1, [esp + .dyOO]
-       mulps   xmm2, [esp + .dzOO]
-       ;; initial update for j forces
-       xorps   xmm3, xmm3
-       xorps   xmm4, xmm4
-       xorps   xmm5, xmm5
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixO]
-       addps   xmm1, [esp + .fiyO]
-       addps   xmm2, [esp + .fizO]
-       movaps  [esp + .fixO], xmm0
-       movaps  [esp + .fiyO], xmm1
-       movaps  [esp + .fizO], xmm2
-
-       
-       ;;  done with i O. Now do i H1 & H2 simultaneously. first get i particle coords:
-       movaps  xmm0, [esp + .ixH1]
-       movaps  xmm1, [esp + .iyH1]
-       movaps  xmm2, [esp + .izH1]     
-       movaps  xmm3, [esp + .ixH2] 
-       movaps  xmm4, [esp + .iyH2] 
-       movaps  xmm5, [esp + .izH2] 
-       subps   xmm0, [esp + .jxO]
-       subps   xmm1, [esp + .jyO]
-       subps   xmm2, [esp + .jzO]
-       subps   xmm3, [esp + .jxO]
-       subps   xmm4, [esp + .jyO]
-       subps   xmm5, [esp + .jzO]
-       movaps [esp + .dxH1O], xmm0
-       movaps [esp + .dyH1O], xmm1
-       movaps [esp + .dzH1O], xmm2
-       movaps [esp + .dxH2O], xmm3
-       movaps [esp + .dyH2O], xmm4
-       movaps [esp + .dzH2O], xmm5
-       mulps xmm0, xmm0
-       mulps xmm1, xmm1
-       mulps xmm2, xmm2
-       mulps xmm3, xmm3
-       mulps xmm4, xmm4
-       mulps xmm5, xmm5
-       addps xmm0, xmm1
-       addps xmm4, xmm3
-       addps xmm0, xmm2        ;  have rsqH1 in xmm0.
-       addps xmm4, xmm5        ;  have rsqH2 in xmm4.
-
-       ;;  start with H1, save H2 data
-       movaps [esp + .rsqH2O], xmm4
-       
-       ;;  do invsqrt
-       rsqrtps xmm1, xmm0
-       rsqrtps xmm5, xmm4
-       movaps  xmm2, xmm1
-       movaps  xmm6, xmm5
-       mulps   xmm1, xmm1
-       mulps   xmm5, xmm5
-       movaps  xmm3, [esp + .three]
-       movaps  xmm7, xmm3
-       mulps   xmm1, xmm0
-       mulps   xmm5, xmm4
-       subps   xmm3, xmm1
-       subps   xmm7, xmm5
-       mulps   xmm3, xmm2
-       mulps   xmm7, xmm6
-       mulps   xmm3, [esp + .half] ; rinv H1 - j water
-       mulps   xmm7, [esp + .half] ; rinv H2 - j water
-
-       ;;  start with H1, save H2 data
-       movaps [esp + .rinvH2O], xmm7
-
-       movaps xmm1, xmm3
-       mulps  xmm1, xmm0       ;  xmm1=r
-       movaps xmm0, xmm3       ;  xmm0=rinv
-       mulps  xmm1, [esp + .tabscale]
-       
-       movhlps xmm2, xmm1      
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-        movd ebx, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7          ; table indices in ebx,ecx,edx
-
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-       
-        movlps xmm5, [esi + ebx*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + ebx*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-
-       xorps  xmm3, xmm3
-       ;; fetch charges to xmm3 (temporary)
-       movss   xmm3, [esp + .qqOH]
-       movhps  xmm3, [esp + .qqHH]
-               
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5    
-
-        xorps  xmm1, xmm1
-
-        mulps xmm3, [esp + .tabscale]
-        mulps xmm3, xmm0
-        subps  xmm1, xmm3
-       
-       movaps  xmm0, xmm1
-       movaps  xmm2, xmm1
-       mulps   xmm0, [esp + .dxH1O]
-       mulps   xmm1, [esp + .dyH1O]
-       mulps   xmm2, [esp + .dzH1O]
-       ;;  update forces H1 - j water
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH1]
-       addps   xmm1, [esp + .fiyH1]
-       addps   xmm2, [esp + .fizH1]
-       movaps  [esp + .fixH1], xmm0
-       movaps  [esp + .fiyH1], xmm1
-       movaps  [esp + .fizH1], xmm2
-       ;; do table for H2 - j water interaction
-       movaps xmm0, [esp + .rinvH2O]
-       movaps xmm1, [esp + .rsqH2O]
-       mulps  xmm1, xmm0       ; xmm0=rinv, xmm1=r
-       mulps  xmm1, [esp + .tabscale]
-       
-       movhlps xmm2, xmm1      
-        cvttps2pi mm6, xmm1
-        cvttps2pi mm7, xmm2     ; mm6/mm7 contain lu indices
-        cvtpi2ps xmm3, mm6
-        cvtpi2ps xmm2, mm7
-       movlhps  xmm3, xmm2
-       subps    xmm1, xmm3     ;  xmm1=eps
-        movaps xmm2, xmm1
-        mulps  xmm2, xmm2       ; xmm2=eps2
-        pslld mm6, 2
-        pslld mm7, 2
-        movd ebx, mm6
-        movd ecx, mm7
-        psrlq mm7, 32
-        movd edx, mm7          ; table indices in ebx,ecx,edx
-
-        lea   ebx, [ebx + ebx*2]
-        lea   ecx, [ecx + ecx*2]
-        lea   edx, [edx + edx*2]
-       
-        movlps xmm5, [esi + ebx*4]
-        movlps xmm7, [esi + ecx*4]
-        movhps xmm7, [esi + edx*4] ; got half coulomb table 
-        movaps xmm4, xmm5
-        shufps xmm4, xmm7, 10001000b
-        shufps xmm5, xmm7, 11011101b 
-
-        movlps xmm7, [esi + ebx*4 + 8]
-        movlps xmm3, [esi + ecx*4 + 8]
-        movhps xmm3, [esi + edx*4 + 8] ; other half of coulomb table
-        movaps xmm6, xmm7
-        shufps xmm6, xmm3, 10001000b
-        shufps xmm7, xmm3, 11011101b 
-        ; coulomb table ready, in xmm4-xmm7 
-        mulps  xmm6, xmm1       ; xmm6=Geps
-        mulps  xmm7, xmm2       ; xmm7=Heps2
-        addps  xmm5, xmm6
-        addps  xmm5, xmm7       ; xmm5=Fp
-        mulps  xmm7, [esp + .two]       ; two*Heps2
-
-       xorps  xmm3, xmm3
-       ;; fetch charges to xmm3 (temporary)
-       movss   xmm3, [esp + .qqOH]
-       movhps  xmm3, [esp + .qqHH]
-               
-        addps  xmm7, xmm6
-        addps  xmm7, xmm5 ; xmm7=FF
-        mulps  xmm5, xmm1 ; xmm5=eps*Fp
-        addps  xmm5, xmm4 ; xmm5=VV
-        mulps  xmm5, xmm3 ; vcoul=qq*VV
-        mulps  xmm3, xmm7 ; fijC=FF*qq
-        ; at this point xmm5 contains vcoul and xmm3 fijC.
-        addps  xmm5, [esp + .vctot]
-        movaps [esp + .vctot], xmm5    
-
-        xorps  xmm1, xmm1
-
-        mulps xmm3, [esp + .tabscale]
-        mulps xmm3, xmm0
-        subps  xmm1, xmm3
-       
-       movaps  xmm0, xmm1
-       movaps  xmm2, xmm1
-       
-       mulps   xmm0, [esp + .dxH2O]
-       mulps   xmm1, [esp + .dyH2O]
-       mulps   xmm2, [esp + .dzH2O]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       subps   xmm3, xmm0
-       subps   xmm4, xmm1
-       subps   xmm5, xmm2
-       mov     esi, [ebp + %$faction]
-       movaps  [esp + .fjxO], xmm3
-       movaps  [esp + .fjyO], xmm4
-       movaps  [esp + .fjzO], xmm5
-       addps   xmm0, [esp + .fixH2]
-       addps   xmm1, [esp + .fiyH2]
-       addps   xmm2, [esp + .fizH2]
-       movaps  [esp + .fixH2], xmm0
-       movaps  [esp + .fiyH2], xmm1
-       movaps  [esp + .fizH2], xmm2
-
-       ;; update j water forces from local variables
-       movlps  xmm0, [esi + eax*4]
-       movlps  xmm1, [esi + eax*4 + 12]
-       movhps  xmm1, [esi + eax*4 + 24]
-       movaps  xmm3, [esp + .fjxO]
-       movaps  xmm4, [esp + .fjyO]
-       movaps  xmm5, [esp + .fjzO]
-       movaps  xmm6, xmm5
-       movaps  xmm7, xmm5
-       shufps  xmm6, xmm6, 10b
-       shufps  xmm7, xmm7, 11b
-       addss   xmm5, [esi + eax*4 + 8]
-       addss   xmm6, [esi + eax*4 + 20]
-       addss   xmm7, [esi + eax*4 + 32]
-       movss   [esi + eax*4 + 8], xmm5
-       movss   [esi + eax*4 + 20], xmm6
-       movss   [esi + eax*4 + 32], xmm7
-       movaps   xmm5, xmm3
-       unpcklps xmm3, xmm4
-       unpckhps xmm5, xmm4
-       addps    xmm0, xmm3
-       addps    xmm1, xmm5
-       movlps  [esi + eax*4], xmm0 
-       movlps  [esi + eax*4 + 12], xmm1 
-       movhps  [esi + eax*4 + 24], xmm1 
-       
-       dec   dword [esp + .innerk]
-       jz    .updateouterdata
-       jmp   .single_loop
-.updateouterdata:
-       mov   ecx, [esp + .ii3]
-       mov   edi, [ebp + %$faction]
-       mov   esi, [ebp + %$fshift]
-       mov   edx, [esp + .is3]
-
-       ; accumulate Oi forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixO]
-       movaps xmm1, [esp + .fiyO] 
-       movaps xmm2, [esp + .fizO]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4]
-       movss  xmm4, [edi + ecx*4 + 4]
-       movss  xmm5, [edi + ecx*4 + 8]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4],     xmm3
-       movss  [edi + ecx*4 + 4], xmm4
-       movss  [edi + ecx*4 + 8], xmm5
-
-       ; accumulate force in xmm6/xmm7 for fshift
-       movaps xmm6, xmm0
-       movss xmm7, xmm2
-       movlhps xmm6, xmm1
-       shufps  xmm6, xmm6, 1000b       
-
-       ; accumulate H1i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH1]
-       movaps xmm1, [esp + .fiyH1]
-       movaps xmm2, [esp + .fizH1]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 12]
-       movss  xmm4, [edi + ecx*4 + 16]
-       movss  xmm5, [edi + ecx*4 + 20]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 12], xmm3
-       movss  [edi + ecx*4 + 16], xmm4
-       movss  [edi + ecx*4 + 20], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; accumulate H2i forces in xmm0, xmm1, xmm2
-       movaps xmm0, [esp + .fixH2]
-       movaps xmm1, [esp + .fiyH2]
-       movaps xmm2, [esp + .fizH2]
-
-       movhlps xmm3, xmm0
-       movhlps xmm4, xmm1
-       movhlps xmm5, xmm2
-       addps  xmm0, xmm3
-       addps  xmm1, xmm4
-       addps  xmm2, xmm5 ; summan ligger i 1/2 i xmm0-xmm2
-
-       movaps xmm3, xmm0       
-       movaps xmm4, xmm1       
-       movaps xmm5, xmm2       
-
-       shufps xmm3, xmm3, 1b
-       shufps xmm4, xmm4, 1b
-       shufps xmm5, xmm5, 1b
-       addss  xmm0, xmm3
-       addss  xmm1, xmm4
-       addss  xmm2, xmm5       ; xmm0-xmm2 has single force in pos0.
-
-       ; increment i force
-       movss  xmm3, [edi + ecx*4 + 24]
-       movss  xmm4, [edi + ecx*4 + 28]
-       movss  xmm5, [edi + ecx*4 + 32]
-       addss  xmm3, xmm0
-       addss  xmm4, xmm1
-       addss  xmm5, xmm2
-       movss  [edi + ecx*4 + 24], xmm3
-       movss  [edi + ecx*4 + 28], xmm4
-       movss  [edi + ecx*4 + 32], xmm5
-
-       ;accumulate force in xmm6/xmm7 for fshift
-       addss xmm7, xmm2
-       movlhps xmm0, xmm1
-       shufps  xmm0, xmm0, 1000b       
-       addps   xmm6, xmm0
-
-       ; increment fshift force
-       movlps  xmm3, [esi + edx*4]
-       movss  xmm4, [esi + edx*4 + 8]
-       addps  xmm3, xmm6
-       addss  xmm4, xmm7
-       movlps  [esi + edx*4],    xmm3
-       movss  [esi + edx*4 + 8], xmm4
-
-       ; get group index for i particle
-       mov   edx, [ebp + %$gid]      ; get group index for this i particle
-       mov   edx, [edx]
-       add   [ebp + %$gid], dword 4  ;  advance pointer
-
-       ; accumulate total potential energy and update it.
-       movaps xmm7, [esp + .vctot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vc]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ; accumulate total lj energy and update it.
-       movaps xmm7, [esp + .vnbtot]
-       ; accumulate
-       movhlps xmm6, xmm7
-       addps  xmm7, xmm6       ; pos 0-1 in xmm7 have the sum now
-       movaps xmm6, xmm7
-       shufps xmm6, xmm6, 1b
-       addss  xmm7, xmm6               
-
-       ; add earlier value from mem.
-       mov   eax, [ebp + %$Vnb]
-       addss xmm7, [eax + edx*4] 
-       ; move back to mem.
-       movss [eax + edx*4], xmm7 
-       
-       ;; finish if last
-       mov   ecx, [ebp + %$nri]
-       dec ecx
-       jecxz .end
-       ;;  not last, iterate once more!
-       mov [ebp + %$nri], ecx
-       jmp .outer
-.end:
-       emms
-       mov eax, [esp + .salign]
-       add esp, eax
-       add esp, 1508
-       pop edi
-       pop esi
-        pop edx
-        pop ecx
-        pop ebx
-        pop eax
-       endproc
-
-
diff --git a/src/kernel/Makefile.am b/src/kernel/Makefile.am

index 0c058ef272bb3e8d3b023b68e3d9bbe14d49a920..288d92218730547ede2b2101fcc0711956495ae0 100644 (file)
--- a/src/kernel/Makefile.am
+++ b/src/kernel/Makefile.am
@@ -9,8 +9,7 @@ include $(srcdir)/../Makefile.inc
  # but it can probably be done in a nicer way...
  INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
  LDFLAGS = @LDFLAGS@ -L${top_builddir}/src/gmxlib -L${top_builddir}/src/mdlib
-LDADD = -lmdXXX_SUFFIX_XXX -lgmxXXX_SUFFIX_XXX
-
+LDADD = ../mdlib/libmdXXX_SUFFIX_XXX.la ../gmxlib/libgmxXXX_SUFFIX_XXX.la
  
  bin_PROGRAMS = \
         gromppXXX_SUFFIX_XXX            mdrunXXX_SUFFIX_XXX             \
@@ -31,14 +30,17 @@ gromppXXX_SUFFIX_XXX_SOURCES = \
         sorting.h       topdirs.h       toppush.h       dum_parm.h      \
         readir.h        topcat.h        topexcl.h       topshake.h
  
+
  mdrunXXX_SUFFIX_XXX_SOURCES = \
         mdrun.c
  
+
  tpbconvXXX_SUFFIX_XXX_SOURCES = \
         readir.c        toputil.c       topdirs.c       add_par.c       \
         topexcl.c       tpbconv.c       add_par.h       toputil.h       \
         topdirs.h       readir.h        topexcl.h
  
+
  pdb2gmxXXX_SUFFIX_XXX_SOURCES = \
         hizzie.c        xlate.c         specbond.c      ter_db.c        \
         h_db.c          genhydro.c      pdb2top.c       gen_ad.c        \
@@ -50,6 +52,7 @@ pdb2gmxXXX_SUFFIX_XXX_SOURCES = \
         topdirs.h       genhydro.h      hizzie.h        specbond.h      \
         topexcl.h
  
+
  protonateXXX_SUFFIX_XXX_SOURCES = \
         hackblock.c     ter_db.c        h_db.c          genhydro.c      \
         pgutil.c        resall.c        add_par.c       topexcl.c       \
@@ -57,18 +60,23 @@ protonateXXX_SUFFIX_XXX_SOURCES = \
         h_db.h          resall.h        toputil.h       hackblock.h     \
         pgutil.h        ter_db.h        topdirs.h       topexcl.h
  
+
  nmrunXXX_SUFFIX_XXX_SOURCES = \
         nmrun.c
  
+
  luckXXX_SUFFIX_XXX_SOURCES = \
         luck.c
  
+
  gmxdumpXXX_SUFFIX_XXX_SOURCES = \
         gmxdump.c
  
+
  gmxcheckXXX_SUFFIX_XXX_SOURCES = \
         gmxcheck.c      tpbcmp.c        tpbcmp.h
  
+
  x2topXXX_SUFFIX_XXX_SOURCES = \
         toppush.c       nm2type.c       pdb2top.c       gen_ad.c        \
         gen_dum.c       pgutil.c        resall.c        hackblock.c     \
@@ -78,11 +86,13 @@ x2topXXX_SUFFIX_XXX_SOURCES = \
         pgutil.h        topdirs.h       toppush.h       topexcl.h       \
         x2top.h
  
+
  xmdrunXXX_SUFFIX_XXX_SOURCES = \
         glaasje.c       glaasje.h       gctio.c         init_sh.c       \
         init_sh.h       ionize.c        ionize.h        ion_data.h      \
         xmdrun.c        do_gct.c        do_gct.h        relax_sh.c
  
+
  install-mdrun: mdrunXXX_SUFFIX_XXX
         $(mkinstalldirs) $(DESTDIR)$(bindir)
         if test -f mdrunXXX_SUFFIX_XXX; then \
@@ -90,6 +100,8 @@ install-mdrun: mdrunXXX_SUFFIX_XXX
              $(INSTALL_PROGRAM) $(INSTALL_STRIP_FLAG) mdrunXXX_SUFFIX_XXX $(DESTDIR)$(bindir)/mdrunXXX_SUFFIX_XXX; \
         fi
  
+
+
  # clean things explicitly, since the target names might have changed
  
  CLEANFILES   =         ${bin_PROGRAMS} ${EXTRA_PROGRAMS}       \
diff --git a/src/mdlib/Makefile.am b/src/mdlib/Makefile.am

index f67f13bb9d093576780022084e07229476203595..03fa1a6a844f4275dc654787f426a5d2e6109f03 100644 (file)
--- a/src/mdlib/Makefile.am
+++ b/src/mdlib/Makefile.am
@@ -9,9 +9,17 @@ include $(srcdir)/../Makefile.inc
  # but it can probably be done in a nicer way...
  INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
  
-lib_LIBRARIES = libmdXXX_SUFFIX_XXX.a
+# Dependencies and extra objects should come before the target definition
+libmdXXX_SUFFIX_XXX_la_LIBADD         = @MDLIB_COND_OBJ@       
+libmdXXX_SUFFIX_XXX_la_DEPENDENCIES   = @MDLIB_COND_OBJ@       
  
-libmdXXX_SUFFIX_XXX_a_SOURCES = \
+
+
+# Finally, the library definition
+
+lib_LTLIBRARIES = libmdXXX_SUFFIX_XXX.la
+
+libmdXXX_SUFFIX_XXX_la_SOURCES = \
         calcmu.c        calcvir.c                       \
         congrad.c       constr.c        coupling.c      \
         dummies.c       ebin.c          edsam.c         \
@@ -28,23 +36,11 @@ libmdXXX_SUFFIX_XXX_a_SOURCES = \
         vcm.c           wnblist.c       poisson.h       \
         splittop.h      wnblist.h
  
-EXTRA_libmdXXX_SUFFIX_XXX_a_SOURCES = \
+EXTRA_libmdXXX_SUFFIX_XXX_la_SOURCES = \
         cshake.c        csettle.c       clincs.c        \
         fshake.f        fsettle.f       flincs.f        \
         fshaked.f       fsettled.f      flincsd.f
  
-if USE_DOUBLE
-  inner_f77_obj        = fshaked.o     fsettled.o      flincsd.o
-else
-  inner_f77_obj        = fshake.o      fsettle.o       flincs.o
-endif
-
-inner_c_obj    = cshake.o      csettle.o       clincs.o
-
-
-libmdXXX_SUFFIX_XXX_a_LIBADD       =   @INNER_F77_OBJ@         @INNER_C_OBJ@
-libmdXXX_SUFFIX_XXX_a_DEPENDENCIES =   @INNER_F77_OBJ@         @INNER_C_OBJ@
-
  # clean things explicitly, since the target names might have changed
  CLEANFILES     = ${lib_LIBRARIES} *_d.a *_mpi.a *~ \\\#*
  
diff --git a/src/ngmx/Makefile.am b/src/ngmx/Makefile.am

index cec3f78b91a8ca4f295d3fb9eea98896f832d64d..590cfd63d18be8851e0645a65b7ef33c8493c4ab 100644 (file)
--- a/src/ngmx/Makefile.am
+++ b/src/ngmx/Makefile.am
@@ -9,7 +9,7 @@ include $(srcdir)/../Makefile.inc
  # but it can probably be done in a nicer way...
  INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
  LDFLAGS = @LDFLAGS@ -L${top_builddir}/src/gmxlib -L${top_builddir}/src/mdlib 
-LDADD = -lmdXXX_SUFFIX_XXX -lgmxXXX_SUFFIX_XXX
+LDADD = ../mdlib/libmdXXX_SUFFIX_XXX.la ../gmxlib/libgmxXXX_SUFFIX_XXX.la
  
  # Ngmx requires X - nothing is built if it doesn't exist
  
@@ -55,4 +55,4 @@ endif
  
  # clean things explicitly, since the target names might have changed
  CLEANFILES   =         ${bin_PROGRAMS} ${EXTRA_PROGRAMS}       \
-               *_mpi *_d *~ \\\#*
-\ No newline at end of file
+               *_mpi *_d *~ \\\#*
diff --git a/src/tools/Makefile.am b/src/tools/Makefile.am

index d414984d3085360d25d2c73cf538290db024fc53..98ba9ab05824955369adde3b6d130440f628003e 100644 (file)
--- a/src/tools/Makefile.am
+++ b/src/tools/Makefile.am
@@ -8,7 +8,7 @@ include $(srcdir)/../Makefile.inc
  # but it can probably be done in a nicer way...
  INCLUDES = @INCLUDES@ -I$(top_srcdir)/src/include
  LDFLAGS = @LDFLAGS@ -L${top_builddir}/src/gmxlib -L${top_builddir}/src/mdlib 
-LDADD = -lmdXXX_SUFFIX_XXX -lgmxXXX_SUFFIX_XXX
+LDADD = ../mdlib/libmdXXX_SUFFIX_XXX.la ../gmxlib/libgmxXXX_SUFFIX_XXX.la
  
  bin_PROGRAMS = \
         averageXXX_SUFFIX_XXX           do_dsspXXX_SUFFIX_XXX           \
author	lindahl <lindahl>
	Wed, 20 Jun 2001 10:33:59 +0000 (10:33 +0000)
committer	lindahl <lindahl>
	Wed, 20 Jun 2001 10:33:59 +0000 (10:33 +0000)
.cvsignore		patch \| blob \| history
Makefile.am		patch \| blob \| history
acinclude.m4		patch \| blob \| history
aclocal.m4		patch \| blob \| history
config/config.guess		patch \| blob \| history
config/ltcf-c.sh	[new file with mode: 0644]	patch \| blob
config/ltcf-f77.sh	[new file with mode: 0644]	patch \| blob
config/ltconfig	[new file with mode: 0755]	patch \| blob
config/ltmain.sh	[new file with mode: 0644]	patch \| blob
configure		patch \| blob \| history
configure.in		patch \| blob \| history
include/config.h.in		patch \| blob \| history
include/copyrite.h		patch \| blob \| history
man/.cvsignore	[new file with mode: 0644]	patch \| blob
man/Makefile.am	[new file with mode: 0644]	patch \| blob
share/html/gif/gmxlogo_small.jpg	[new file with mode: 0644]	patch \| blob
share/html/online.html		patch \| blob \| history
share/html/online/do_dssp.html		patch \| blob \| history
share/html/online/editconf.html		patch \| blob \| history
share/html/online/eneconv.html		patch \| blob \| history
share/html/online/g_anaeig.html		patch \| blob \| history
share/html/online/g_analyze.html		patch \| blob \| history
share/html/online/g_angle.html		patch \| blob \| history
share/html/online/g_bond.html		patch \| blob \| history
share/html/online/g_bundle.html		patch \| blob \| history
share/html/online/g_chi.html		patch \| blob \| history
share/html/online/g_cluster.html		patch \| blob \| history
share/html/online/g_confrms.html		patch \| blob \| history
share/html/online/g_covar.html		patch \| blob \| history
share/html/online/g_density.html		patch \| blob \| history
share/html/online/g_dielectric.html		patch \| blob \| history
share/html/online/g_dih.html		patch \| blob \| history
share/html/online/g_dipoles.html		patch \| blob \| history
share/html/online/g_disre.html		patch \| blob \| history
share/html/online/g_dist.html		patch \| blob \| history
share/html/online/g_dyndom.html		patch \| blob \| history
share/html/online/g_enemat.html		patch \| blob \| history
share/html/online/g_energy.html		patch \| blob \| history
share/html/online/g_gyrate.html		patch \| blob \| history
share/html/online/g_h2order.html		patch \| blob \| history
share/html/online/g_hbond.html		patch \| blob \| history
share/html/online/g_helix.html		patch \| blob \| history
share/html/online/g_lie.html		patch \| blob \| history
share/html/online/g_mdmat.html		patch \| blob \| history
share/html/online/g_mindist.html		patch \| blob \| history
share/html/online/g_morph.html		patch \| blob \| history
share/html/online/g_msd.html		patch \| blob \| history
share/html/online/g_nmeig.html		patch \| blob \| history
share/html/online/g_nmens.html		patch \| blob \| history
share/html/online/g_order.html		patch \| blob \| history
share/html/online/g_potential.html		patch \| blob \| history
share/html/online/g_rama.html		patch \| blob \| history
share/html/online/g_rdf.html		patch \| blob \| history
share/html/online/g_rms.html		patch \| blob \| history
share/html/online/g_rmsdist.html		patch \| blob \| history
share/html/online/g_rmsf.html		patch \| blob \| history
share/html/online/g_rotacf.html		patch \| blob \| history
share/html/online/g_saltbr.html		patch \| blob \| history
share/html/online/g_sas.html		patch \| blob \| history
share/html/online/g_sgangle.html		patch \| blob \| history
share/html/online/g_sorient.html		patch \| blob \| history
share/html/online/g_tcaf.html		patch \| blob \| history
share/html/online/g_traj.html		patch \| blob \| history
share/html/online/g_velacc.html		patch \| blob \| history
share/html/online/genbox.html		patch \| blob \| history
share/html/online/genconf.html		patch \| blob \| history
share/html/online/genion.html		patch \| blob \| history
share/html/online/genpr.html		patch \| blob \| history
share/html/online/gmxcheck.html		patch \| blob \| history
share/html/online/gmxdump.html		patch \| blob \| history
share/html/online/grompp.html		patch \| blob \| history
share/html/online/highway.html		patch \| blob \| history
share/html/online/make_ndx.html		patch \| blob \| history
share/html/online/mdrun.html		patch \| blob \| history
share/html/online/mk_angndx.html		patch \| blob \| history
share/html/online/ngmx.html		patch \| blob \| history
share/html/online/nmrun.html		patch \| blob \| history
share/html/online/pdb2gmx.html		patch \| blob \| history
share/html/online/protonate.html		patch \| blob \| history
share/html/online/style.css		patch \| blob \| history
share/html/online/tpbconv.html		patch \| blob \| history
share/html/online/trjcat.html		patch \| blob \| history
share/html/online/trjconv.html		patch \| blob \| history
share/html/online/trjorder.html		patch \| blob \| history
share/html/online/wheel.html		patch \| blob \| history
share/html/online/x2top.html		patch \| blob \| history
share/html/online/xmdrun.html	[new file with mode: 0644]	patch \| blob
share/html/online/xpm2ps.html		patch \| blob \| history
share/html/online/xrama.html		patch \| blob \| history
src/Makefile.inc		patch \| blob \| history
src/contrib/Makefile.am		patch \| blob \| history
src/contrib/gromacs-3.0.spec	[new file with mode: 0644]	patch \| blob
src/contrib/programs.txt		patch \| blob \| history
src/contrib/scripts/Makefile.am		patch \| blob \| history
src/contrib/scripts/mkhtml		patch \| blob \| history
src/gmxlib/Makefile.am		patch \| blob \| history
src/gmxlib/axp_asm.s	[moved from src/gmxlib/axp_asm.S with 100% similarity]	patch \| blob \| history
src/gmxlib/bfunc.h		patch \| blob \| history
src/gmxlib/copyrite.c		patch \| blob \| history
src/gmxlib/fnbf.c		patch \| blob \| history
src/gmxlib/mkinl.h		patch \| blob \| history
src/gmxlib/wman.c		patch \| blob \| history
src/gmxlib/x86_3dnow.S	[new file with mode: 0644]	patch \| blob
src/gmxlib/x86_3dnow.asm	[deleted file]	patch \| blob \| history
src/gmxlib/x86_cpuid.S	[new file with mode: 0644]	patch \| blob
src/gmxlib/x86_cpuid.asm	[deleted file]	patch \| blob \| history
src/gmxlib/x86_sse.S	[new file with mode: 0644]	patch \| blob
src/gmxlib/x86_sse.asm	[deleted file]	patch \| blob \| history
src/kernel/Makefile.am		patch \| blob \| history
src/mdlib/Makefile.am		patch \| blob \| history
src/ngmx/Makefile.am		patch \| blob \| history
src/tools/Makefile.am		patch \| blob \| history