Merge branch 'rotation-4-5' into rotation
authorCarsten Kutzner <ckutzne@gwdg.de>
Thu, 4 Nov 2010 10:51:56 +0000 (11:51 +0100)
committerCarsten Kutzner <ckutzne@gwdg.de>
Thu, 4 Nov 2010 10:51:56 +0000 (11:51 +0100)
Conflicts:
src/tools/Makefile.am

70 files changed:
CMakeLists.txt
cmake/FindMPI.cmake
cmake/TestMPI.c [new file with mode: 0644]
cmake/Toolchain-BlueGeneL-xlc.cmake [new file with mode: 0644]
configure.ac
include/CMakeLists.txt
include/gmx_sse2_double.h
include/pdb2top.h
include/pull_rotation.h
include/string2.h
include/types/forcerec.h
include/types/nblist.h
include/vec.h
man/CMakeLists.txt
scripts/CMakeLists.txt
share/CMakeLists.txt
share/html/online/mdp_opt.html
share/top/charmm27.ff/aminoacids.r2b
share/top/charmm27.ff/gb.itp
share/top/gromos43a1.ff/aminoacids.r2b
share/top/gromos43a1.ff/methanol.itp
share/top/gromos43a2.ff/aminoacids.r2b
share/top/gromos45a3.ff/aminoacids.r2b
share/top/gromos53a5.ff/aminoacids.r2b
share/top/gromos53a6.ff/aminoacids.r2b
src/config.h.cmakein
src/gmxlib/CMakeLists.txt
src/gmxlib/copyrite.c
src/gmxlib/enxio.c
src/gmxlib/gmxfio.c
src/gmxlib/nonbonded/nb_kernel_bluegene/interaction.h
src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.c
src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.c
src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.c
src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel_allvsallgb_sse2_double.c
src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel400_sse2_double.c
src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel410_sse2_double.c
src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel430_sse2_double.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel400_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel410_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel430_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.c
src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.c
src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.c
src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel_allvsallgb_sse2_double.c
src/kernel/CMakeLists.txt
src/kernel/grompp.c
src/kernel/openmm_wrapper.cpp
src/kernel/pdb2gmx.c
src/kernel/pdb2top.c
src/mdlib/CMakeLists.txt
src/mdlib/clincs.c
src/mdlib/coupling.c
src/mdlib/domdec.c
src/mdlib/ebin.c
src/mdlib/genborn.c
src/mdlib/genborn_allvsall.c
src/mdlib/genborn_sse2_double.c
src/mdlib/genborn_sse2_single.c
src/mdlib/gmx_fft_fftw3.c
src/mdlib/gmx_wallcycle.c
src/mdlib/mdebin.c
src/mdlib/pme.c
src/mdlib/pull_rotation.c
src/ngmx/CMakeLists.txt
src/tools/CMakeLists.txt
src/tools/gmx_bar.c
src/tools/gmx_hbond.c
src/tools/gmx_potential.c
src/tools/make_ndx.c

index bee6e3f585233e682d5c263a6f2ea5e8f4bb1094..0d49a828f7584c5d3a90a3d5747161c2fea02e68 100644 (file)
@@ -3,9 +3,16 @@ cmake_minimum_required(VERSION 2.6)
 project(Gromacs)
 include(Dart)
 mark_as_advanced(DART_ROOT)
+
 # PROJECT_VERSION should have the following structure: 
-# VERSION[-dev-SUFFIX] where the VERSION can have any form and the suffix
-set(PROJECT_VERSION "4.5.1")
+# VERSION-dev[-SUFFIX] where the VERSION should have the form: vMajor.vMinor.vPatch
+#
+# The "-dev" suffix is important to keep because it makes it possible to distinguish
+# between a build from an official release and a build from the git release branch on a
+# machine with no git.
+#
+# NOTE: when releasing the "-dev" suffix needs to be stripped off!
+set(PROJECT_VERSION "4.5.2-dev")
 set(CUSTOM_VERSION_STRING ""
     CACHE STRING "Custom version string (if empty, use hard-coded default)")
 mark_as_advanced(CUSTOM_VERSION_STRING)
@@ -16,7 +23,7 @@ set(SOVERSION 6)
 # It is a bit irritating, but this has to be set separately for now!
 SET(CPACK_PACKAGE_VERSION_MAJOR "4")
 SET(CPACK_PACKAGE_VERSION_MINOR "5")
-SET(CPACK_PACKAGE_VERSION_PATCH "1")
+SET(CPACK_PACKAGE_VERSION_PATCH "2")
 
 
 # Cmake modules/macros are in a subdirectory to keep this file cleaner
@@ -81,6 +88,8 @@ option(GMX_MPI    "Build a parallel (message-passing) version of GROMACS" OFF)
 option(GMX_THREADS    "Build a parallel (thread-based) version of GROMACS (cannot be combined with MPI yet)" ON)
 option(GMX_SOFTWARE_INVSQRT "Use GROMACS software 1/sqrt" ON)
 mark_as_advanced(GMX_SOFTWARE_INVSQRT)
+option(GMX_POWERPC_INVSQRT "Use PowerPC hardware 1/sqrt" ON)
+mark_as_advanced(GMX_POWERPC_INVSQRT)
 option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
 mark_as_advanced(GMX_FAHCORE)
 option(GMX_OPENMM "Accelerated execution on GPUs through the OpenMM library (rerun cmake after changing to see relevant options)" OFF)
@@ -113,6 +122,11 @@ mark_as_advanced(USE_VERSION_H)
 
 option(GMX_DEFAULT_SUFFIX "Use default suffixes for GROMACS binaries and libs (_d for double, _mpi for MPI; rerun cmake after changing to see relevant options)" ON)
 
+if(UNIX AND NOT APPLE)
+    option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer \".a\" static archives (NOTE: this is enabled only for UNIX (excluding APPLE) platforms but it might not always work!" OFF)
+    mark_as_advanced(GMX_PREFER_STATIC_LIBS)
+endif()
+
 ########################################################################
 # Set up binary and library suffixing 
 ########################################################################
@@ -150,6 +164,9 @@ endif(GMX_DOUBLE)
 if(GMX_SOFTWARE_INVSQRT)
   set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_SOFTWARE_INVSQRT")
 endif(GMX_SOFTWARE_INVSQRT)
+if(GMX_POWERPC_INVSQRT)
+  set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_POWERPC_INVSQRT")
+endif(GMX_POWERPC_INVSQRT)
 
 ########################################################################
 #Process MPI settings
@@ -341,6 +358,15 @@ test_big_endian(GMX_INTEGER_BIG_ENDIAN)
 ########################################################################
 # Find external packages                                               #
 ########################################################################
+if(UNIX AND NOT APPLE)
+    if(GMX_PREFER_STATIC_LIBS)
+        SET(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+        if(BUILD_SHARED_LIBS)
+            message(WARNING "Static libraries requested, the GROMACS libraries will also be build static (BUILD_SHARED_LIBS=OFF)")
+            set(BUILD_SHARED_LIBS OFF CACHE BOOL "Enable shared libraries (can be problematic with MPI, Windows)" FORCE)
+        endif()
+    endif()
+endif()
 
 find_package(LibXml2)
 set(PKG_XML "")
@@ -467,7 +493,6 @@ if (${GMX_ACCELERATION} STREQUAL "auto" AND NOT GMX_OPENMM)
   endif()
 endif (${GMX_ACCELERATION} STREQUAL "auto" AND NOT GMX_OPENMM)
 
-
 include(gmxTestXDR)
 gmx_test_xdr(GMX_SYSTEM_XDR)
 if(NOT GMX_SYSTEM_XDR)
@@ -527,9 +552,25 @@ elseif(${GMX_ACCELERATION} STREQUAL "FORTRAN")
     set(GMX_IA32_ASM 0)
     set(GMX_GMX_X86_64_ASM 0)
 elseif(${GMX_ACCELERATION} STREQUAL "BLUEGENE")
+# GMX_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
+    message(STATUS "Configuring for BlueGene")
     set(GMX_BLUEGENE 1)
+    if (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
+        set(SHARED_LIBS_DEFAULT OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
+        set(BUILD_SHARED_LIBS OFF CACHE BOOL "Shared libraries not compatible with BlueGene/L, disabled!" FORCE)
+    endif (${CMAKE_SYSTEM_NAME} STREQUAL "BlueGeneL")
+    set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on BlueGene" FORCE)
+    set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on BlueGene" FORCE)
+    set(GMX_X11 OFF CACHE BOOL "X11 not compatible with BlueGene, disabled!" FORCE)
+    set(GMX_THREADS OFF CACHE BOOL "Threads not compatible with BlueGene, disabled!" FORCE)
+    set(GMX_MPI ON CACHE BOOL "Use MPI on BlueGene" FORCE)
+    set(GMX_EXTERNAL_BLAS TRUE CACHE BOOL "Use MASSV for BLAS on BlueGene" FORCE)
+    set(GMX_EXTERNAL_LAPACK TRUE CACHE BOOL "Use MASSV for LAPACK on BlueGene" FORCE)
+    list(APPEND GMX_EXTRA_LIBRARIES massv)
 elseif(${GMX_ACCELERATION} STREQUAL "POWER6")
     set(GMX_POWER6 1)
+    set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Do not use software reciprocal square root on Power6" FORCE)
+    set(GMX_POWERPC_INVSQRT ON CACHE BOOL "Use hardware reciprocal square root on Power6" FORCE)
 elseif(${GMX_ACCELERATION} STREQUAL "IA64")
     set(GMX_IA64_ASM 1)
     set(DISABLE_WATERWATER_NLIST 1)
index 481b0e994dfa5d6c73c3a888444c90637a869a19..a0c36ed77af5dd34443a3886296d45098bffda5c 100644 (file)
 # Microsoft HPC SDK is automatically added to the system path
 # Argonne National Labs MPICH2 sets a registry key that we can use.
 
+TRY_COMPILE(MPI_FOUND ${CMAKE_BINARY_DIR}
+  "${CMAKE_SOURCE_DIR}/cmake/TestMPI.c"
+  COMPILE_DEFINITIONS )
+
+if(MPI_FOUND)
+  return()
+endif()
+
 set(_MPI_PACKAGE_DIR
   mpi
   mpich
diff --git a/cmake/TestMPI.c b/cmake/TestMPI.c
new file mode 100644 (file)
index 0000000..4eab044
--- /dev/null
@@ -0,0 +1,6 @@
+#include <mpi.h> 
+
+int main(int argc, char **argv) /* probe program: built by TRY_COMPILE in cmake/FindMPI.cmake */
+{
+  MPI_Init(&argc,&argv); /* uses an MPI call so the probe only builds when mpi.h is usable */
+}
diff --git a/cmake/Toolchain-BlueGeneL-xlc.cmake b/cmake/Toolchain-BlueGeneL-xlc.cmake
new file mode 100644 (file)
index 0000000..5e0eadc
--- /dev/null
@@ -0,0 +1,51 @@
+# derived from http://cmake.org/Wiki/CmakeBlueGene
+
+# the name of the target operating system
+set(CMAKE_SYSTEM_NAME BlueGeneL CACHE STRING "Cross-compiling for BlueGene/L")
+
+# adjust to suit your machine's versions
+#    /bgl/BlueLight/V1R3M2_140_2007-070424/ppc/bglsys
+set(BLRTS_PATH /bgl/BlueLight/V1R3M4_300_2008-080728/ppc/bglsys CACHE STRING "Path to the BlueGene/L system libraries and includes")
+
+# set the compiler
+set(CMAKE_C_COMPILER  /opt/ibmcmp/vac/bg/8.0/bin/blrts_xlc)
+set(CMAKE_C_FLAGS "-O3 -qbgl -qarch=auto -qtune=auto -qnoautoconfig -qfloat=norngchk -qhot")
+set(CMAKE_EXE_LINKER_FLAGS "-L${BLRTS_PATH}/lib")
+set(CMAKE_CXX_COMPILER  /opt/ibmcmp/vacpp/bg/8.0/bin/blrts_xlC)
+
+set(MPI_LIBRARY mpich.rts CACHE STRING "MPI library for BlueGene" FORCE)
+set(MPI_EXTRA_LIBRARY msglayer.rts devices.rts rts.rts devices.rts CACHE STRING "Extra MPI libraries for BlueGene" FORCE)
+set(MPI_INCLUDE_PATH ${BLRTS_PATH}/include  CACHE STRING "MPI include path for BlueGene" FORCE)
+
+# This adds directories that find commands should specifically ignore for cross compiles.
+# Most of these directories are the include and lib directories for the frontend on BG/P systems.
+# Not ignoring these can cause things like FindX11 to find a frontend PPC version mistakenly.
+# We use this on BG instead of re-rooting because backend libraries are typically strewn about
+# the filesystem, and we can't re-root ALL backend libraries to a single place.
+
+set(CMAKE_SYSTEM_IGNORE_PATH
+  /lib             /lib64             /include
+  /usr/lib         /usr/lib64         /usr/include
+  /usr/local/lib   /usr/local/lib64   /usr/local/include
+  /usr/X11/lib     /usr/X11/lib64     /usr/X11/include
+  /usr/lib/X11     /usr/lib64/X11     /usr/include/X11
+  /usr/X11R6/lib   /usr/X11R6/lib64   /usr/X11R6/include
+  /usr/X11R7/lib   /usr/X11R7/lib64   /usr/X11R7/include
+)
+
+# set the search path for the environment coming with the compiler
+# and a directory where you can install your own compiled software
+set(CMAKE_FIND_ROOT_PATH
+    /bgl/BlueLight/ppcfloor/
+    ${BLRTS_PATH}
+    /opt/ibmcmp/xlmass/bg
+)
+
+# adjust the default behaviour of the FIND_XXX() commands:
+# search headers and libraries in the target environment, search 
+# programs in the host environment
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+set(GMX_ACCELERATION "BlueGene" CACHE STRING "Forcing BlueGene acceleration when using BlueGene toolchain")
index ebdad8f4c779b6c3bedff127896fb1c62319129c..f9b9ef1d5b45b70b598829e8958dbaf9c86d087f 100644 (file)
@@ -3,7 +3,13 @@
 #######################################################################
  
 AC_PREREQ(2.50)
-AC_INIT(gromacs, 4.5.1, [gmx-users@gromacs.org])
+
+# The "-dev" suffix is important to keep because it makes it possible to distinguish
+# between a build from an official release and a build from the git release branch on a
+# machine with no git.
+#
+# NOTE: when releasing the "-dev" suffix needs to be stripped off!
+AC_INIT(gromacs, 4.5.2-dev, [gmx-users@gromacs.org])
 AC_CONFIG_SRCDIR(src/gmxlib/3dview.c)
 AC_CONFIG_AUX_DIR(config)
 AC_CANONICAL_HOST
@@ -153,9 +159,9 @@ esac
 # IBM Power6-specific optimization
 AC_ARG_ENABLE(power6,
              [AC_HELP_STRING([--enable-power6],
-                             [Use IBM Pwr6/PPC440/PPC450-specific F77 kernels])],,enable_power6=no)
+                             [Use IBM Power6-specific F77 kernels])],,enable_power6=no)
 if test "$enable_power6" = "yes"; then
-  AC_DEFINE(GMX_POWER6,,[Enable IBM Pwr6/PPC440/PPC450-specific F77 kernels])
+  AC_DEFINE(GMX_POWER6,,[Enable IBM Power6-specific F77 kernels])
 fi
 
 AC_ARG_ENABLE(bluegene,
@@ -174,11 +180,11 @@ AC_ARG_ENABLE(software-invsqrt,
               [AC_HELP_STRING([--disable-software-invsqrt],
                               [No software 1/sqrt (disabled on sgi,ibm,ia64)])],,
 [case "${host_cpu}-${host_os}" in
-  mips*-irix* | rs6000*-aix* | powerpc*-aix | ia64*-*) enable_software_invsqrt=no ;;
+  mips*-irix* | rs6000*-aix* | powerpc*-aix | powerpc*-none | ia64*-*) enable_software_invsqrt=no ;;
   *) enable_software_invsqrt=yes ;;
 esac])
 if test "$enable_software_invsqrt" = "yes"; then
-  AC_DEFINE(GMX_SOFTWARE_INVSQRT,,[Use the GROMACS sGMX_INTERNAL_XDRsqrt(x)])
+  AC_DEFINE(GMX_SOFTWARE_INVSQRT,,[Use the GROMACS software 1/sqrt(x)])
   PKG_CFLAGS="$PKG_CFLAGS -DGMX_SOFTWARE_INVSQRT"
 fi
 AM_CONDITIONAL([GMX_SOFTWARE_INVSQRT],[test "$enable_software_invsqrt" = "yes"])
@@ -1126,6 +1132,8 @@ fi
 
 if test "$enable_bluegene" = "yes"; then
   AC_DEFINE(GMX_BLUEGENE,,[Use assembly intrinsics kernels for BlueGene])
+  AC_DEFINE_UNQUOTED(GMX_POWERPC_INVSQRT,,[Use the PowerPC hardware 1/sqrt(x)])
+  PKG_CFLAGS="$PKG_CFLAGS -DGMX_POWERPC_INVSQRT"
 fi
 
 if test "$enable_fortran" = "yes"; then
index c0248b9c2cf257c181499e4a4b480953da3b21ad..05b41fd8366783345984428c0d0b928a2e767cdc 100644 (file)
@@ -1,5 +1,6 @@
 # includes: Nothing to build, just installation
 install(DIRECTORY . DESTINATION ${INCL_INSTALL_DIR}/gromacs
+  COMPONENT development
   PATTERN "Makefile*" EXCLUDE
   PATTERN "CMake*" EXCLUDE
   PATTERN "cmake*" EXCLUDE
index cf9ee16c7805bac5d9504b43eaae0d256e73fbd7..d6635af150dd37dec874ce96c189e44c363cbb84 100644 (file)
@@ -476,7 +476,7 @@ gmx_mm_sincos_pd(__m128d x,
     };
 #endif
     
-    const __m128d signmask    = _mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
+    const __m128d signmask    = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
     const __m128d tabscale    = _mm_set1_pd(32.0/M_PI);
     const __m128d invtabscale = _mm_set1_pd(M_PI/32.0);
     const __m128d one         = _mm_set1_pd(1.0);
@@ -533,10 +533,10 @@ gmx_mm_sincos_pd(__m128d x,
     cswapsign = _mm_shuffle_epi32(cswapsign,_MM_SHUFFLE(1,1,0,0));
     minusone  = _mm_sub_pd(_mm_setzero_pd(),one);
     
-    ssign     = _mm_or_pd(_mm_and_pd( _mm_castsi128_pd(sswapsign),minusone ),
-                          _mm_andnot_pd( _mm_castsi128_pd(sswapsign),one ));
-    csign     = _mm_or_pd(_mm_and_pd( _mm_castsi128_pd(cswapsign),minusone ),
-                          _mm_andnot_pd( _mm_castsi128_pd(cswapsign),one ));
+    ssign     = _mm_or_pd(_mm_and_pd( gmx_mm_castsi128_pd(sswapsign),minusone ),
+                          _mm_andnot_pd( gmx_mm_castsi128_pd(sswapsign),one ));
+    csign     = _mm_or_pd(_mm_and_pd( gmx_mm_castsi128_pd(cswapsign),minusone ),
+                          _mm_andnot_pd( gmx_mm_castsi128_pd(cswapsign),one ));
     
     /* First lookup into table */
 #ifdef _MSC_VER
index 0fc4ab5d197a6ecd9ad2e7555e79589c400f0bc4..a58f6199faa5c90e8e6fc063551837b30b24c5a5 100644 (file)
@@ -107,7 +107,7 @@ void pdb2top(FILE *top_file, char *posre_fn, char *molname,
                    int nrtp, t_restp rtp[],
                    t_restp *restp, t_hackblock *hb,
                    int nterpairs, t_hackblock **ntdb, t_hackblock **ctdb,
-                   int *rn, int *rc, gmx_bool bAllowMissing,
+                   gmx_bool bAllowMissing,
                    gmx_bool bVsites, gmx_bool bVsiteAromatics,
                    const char *ff, const char *ffdir,
                    real mHmult,
index df6ed1ffd1aaafd997c1e3ba20660f1b557c0b2a..403d571c4239c3d5e4ad67d11162bd3c57873a82 100644 (file)
@@ -79,7 +79,17 @@ extern void init_rot(FILE *fplog,t_inputrec *ir,int nfile,const t_filenm fnm[],
         t_commrec *cr, rvec *x, matrix box, gmx_mtop_t *mtop, const output_env_t oenv,
         gmx_bool bVerbose, unsigned long Flags);
 
-extern void dd_make_local_rotation_groups(gmx_domdec_t *dd,t_rot *rot,t_mdatoms *md);
+
+/*! \brief Make a selection of the home atoms for all enforced rotation groups.
+ *
+ * This routine is similar to dd_make_local_pull_groups, but works only with
+ * domain decomposition. It should be called at every domain decomposition.
+ *
+ * \param dd                Structure containing domain decomposition data.
+ * \param rot               Pointer to all the enforced rotation data.
+ */
+extern void dd_make_local_rotation_groups(gmx_domdec_t *dd,t_rot *rot);
+
 
 /*! \brief Calculation of the enforced rotation potential.
  * 
index 95cd558880b6eb05d9d298c92c125e9b80a714da..0f26e3ffbc4973a362fe3db94d424b1e2855a75a 100644 (file)
@@ -123,6 +123,10 @@ char **split(char sep,const char *str);
 
 gmx_large_int_t str_to_large_int_t(const char *str, char **endptr);
 
+#if ((defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64) && !defined __CYGWIN__ && !defined __CYGWIN32__)
+#define snprintf _snprintf
+#endif
+
 #ifdef __cplusplus
 }
 #endif
index 405b503bd0a7b39ee62af37ddc399387ef3ce914..70b9e0d2ff02272c94a0f5bbaed92171592befea 100644 (file)
@@ -68,10 +68,12 @@ typedef struct {
 } t_nblists;
 
 /* macros for the cginfo data in forcerec */
-/* The maximum cg size is 255, because we only have space for 8 bits in cginfo,
+/* The maximum cg size in cginfo is 255,
+ * because we only have space for 8 bits in cginfo,
  * this cg size entry is actually only read with domain decomposition.
+ * But there is a smaller limit due to the t_excl data structure
+ * which is defined in nblist.h.
  */
-#define MAX_CHARGEGROUP_SIZE 256
 #define SET_CGINFO_GID(cgi,gid)      (cgi) = (((cgi)  &  ~65535)  |  (gid)   )
 #define GET_CGINFO_GID(cgi)        ( (cgi)            &   65535)
 #define SET_CGINFO_EXCL_INTRA(cgi)   (cgi) =  ((cgi)  |  (1<<16))
index 843435e001bd16e9c11b7d96ca6e3c68e35ea5d0..ae5aea1853383ac0d009469cb3e8842694d8ea22 100644 (file)
@@ -50,6 +50,11 @@ enum {
 
 typedef unsigned long t_excl;
 
+/* The maximum charge group size, which is limited because the minimum
+ * size of t_excl could be 32 bits.
+ */
+#define MAX_CHARGEGROUP_SIZE 32
+
 /* The maximum charge group size for CG-CG nblists.
  * The excl entry in t_nblist uses blocks of this size.
  */
index a2b0c099d4a5600b8799da325b467556048d8016..10d221c259afa7a8c11de1020cbe6a07a01c7ce3 100644 (file)
@@ -785,7 +785,7 @@ static gmx_inline void mvmul(matrix a,const rvec src,rvec dest)
 static gmx_inline void mvmul_ur0(matrix a,const rvec src,rvec dest)
 {
   dest[ZZ]=a[ZZ][XX]*src[XX]+a[ZZ][YY]*src[YY]+a[ZZ][ZZ]*src[ZZ];
-  dest[YY]=a[YY][XX]*src[XX]+a[YY][YY];
+  dest[YY]=a[YY][XX]*src[XX]+a[YY][YY]*src[YY];
   dest[XX]=a[XX][XX]*src[XX];
 }
 
index 18cf1d1c47bffb116e7f37ebb6f4680a9e3f8fc3..82953fe841083efd853c2467e993a63dd2608863 100644 (file)
@@ -1,5 +1,6 @@
 # Man pages: Nothing to build, just installation
 install(DIRECTORY . DESTINATION ${MAN_INSTALL_DIR}
+  COMPONENT data
   PATTERN "Makefile*" EXCLUDE
   PATTERN "CMake*" EXCLUDE
   PATTERN "cmake*" EXCLUDE
index 87b51fce0dc33ec9ab3ae4114ad827f128f5ae9f..a89c958615d23f5134f0b84ee013035be0cea526 100644 (file)
@@ -3,14 +3,14 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/GMXRC.bash.cmakein ${CMAKE_CURRENT_BI
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/GMXRC.csh.cmakein  ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.csh @ONLY)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/GMXRC.zsh.cmakein  ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.zsh @ONLY)
 
-install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC      DESTINATION ${BIN_INSTALL_DIR})
-install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.bash DESTINATION ${BIN_INSTALL_DIR})
-install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.zsh  DESTINATION ${BIN_INSTALL_DIR})
-install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.csh  DESTINATION ${BIN_INSTALL_DIR})
+install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC      DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
+install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.bash DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
+install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.zsh  DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
+install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/GMXRC.csh  DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
 
 file(GLOB EXTRA_SCRIPTS completion.*)
-install(FILES ${EXTRA_SCRIPTS} DESTINATION ${BIN_INSTALL_DIR})
+install(FILES ${EXTRA_SCRIPTS} DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
 
-install(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/demux.pl      DESTINATION ${BIN_INSTALL_DIR})
-install(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/xplor2gmx.pl  DESTINATION ${BIN_INSTALL_DIR})
+install(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/demux.pl      DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
+install(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/xplor2gmx.pl  DESTINATION ${BIN_INSTALL_DIR} COMPONENT runtime)
 
index 86f1a85517612e0c2b44243cd499f8eb32ecd4c9..dfebb7f8419c0108c7327b49f5605676f14d90d5 100644 (file)
@@ -1,9 +1,13 @@
 # Data: Nothing to build, just installation
 install(DIRECTORY . DESTINATION ${DATA_INSTALL_DIR}
+  COMPONENT data
   PATTERN "Makefile*" EXCLUDE
   PATTERN "CMake*" EXCLUDE
   PATTERN "cmake*" EXCLUDE
   PATTERN "*~" EXCLUDE
 )
 
-install(FILES template/CMakeLists.txt.template DESTINATION ${DATA_INSTALL_DIR} RENAME template/CMakeLists.txt)
+install(FILES template/CMakeLists.txt.template
+        DESTINATION ${DATA_INSTALL_DIR}
+        RENAME template/CMakeLists.txt
+        COMPONENT data)
index c132a4af150727b3c63517a6d455fb1f51e29c64..a510d3e84bf6c3d04bd4f63c96d50cd0567c2916 100644 (file)
@@ -1116,11 +1116,17 @@ there is also a wall at z=z_box. Walls can only be used with <b>pbc=xy</b>.
 When set to <b>2</b> pressure coupling and Ewald summation can be used
 (it is usually best to use semiisotropic pressure coupling with
 the x/y compressibility set to 0, as otherwise the surface area will change).
+Walls interact with the rest of the system through an optional <tt>wall_atomtype</tt>.
 Energy groups <tt>wall0</tt> and <tt>wall1</tt> (for <b>nwall=2</b>) are
 added automatically to monitor the interaction of energy groups
 with each wall.
 The <A HREF="#run">center of mass motion removal</A> will be turned
 off in the z-direction.</dd>
+<dt><b>wall_atomtype:</b></dt>
+<dd>the atom type name in the force field for each wall. 
+By (for example) defining a special wall atom type in the topology with its 
+own combination rules, this allows for independent tuning of the interaction 
+of each atomtype with the walls.</dd>
 <dt><b>wall_type:</b></dt>
 <dl>
 <dt><b>9-3</b></dt>
@@ -1143,9 +1149,6 @@ are beyond a wall.
 When the value is &le;0 (&lt;0 for <b>wall_type=table</b>),
 a fatal error is generated when atoms are beyond a wall.
 </dd>
-<dt><b>wall_atomtype:</b></dt>
-<dd>the atom type name in the force field for each wall, this allows
-for independent tuning of the interaction of each atomtype with the walls</dd>
 <dt><b>wall_density: [nm<sup>-3</sup>/nm<sup>-2</sup>]</b></dt>
 <dd>the number density of the atoms for each wall for wall types
 <b>9-3</b> and <b>10-4</b>
index 0f4f982055ce66543c751b0039e2a891f389c053..9296d0194d9bb9d011ba1b0e0eed5584ac66f729 100644 (file)
@@ -6,3 +6,4 @@ HISH    HSP
 LYSN   LSN
 ASPH   ASPP
 GLUH   GLUP
+HEM     HEME
index a4f29e117c2fb5b4d99ef3949085394730fb4e95..bdd87ef10b710f6200255c012a3fb1a71d178e24 100644 (file)
@@ -41,6 +41,7 @@
  HP             0.1     1       1       0.125    0.85 ; H
  NY             0.155   1       1.028   0.17063  0.79 ; N
  CPT            0.172   0.012   1.554   0.1875   0.72 ; C     
- MNH3          0       0       0       0        0    ; vsite (rigid tetrahedrical NH3 group)
- MNH2          0       0       0       0        0    ; vsite
- MCH3          0       0       0       0        0    ; vsite (rigid CH3 group)
\ No newline at end of file
+ MNH3          0       0       0       0        0    ; dummy mass
+ MNH2          0       0       0       0        0    ; dummy mass
+ MCH3          0       0       0       0        0    ; dummy mass
+ MCH3S         0       0       0       0        0    ; dummy mass
index 823db9041c643335d7f9a9148250de288c7c7beb..551bb3517459c8ba361695d782347caedf9359a2 100644 (file)
@@ -5,3 +5,4 @@ HISD   HISA
 HISE   HISB
 LYS    LYSH
 LYSN   LYS
+HEM    HEME
index e31e651f8ca0ed85875849321bcd18f2afca9874..61b545b1d3c1a2041c4a962580928ce9599ad399 100644 (file)
@@ -2,9 +2,9 @@
 
 [ atomtypes ]
 ;   type      mass    charge    ptype       c6            c12
-    OMET    15.999    -0.69     A      2.6169e-3      2.5231e-6
+    OMet    15.999    -0.69     A      2.6169e-3      2.5231e-6
       OW    15.999    -0.82     A      2.6170e-3      2.6330e-6
-    CMET    15.035     0.29     A      8.8758e-3     17.8426e-6
+    CMet    15.035     0.29     A      8.8758e-3     17.8426e-6
        H     1.008     0.4      A      0.0            0.0
       HW     1.008     0.41     A      0.0            0.0
 #endif
@@ -16,12 +16,12 @@ Methanol        2
 [ atoms ]
 ;   nr  type    resnr   residu  atom    cgnr    charge mass
 #ifdef _FF_GROMOS96
-1       CMET     1       MeOH    Me1     1        0.176 15.035   
-2       OMET     1       MeOH    O2      1       -0.574 15.999 
+1       CMet     1       MeOH    Me1     1        0.176 15.035   
+2       OMet     1       MeOH    O2      1       -0.574 15.999 
 3       H        1       MeOH    H3      1        0.398  1.008 
 #else
-1       CMET     1       MeOH    Me1     1        0.29  15.035
-2       OMET     1       MeOH    O2      1       -0.69  15.999
+1       CMet     1       MeOH    Me1     1        0.29  15.035
+2       OMet     1       MeOH    O2      1       -0.69  15.999
 3       H        1       MeOH    H3      1        0.40   1.008
 #endif
 
index 823db9041c643335d7f9a9148250de288c7c7beb..551bb3517459c8ba361695d782347caedf9359a2 100644 (file)
@@ -5,3 +5,4 @@ HISD   HISA
 HISE   HISB
 LYS    LYSH
 LYSN   LYS
+HEM    HEME
index 823db9041c643335d7f9a9148250de288c7c7beb..551bb3517459c8ba361695d782347caedf9359a2 100644 (file)
@@ -5,3 +5,4 @@ HISD   HISA
 HISE   HISB
 LYS    LYSH
 LYSN   LYS
+HEM    HEME
index 823db9041c643335d7f9a9148250de288c7c7beb..551bb3517459c8ba361695d782347caedf9359a2 100644 (file)
@@ -5,3 +5,4 @@ HISD   HISA
 HISE   HISB
 LYS    LYSH
 LYSN   LYS
+HEM    HEME
index 823db9041c643335d7f9a9148250de288c7c7beb..551bb3517459c8ba361695d782347caedf9359a2 100644 (file)
@@ -5,3 +5,4 @@ HISD   HISA
 HISE   HISB
 LYS    LYSH
 LYSN   LYS
+HEM    HEME
index a8c687baa02d537c7fb29863310276fc07047d50..6b51da84d4f6af7397cb6e3220cc936a732fd0c4 100644 (file)
 /* Use the GROMACS software 1/sqrt(x) */
 #cmakedefine GMX_SOFTWARE_INVSQRT
 
+/* Use the PowerPC hardware 1/sqrt(x) */
+#cmakedefine GMX_POWERPC_INVSQRT
+
 /* Compile with dlopen */
 #cmakedefine GMX_DLOPEN
 
index 4cbe2e9bdf7bce0b70faf5b3fd03405a6ed1b492..21bdd4c4236583e43d039fbd1ca368b7b935bac4 100644 (file)
@@ -74,6 +74,10 @@ if(GMX_POWER6)
   file(GLOB FORTRAN_SOURCES nonbonded/nb_kernel_power6/*.[cF])
 endif(GMX_POWER6)
 
+if(GMX_BLUEGENE)
+  file(GLOB GMX_BLUEGENE_C_SRC nonbonded/nb_kernel_bluegene/*.c)
+endif(GMX_BLUEGENE)
+
 if(NOT GMX_EXTERNAL_BLAS)
   file(GLOB BLAS_SOURCES gmx_blas/*.c)
 endif(NOT GMX_EXTERNAL_BLAS)
@@ -130,13 +134,18 @@ else(GMX_ASM_USEASM-NASM)
 endif(GMX_ASM_USEASM-NASM)
 endif(NOT GMX_OPENMM)
 
-add_library(gmx ${GMXLIB_SOURCES} ${BLAS_SOURCES} ${LAPACK_SOURCES} ${GMX_SSEKERNEL_C_SRC} ${GMX_SSEKERNEL_ASM_SRC} ${FORTRAN_SOURCES} ${THREAD_MPI_SRC})
+add_library(gmx ${GMXLIB_SOURCES} ${BLAS_SOURCES} ${LAPACK_SOURCES} ${GMX_SSEKERNEL_C_SRC} ${GMX_SSEKERNEL_ASM_SRC} ${FORTRAN_SOURCES} ${GMX_BLUEGENE_C_SRC} ${THREAD_MPI_SRC})
 target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES}  ${THREAD_LIB})
-add_dependencies(gmx gmx_version) 
+if(USE_VERSION_H)
+       add_dependencies(gmx gmx_version) 
+endif()
 set_target_properties(gmx PROPERTIES OUTPUT_NAME "gmx${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}")
 
-install(TARGETS gmx DESTINATION ${LIB_INSTALL_DIR})
+install(TARGETS gmx DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgmx.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libgmx.pc @ONLY)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmx.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig RENAME "libgmx${GMX_LIBS_SUFFIX}.pc")
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmx.pc
+        DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+        RENAME "libgmx${GMX_LIBS_SUFFIX}.pc"
+        COMPONENT development)
 
index 982ca55b1de85d6187c860f15979de526952005c..e9c9790f8f05c1231d26bf32a5c59a1c6052547b 100644 (file)
@@ -145,7 +145,7 @@ static void ster_print(FILE *out,const char *s)
   int  slen;
   char buf[128];
   
-  sprintf(buf,":-)  %s  (-:",s);
+  snprintf(buf,128,":-)  %s  (-:",s);
   slen=strlen(buf);
   space(out,(80-slen)/2);
   fprintf(out,"%s\n",buf);
@@ -258,7 +258,7 @@ void CopyRight(FILE *out,const char *szProgram)
 
   fprintf(out,"\n");
 
-  sprintf(buf,"%s",Program());
+  snprintf(buf,256,"%s",Program());
 #ifdef GMX_DOUBLE
   strcat(buf," (double precision)");
 #endif
index fef5f69ee58682a8ea24242c672cbf7b509a5e89..4999c6a63209316737b68947013f1f755ebb2181 100644 (file)
@@ -207,7 +207,7 @@ static void enxsubblock_alloc(t_enxsubblock *sb)
             }
             break;
         default:
-            gmx_incons("Unknown block type");
+            gmx_incons("Unknown block type: this file is corrupted or from the future");
     }
 }
 
@@ -314,6 +314,19 @@ void add_subblocks_enxblock(t_enxblock *eb, int n)
     }
 }
 
+/* Report a problem detected while reading an energy (edr) file.
+ *
+ * If the environment variable GMX_ENX_NO_FATAL is set, msg is only
+ * reported through gmx_warning(); otherwise it is passed to gmx_fatal()
+ * together with a hint about that environment variable.
+ *
+ * msg: the text describing the problem; it is treated as plain data.
+ */
+static void enx_warning(const char *msg)
+{
+    if (getenv("GMX_ENX_NO_FATAL") != NULL)
+    {
+        /* Pass msg as an argument rather than as the format, so that a '%'
+         * in it is printed literally (assumes gmx_warning takes a
+         * printf-style format like gmx_fatal does -- confirm in gmx_fatal.h).
+         */
+        gmx_warning("%s",msg);
+    }
+    else
+    {
+        gmx_fatal(FARGS,"%s\n%s",
+                  msg,
+                  "If you want to use the correct frames before the corrupted frame and avoid this fatal error set the env.var. GMX_ENX_NO_FATAL");
+    }
+}
 
 static void edr_strings(XDR *xdr,gmx_bool bRead,int file_version,
                         int n,gmx_enxnm_t **nms)
@@ -429,7 +442,7 @@ static gmx_bool do_eheader(ener_file_t ef,int *file_version,t_enxframe *fr,
                        int nre_test,gmx_bool *bWrongPrecision,gmx_bool *bOK)
 {
     int  magic=-7777777;
-    real r;
+    real first_real_to_check;
     int  b,i,zero=0,dum=0;
     gmx_bool bRead = gmx_fio_getread(ef->fio);
     int  tempfix_nr=0;
@@ -457,16 +470,16 @@ static gmx_bool do_eheader(ener_file_t ef,int *file_version,t_enxframe *fr,
      * (which is the case for for instance the block sizes for variable
      * number of blocks, where this number is read before).
      */
-    r = -2e10;
-    if (!gmx_fio_do_real(ef->fio, r))
+    first_real_to_check = -2e10;
+    if (!gmx_fio_do_real(ef->fio, first_real_to_check))
     {
         return FALSE;
     }
-    if (r > -1e10)
+    if (first_real_to_check > -1e10)
     {
         /* Assume we are reading an old format */
         *file_version = 1;
-        fr->t = r;
+        fr->t = first_real_to_check;
         if (!gmx_fio_do_int(ef->fio, dum))   *bOK = FALSE;
         fr->step = dum;
     }
@@ -475,7 +488,9 @@ static gmx_bool do_eheader(ener_file_t ef,int *file_version,t_enxframe *fr,
         if (!gmx_fio_do_int(ef->fio, magic))       *bOK = FALSE;
         if (magic != -7777777)
         {
-            gmx_fatal(FARGS,"Energy header magic number mismatch, this is not a GROMACS edr file");
+            enx_warning("Energy header magic number mismatch, this is not a GROMACS edr file");
+            *bOK=FALSE;
+            return FALSE;
         }
         *file_version = enx_version;
         if (!gmx_fio_do_int(ef->fio, *file_version)) *bOK = FALSE;
@@ -522,11 +537,16 @@ static gmx_bool do_eheader(ener_file_t ef,int *file_version,t_enxframe *fr,
     }
 
     if (!gmx_fio_do_int(ef->fio, fr->nblock))  *bOK = FALSE;
+    if (fr->nblock < 0) *bOK=FALSE;
 
     if (ndisre!=0)
     {
         if (*file_version >= 4)
-            gmx_incons("Distance restraint blocks in old style in new style file");
+        {
+            enx_warning("Distance restraint blocks in old style in new style file");
+            *bOK=FALSE;
+            return FALSE;
+        }
         fr->nblock+=1;
     }
 
@@ -540,8 +560,20 @@ static gmx_bool do_eheader(ener_file_t ef,int *file_version,t_enxframe *fr,
         return *bOK;
     }
 
+    /* we now know what these should be, or we've already bailed out because
+       of wrong precision */
+    if ( *file_version==1 && (fr->t < 0 || fr->t > 1e20 || fr->step < 0 ) )
+    {
+        enx_warning("edr file with negative step number or unreasonable time (and without version number).");
+        *bOK=FALSE;
+        return FALSE;
+    }
+
+
     if (*bOK && bRead)
+    {
         add_blocks_enxframe(fr, fr->nblock);
+    }
 
     startb=0;
     if (ndisre>0)
@@ -572,7 +604,9 @@ static gmx_bool do_eheader(ener_file_t ef,int *file_version,t_enxframe *fr,
             else
             {
                 if (fr->block[b].nsub != 1)
+                {
                     gmx_incons("Writing an old version .edr file with too many subblocks");
+                }
                 if (fr->block[b].sub[0].type != dtreal)
                 {
                     gmx_incons("Writing an old version .edr file the wrong subblock type");
@@ -690,7 +724,7 @@ ener_file_t open_enx(const char *fn,const char *mode)
     gmx_enxnm_t *nms=NULL;
     int        file_version=-1;
     t_enxframe *fr;
-    gmx_bool       bWrongPrecision,bDum=TRUE;
+    gmx_bool       bWrongPrecision,bOK=TRUE;
     struct ener_file *ef;
 
     snew(ef,1);
@@ -701,8 +735,8 @@ ener_file_t open_enx(const char *fn,const char *mode)
         gmx_fio_setprecision(ef->fio,FALSE);
         do_enxnms(ef,&nre,&nms);
         snew(fr,1);
-        do_eheader(ef,&file_version,fr,nre,&bWrongPrecision,&bDum);
-        if(!bDum)
+        do_eheader(ef,&file_version,fr,nre,&bWrongPrecision,&bOK);
+        if(!bOK)
         {
             gmx_file("Cannot read energy file header. Corrupt file?");
         }
@@ -721,8 +755,8 @@ ener_file_t open_enx(const char *fn,const char *mode)
             gmx_fio_checktype(ef->fio);
             gmx_fio_setprecision(ef->fio,TRUE);
             do_enxnms(ef,&nre,&nms);
-            do_eheader(ef,&file_version,fr,nre,&bWrongPrecision,&bDum);
-            if(!bDum)
+            do_eheader(ef,&file_version,fr,nre,&bWrongPrecision,&bOK);
+            if(!bOK)
             {
                 gmx_file("Cannot write energy file header; maybe you are out of quota?");
             }
@@ -973,7 +1007,7 @@ gmx_bool do_enx(ener_file_t ef,t_enxframe *fr)
                     bOK1=gmx_fio_ndo_string(ef->fio, sub->sval, sub->nr);
                     break;
                 default:
-                    gmx_incons("Reading unknown block type");
+                    gmx_incons("Reading unknown block data type: this file is corrupted or from the future");
             }
             bOK = bOK && bOK1;
         }
index 832d6e0d2f4648e42b05adde472c07e21da787a7..e83788d870ac83321ee78ea4fc77a642e185470f 100644 (file)
@@ -184,12 +184,7 @@ const char *gmx_fio_dbgstr(t_fileio *fio, const char *desc, char *buf)
     }
     else
     {
-#if (defined( _WIN32 ) || defined( _WIN64 ) )
-        /* windows doesn't do standard C */
-#define snprintf sprintf_s
-#endif
-        snprintf(buf, GMX_FIO_BUFLEN, "  ; %s %s", 
-                 fio->comment ? fio->comment : "", desc);
+        snprintf(buf, GMX_FIO_BUFLEN, "  ; %s %s", fio->comment ? fio->comment : "", desc);
     }
     return buf;
 }
index 9d37b2465b7ad42b360777365b2bef35b3b5d555..68b4fb39ddc4dc016aaa96efff2d5c9b1c5a5489 100644 (file)
 
 */
 
-/* The optimized version of converts2ints is disabled
+/* The optimized version of convert2ints is disabled on BG/P
  * because of issues on BG/P reported in bugzilla 429
  */
-/* #if (defined __IBMC__ || defined __IBMCPP__) */
-#if (0)
+#if defined __blrts__
 
 #define convert2ints(x,xi,conv,i1,i2)                      \
     xi      = __fpctiwz(x);                                \
index ec24e41fc0c8c41bd97230570f0285649b1b5cca..1aa96e4fbbac954325cbf8c522c6237033c82b55 100644 (file)
@@ -61,11 +61,11 @@ void nb_kernel400_ia32_sse2(int *           p_nri,
                               int *           inneriter,
                               double *         work)
 {
-    int           nri,nthreads;
-    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
-    double        shX,shY,shZ;
-    int           jnrA,jnrB;
-    int           j3A,j3B;
+  int           nri,nthreads;
+  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+  double        shX,shY,shZ;
+  int           jnrA,jnrB;
+  int           j3A,j3B;
        gmx_gbdata_t *gbdata;
        double *      gpol;
     
@@ -93,35 +93,35 @@ void nb_kernel400_ia32_sse2(int *           p_nri,
     
        nri        = *p_nri;
     
-    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
-    gbtabscale = _mm_load1_pd(p_gbtabscale);  
-    facel      = _mm_load1_pd(p_facel);
-    
-    nj1         = 0;
-    jnrA = jnrB = 0;
-    j3A = j3B   = 0;
-    jx          = _mm_setzero_pd();
-    jy          = _mm_setzero_pd();
-    jz          = _mm_setzero_pd();
+  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+  gbtabscale = _mm_load1_pd(p_gbtabscale);  
+  facel      = _mm_load1_pd(p_facel);
+  
+  nj1         = 0;
+  jnrA = jnrB = 0;
+  j3A = j3B   = 0;
+  jx          = _mm_setzero_pd();
+  jy          = _mm_setzero_pd();
+  jz          = _mm_setzero_pd();
        
        for(n=0;n<nri;n++)
        {
-        is3              = 3*shift[n];     
-        shX              = shiftvec[is3];  
-        shY              = shiftvec[is3+1];
-        shZ              = shiftvec[is3+2];
-        nj0              = jindex[n];      
-        nj1              = jindex[n+1];    
-        ii               = iinr[n];        
-        ii3              = 3*ii;           
+    is3              = 3*shift[n];     
+    shX              = shiftvec[is3];  
+    shY              = shiftvec[is3+1];
+    shZ              = shiftvec[is3+2];
+    nj0              = jindex[n];      
+    nj1              = jindex[n+1];    
+    ii               = iinr[n];        
+    ii3              = 3*ii;           
                
                ix               = _mm_set1_pd(shX+pos[ii3+0]);
                iy               = _mm_set1_pd(shY+pos[ii3+1]);
                iz               = _mm_set1_pd(shZ+pos[ii3+2]);
-        
+    
                iq               = _mm_load1_pd(charge+ii);
                iq               = _mm_mul_pd(iq,facel);
-        
+    
                isai             = _mm_load1_pd(invsqrta+ii);
                        
                vctot            = _mm_setzero_pd();
@@ -138,18 +138,18 @@ void nb_kernel400_ia32_sse2(int *           p_nri,
                        
                        j3A     = jnrA * 3;
                        j3B     = jnrB * 3;
-            
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+      
+      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
             
                        dx           = _mm_sub_pd(ix,jx);
                        dy           = _mm_sub_pd(iy,jy);
                        dz           = _mm_sub_pd(iz,jz);
             
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_pd(rinv,rinv);
-            
+      
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
@@ -160,7 +160,7 @@ void nb_kernel400_ia32_sse2(int *           p_nri,
                        qq           = _mm_mul_pd(iq,jq);            
                        vcoul        = _mm_mul_pd(qq,rinv);
                        fscal        = _mm_mul_pd(vcoul,rinv);                                 
-            vctot        = _mm_add_pd(vctot,vcoul);
+      vctot        = _mm_add_pd(vctot,vcoul);
             
             /* Polarization interaction */
                        qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
@@ -174,48 +174,48 @@ void nb_kernel400_ia32_sse2(int *           p_nri,
                        eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
-            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
-            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
-            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
-            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
-            vgb     = _mm_mul_pd(Y, qq);           
-            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-
-            vgbtot  = _mm_add_pd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
-                                   
-            fscal        = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
-            
-            /***********************************/
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
+      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+      vgb     = _mm_mul_pd(Y, qq);           
+      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb);
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+      
+      fscal        = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
+      
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_pd(fscal,dx);
-            ty           = _mm_mul_pd(fscal,dy);
-            tz           = _mm_mul_pd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_pd(fix,tx);
-            fiy          = _mm_add_pd(fiy,ty);
-            fiz          = _mm_add_pd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_pd(fscal,dx);
+      ty           = _mm_mul_pd(fscal,dy);
+      tz           = _mm_mul_pd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_pd(fix,tx);
+      fiy          = _mm_add_pd(fiy,ty);
+      fiz          = _mm_add_pd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
                
@@ -224,96 +224,109 @@ void nb_kernel400_ia32_sse2(int *           p_nri,
                {
                        jnrA    = jjnr[k];
                        j3A     = jnrA * 3;
-            
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-            
+      
+      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+      
                        dx           = _mm_sub_sd(ix,jx);
                        dy           = _mm_sub_sd(iy,jy);
                        dz           = _mm_sub_sd(iz,jz);
-            
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_sd(rinv,rinv);
-            
+      
+      /* The reason for zeroing these variables here is to fix bug 585.
+       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+       * and r1=0, but it should be r1=a[1]. 
+       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+       * To work around it, we zero these variables and use _mm_add_pd (**) instead
+       * Note that the only variables that get affected are the energies since
+       * the total sum needs to be correct 
+       */
+      vgb          = _mm_setzero_pd();
+      vcoul        = _mm_setzero_pd();
+      dvdatmp      = _mm_setzero_pd();
+      
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
                        GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
                        GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-                               
+      
                        isaprod      = _mm_mul_sd(isai,isaj);
-                       qq           = _mm_mul_sd(iq,jq);            
+                       qq           = _mm_mul_sd(jq,iq);            
                        vcoul        = _mm_mul_sd(qq,rinv);
                        fscal        = _mm_mul_sd(vcoul,rinv);                                 
-            vctot        = _mm_add_sd(vctot,vcoul);
-            
-            /* Polarization interaction */
+      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
+      
+      /* Polarization interaction */
                        qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
                        gbscale      = _mm_mul_sd(isaprod,gbtabscale);
-            
+      
                        /* Calculate GB table index */
                        r            = _mm_mul_sd(rsq,rinv);
                        rtab         = _mm_mul_sd(r,gbscale);
-
+      
                        n0                   = _mm_cvttpd_epi32(rtab);
                        eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
-            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
-            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
-            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
-            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
-            vgb     = _mm_mul_sd(Y, qq);           
-            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-
-            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_sd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
+      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+      vgb     = _mm_mul_sd(Y, qq);           
+      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
                        
-            fscal        = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
-            
-            /***********************************/
+      fscal        = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
+      
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_sd(fscal,dx);
-            ty           = _mm_mul_sd(fscal,dy);
-            tz           = _mm_mul_sd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_sd(fix,tx);
-            fiy          = _mm_add_sd(fiy,ty);
-            fiz          = _mm_add_sd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_sd(fscal,dx);
+      ty           = _mm_mul_sd(fscal,dy);
+      tz           = _mm_mul_sd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_sd(fix,tx);
+      fiy          = _mm_add_sd(fiy,ty);
+      fiz          = _mm_add_sd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
-        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-        
-        ggid     = gid[n];         
-        
-        gmx_mm_update_1pot_pd(vctot,vc+ggid);
-        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
-       }
+    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+    
+    ggid     = gid[n];         
     
+    gmx_mm_update_1pot_pd(vctot,vc+ggid);
+    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+  }
+  
        *outeriter   = nri;            
-    *inneriter   = nj1;        
+  *inneriter   = nj1;  
 }
index 759916a8e620d9905766fe3913b5d11baf484a6d..17ea0c5b3ececa81991b1208a47643dc5a0b7f50 100644 (file)
@@ -62,12 +62,12 @@ void nb_kernel410_ia32_sse2(int *           p_nri,
                                                        int *           inneriter,
                                                        double *         work)
 {
-    int           nri,ntype,nthreads;
-    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
-    double        shX,shY,shZ;
+  int           nri,ntype,nthreads;
+  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+  double        shX,shY,shZ;
        int                       offset,nti;
-    int           jnrA,jnrB;
-    int           j3A,j3B;
+  int           jnrA,jnrB;
+  int           j3A,j3B;
        int           tjA,tjB;
        gmx_gbdata_t *gbdata;
        double *      gpol;
@@ -100,37 +100,37 @@ void nb_kernel410_ia32_sse2(int *           p_nri,
        nri        = *p_nri;
        ntype      = *p_ntype;
     
-    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
-    gbtabscale = _mm_load1_pd(p_gbtabscale);  
-    facel      = _mm_load1_pd(p_facel);
-
-    nj1         = 0;
-    jnrA = jnrB = 0;
-    j3A = j3B   = 0;
-    jx          = _mm_setzero_pd();
-    jy          = _mm_setzero_pd();
-    jz          = _mm_setzero_pd();
-    c6          = _mm_setzero_pd();
-    c12         = _mm_setzero_pd();
+  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+  gbtabscale = _mm_load1_pd(p_gbtabscale);  
+  facel      = _mm_load1_pd(p_facel);
+  
+  nj1         = 0;
+  jnrA = jnrB = 0;
+  j3A = j3B   = 0;
+  jx          = _mm_setzero_pd();
+  jy          = _mm_setzero_pd();
+  jz          = _mm_setzero_pd();
+  c6          = _mm_setzero_pd();
+  c12         = _mm_setzero_pd();
        
        for(n=0;n<nri;n++)
        {
-        is3              = 3*shift[n];     
-        shX              = shiftvec[is3];  
-        shY              = shiftvec[is3+1];
-        shZ              = shiftvec[is3+2];
-        nj0              = jindex[n];      
-        nj1              = jindex[n+1];    
-        ii               = iinr[n];        
-        ii3              = 3*ii;           
+    is3              = 3*shift[n];     
+    shX              = shiftvec[is3];  
+    shY              = shiftvec[is3+1];
+    shZ              = shiftvec[is3+2];
+    nj0              = jindex[n];      
+    nj1              = jindex[n+1];    
+    ii               = iinr[n];        
+    ii3              = 3*ii;           
                
                ix               = _mm_set1_pd(shX+pos[ii3+0]);
                iy               = _mm_set1_pd(shY+pos[ii3+1]);
                iz               = _mm_set1_pd(shZ+pos[ii3+2]);
-        
+    
                iq               = _mm_load1_pd(charge+ii);
                iq               = _mm_mul_pd(iq,facel);
-        
+    
                isai             = _mm_load1_pd(invsqrta+ii);
         
                nti              = 2*ntype*type[ii];
@@ -151,39 +151,39 @@ void nb_kernel410_ia32_sse2(int *           p_nri,
                        j3A     = jnrA * 3;
                        j3B     = jnrB * 3;
 
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
 
                        dx           = _mm_sub_pd(ix,jx);
                        dy           = _mm_sub_pd(iy,jy);
                        dz           = _mm_sub_pd(iz,jz);
 
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_pd(rinv,rinv);
-            
+      
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
                        GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
                        GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
             
-            /* Lennard-Jones */
-            tjA          = nti+2*type[jnrA];
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
                        tjB          = nti+2*type[jnrB];
-            
-            GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+      
+      GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
                        
                        isaprod      = _mm_mul_pd(isai,isaj);
                        qq           = _mm_mul_pd(iq,jq);            
                        vcoul        = _mm_mul_pd(qq,rinv);
                        fscal        = _mm_mul_pd(vcoul,rinv);                                 
-            vctot        = _mm_add_pd(vctot,vcoul);
+      vctot        = _mm_add_pd(vctot,vcoul);
             
-            /* Polarization interaction */
+      /* Polarization interaction */
                        qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
                        gbscale      = _mm_mul_pd(isaprod,gbtabscale);
-            
+      
                        /* Calculate GB table index */
                        r            = _mm_mul_pd(rsq,rinv);
                        rtab         = _mm_mul_pd(r,gbscale);
@@ -192,30 +192,30 @@ void nb_kernel410_ia32_sse2(int *           p_nri,
                        eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
-            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
-            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
-            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
-            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
-            vgb     = _mm_mul_pd(Y, qq);           
-            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_pd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
+      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+      vgb     = _mm_mul_pd(Y, qq);           
+      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb);
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
                        
                        rinvsix      = _mm_mul_pd(rinvsq,rinvsq);
                        rinvsix      = _mm_mul_pd(rinvsix,rinvsq);
@@ -224,26 +224,26 @@ void nb_kernel410_ia32_sse2(int *           p_nri,
                        vvdw12       = _mm_mul_pd(c12, _mm_mul_pd(rinvsix,rinvsix));
                        vvdwtot      = _mm_add_pd(vvdwtot,_mm_sub_pd(vvdw12,vvdw6));
             
-            fscal        = _mm_sub_pd(_mm_mul_pd(rinvsq, 
-                                                 _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
-                                                            _mm_mul_pd(six,vvdw6))),
-                                      _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
-                        
-            /***********************************/
+      fscal        = _mm_sub_pd(_mm_mul_pd(rinvsq, 
+                                           _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
+                                                      _mm_mul_pd(six,vvdw6))),
+                                _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
+      
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_pd(fscal,dx);
-            ty           = _mm_mul_pd(fscal,dy);
-            tz           = _mm_mul_pd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_pd(fix,tx);
-            fiy          = _mm_add_pd(fiy,ty);
-            fiz          = _mm_add_pd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_pd(fscal,dx);
+      ty           = _mm_mul_pd(fscal,dy);
+      tz           = _mm_mul_pd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_pd(fix,tx);
+      fiy          = _mm_add_pd(fiy,ty);
+      fiz          = _mm_add_pd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
                
@@ -253,39 +253,53 @@ void nb_kernel410_ia32_sse2(int *           p_nri,
                        jnrA    = jjnr[k];
                        
                        j3A     = jnrA * 3;
-            
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+      
+      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
             
                        dx           = _mm_sub_sd(ix,jx);
                        dy           = _mm_sub_sd(iy,jy);
                        dz           = _mm_sub_sd(iz,jz);
             
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_sd(rinv,rinv);
-            
+      
+      /* The reason for zeroing these variables here is to fix bug 585.
+       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+       * and r1=0, but it should be r1=a[1]. 
+       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+       * To work around it, we zero these variables and use _mm_add_pd (**) instead.
+       * Note that the only variables that get affected are the energies, since
+       * the total sum needs to be correct.
+       */
+      vgb          = _mm_setzero_pd();
+      vcoul        = _mm_setzero_pd();
+      dvdatmp      = _mm_setzero_pd();
+      vvdw6        = _mm_setzero_pd();
+      vvdw12       = _mm_setzero_pd();
+      
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
                        GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
                        GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-            
-            /* Lennard-Jones */
-            tjA          = nti+2*type[jnrA];
-            
-            GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+      
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
+      
+      GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
                        
                        isaprod      = _mm_mul_sd(isai,isaj);
-                       qq           = _mm_mul_sd(iq,jq);            
+                       qq           = _mm_mul_sd(jq,iq);            
                        vcoul        = _mm_mul_sd(qq,rinv);
                        fscal        = _mm_mul_sd(vcoul,rinv);                                 
-            vctot        = _mm_add_sd(vctot,vcoul);
-            
-            /* Polarization interaction */
+      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
+      
+      /* Polarization interaction */
                        qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
                        gbscale      = _mm_mul_sd(isaprod,gbtabscale);
-            
+      
                        /* Calculate GB table index */
                        r            = _mm_mul_sd(rsq,rinv);
                        rtab         = _mm_mul_sd(r,gbscale);
@@ -294,70 +308,73 @@ void nb_kernel410_ia32_sse2(int *           p_nri,
                        eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
-            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
-            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
-            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
-            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
-            vgb     = _mm_mul_sd(Y, qq);           
-            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_sd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
+      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+      vgb     = _mm_mul_sd(Y, qq);           
+      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
                        
                        rinvsix      = _mm_mul_sd(rinvsq,rinvsq);
                        rinvsix      = _mm_mul_sd(rinvsix,rinvsq);
                        
                        vvdw6        = _mm_mul_sd(c6,rinvsix);
                        vvdw12       = _mm_mul_sd(c12, _mm_mul_sd(rinvsix,rinvsix));
-                       vvdwtot      = _mm_add_sd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6));
-        
-            fscal        = _mm_sub_sd(_mm_mul_sd(rinvsq, 
-                                                 _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
-                                                            _mm_mul_sd(six,vvdw6))),
-                                      _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
-            
-            /***********************************/
+                       vvdwtot      = _mm_add_pd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6)); /* (**) */
+      
+      fscal        = _mm_sub_sd(_mm_mul_sd(rinvsq, 
+                                           _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
+                                                      _mm_mul_sd(six,vvdw6))),
+                                _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
+      
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_sd(fscal,dx);
-            ty           = _mm_mul_sd(fscal,dy);
-            tz           = _mm_mul_sd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_sd(fix,tx);
-            fiy          = _mm_add_sd(fiy,ty);
-            fiz          = _mm_add_sd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_sd(fscal,dx);
+      ty           = _mm_mul_sd(fscal,dy);
+      tz           = _mm_mul_sd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_sd(fix,tx);
+      fiy          = _mm_add_sd(fiy,ty);
+      fiz          = _mm_add_sd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
-        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-
-        ggid     = gid[n];         
-        
-        gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
-        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
+    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+    
+    ggid     = gid[n];         
+    
+    gmx_mm_update_1pot_pd(vctot,vc+ggid);
+    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+    gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
+    
        }
-
+  
        *outeriter   = nri;            
-    *inneriter   = nj1;        
+  *inneriter   = nj1;  
 }
index 94f1957748b208cf39509fdff7acd31013893426..d0ed80c135b7305e8a1ec88301f677a5d4d0e0a8 100644 (file)
@@ -60,12 +60,12 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                               int *           inneriter,
                               double *         work)
 {
-    int           nri,ntype,nthreads;
-    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
-    double        shX,shY,shZ;
+  int           nri,ntype,nthreads;
+  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+  double        shX,shY,shZ;
        int                       offset,nti;
-    int           jnrA,jnrB;
-    int           j3A,j3B;
+  int           jnrA,jnrB;
+  int           j3A,j3B;
        int           tjA,tjB;
        gmx_gbdata_t *gbdata;
        double *      gpol;
@@ -80,12 +80,12 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
        __m128d  vcoul,fscal,gbscale,c6,c12;
        __m128d  rinvsq,r,rtab;
        __m128d  eps,Y,F,G,H;
-    __m128d  VV,FF,Fp;
+  __m128d  VV,FF,Fp;
        __m128d  vgb,fijGB,dvdatmp;
        __m128d  rinvsix,vvdw6,vvdw12,vvdwtmp;
        __m128d  facel,gbtabscale,dvdaj;
-    __m128d  fijD,fijR;
-    __m128d  xmm1,tabscale,eps2;
+  __m128d  fijD,fijR;
+  __m128d  xmm1,tabscale,eps2;
        __m128i  n0, nnn;
     
        
@@ -100,40 +100,40 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
        nri        = *p_nri;
        ntype      = *p_ntype;
     
-    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
-    gbtabscale = _mm_load1_pd(p_gbtabscale);  
-    facel      = _mm_load1_pd(p_facel);
-    tabscale   = _mm_load1_pd(p_tabscale);
-    
-    nj1         = 0;
-    jnrA = jnrB = 0;
-    j3A = j3B   = 0;
-    jx          = _mm_setzero_pd();
-    jy          = _mm_setzero_pd();
-    jz          = _mm_setzero_pd();
-    c6          = _mm_setzero_pd();
-    c12         = _mm_setzero_pd();
+  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+  gbtabscale = _mm_load1_pd(p_gbtabscale);  
+  facel      = _mm_load1_pd(p_facel);
+  tabscale   = _mm_load1_pd(p_tabscale);
+  
+  nj1         = 0;
+  jnrA = jnrB = 0;
+  j3A = j3B   = 0;
+  jx          = _mm_setzero_pd();
+  jy          = _mm_setzero_pd();
+  jz          = _mm_setzero_pd();
+  c6          = _mm_setzero_pd();
+  c12         = _mm_setzero_pd();
        
        for(n=0;n<nri;n++)
        {
-        is3              = 3*shift[n];     
-        shX              = shiftvec[is3];  
-        shY              = shiftvec[is3+1];
-        shZ              = shiftvec[is3+2];
-        nj0              = jindex[n];      
-        nj1              = jindex[n+1];    
-        ii               = iinr[n];        
-        ii3              = 3*ii;           
+    is3              = 3*shift[n];     
+    shX              = shiftvec[is3];  
+    shY              = shiftvec[is3+1];
+    shZ              = shiftvec[is3+2];
+    nj0              = jindex[n];      
+    nj1              = jindex[n+1];    
+    ii               = iinr[n];        
+    ii3              = 3*ii;           
                
                ix               = _mm_set1_pd(shX+pos[ii3+0]);
                iy               = _mm_set1_pd(shY+pos[ii3+1]);
                iz               = _mm_set1_pd(shZ+pos[ii3+2]);
-        
+    
                iq               = _mm_load1_pd(charge+ii);
                iq               = _mm_mul_pd(iq,facel);
-        
+    
                isai             = _mm_load1_pd(invsqrta+ii);
-        
+    
                nti              = 2*ntype*type[ii];
                
                vctot            = _mm_setzero_pd();
@@ -152,39 +152,39 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        j3A     = jnrA * 3;
                        j3B     = jnrB * 3;
             
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
             
                        dx           = _mm_sub_pd(ix,jx);
                        dy           = _mm_sub_pd(iy,jy);
                        dz           = _mm_sub_pd(iz,jz);
             
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_pd(rinv,rinv);
-            
+      
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
                        GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
                        GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
             
-            /* Lennard-Jones */
-            tjA          = nti+2*type[jnrA];
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
                        tjB          = nti+2*type[jnrB];
-            
-            GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+      
+      GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
                        
                        isaprod      = _mm_mul_pd(isai,isaj);
                        qq           = _mm_mul_pd(iq,jq);            
                        vcoul        = _mm_mul_pd(qq,rinv);
                        fscal        = _mm_mul_pd(vcoul,rinv);                                 
-            vctot        = _mm_add_pd(vctot,vcoul);
-            
-            /* Polarization interaction */
+      vctot        = _mm_add_pd(vctot,vcoul);
+      
+      /* Polarization interaction */
                        qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
                        gbscale      = _mm_mul_pd(isaprod,gbtabscale);
-            
+      
                        /* Calculate GB table index */
                        r            = _mm_mul_pd(rsq,rinv);
                        rtab         = _mm_mul_pd(r,gbscale);
@@ -193,47 +193,47 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
-            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
-            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
-            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
-            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
-            vgb     = _mm_mul_pd(Y, qq);           
-            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_pd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
+      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+      vgb     = _mm_mul_pd(Y, qq);           
+      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb);
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
                        
-            /* Calculate VDW table index */
+      /* Calculate VDW table index */
                        rtab    = _mm_mul_pd(r,tabscale);
                        n0      = _mm_cvttpd_epi32(rtab);
                        eps     = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
                        eps2    = _mm_mul_pd(eps,eps);
                        nnn     = _mm_slli_epi32(n0,3);
                        
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
+      /* Dispersion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
                        H       = _mm_mul_pd(H,eps2);
                        Fp      = _mm_add_pd(F,G);
                        Fp      = _mm_add_pd(Fp,H);
@@ -245,16 +245,16 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        
                        vvdw6   = _mm_mul_pd(c6,VV);
                        fijD    = _mm_mul_pd(c6,FF);
-            
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
-            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
-            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
+      
+      /* Repulsion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
+      F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
+      H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
                        H       = _mm_mul_pd(H,eps2);
                        Fp      = _mm_add_pd(F,G);
                        Fp      = _mm_add_pd(Fp,H);
@@ -269,29 +269,29 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        
                        vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
                        vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
-            
+      
                        xmm1    = _mm_add_pd(fijD,fijR);
                        xmm1    = _mm_mul_pd(xmm1,tabscale);
                        xmm1    = _mm_add_pd(xmm1,fijGB);
                        xmm1    = _mm_sub_pd(xmm1,fscal);
                        fscal   = _mm_mul_pd(xmm1,neg);
                        fscal   = _mm_mul_pd(fscal,rinv);
-            
-            /***********************************/
+      
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_pd(fscal,dx);
-            ty           = _mm_mul_pd(fscal,dy);
-            tz           = _mm_mul_pd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_pd(fix,tx);
-            fiy          = _mm_add_pd(fiy,ty);
-            fiz          = _mm_add_pd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_pd(fscal,dx);
+      ty           = _mm_mul_pd(fscal,dy);
+      tz           = _mm_mul_pd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_pd(fix,tx);
+      fiy          = _mm_add_pd(fiy,ty);
+      fiz          = _mm_add_pd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
                
@@ -300,39 +300,53 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                {
                        jnrA    = jjnr[k];
                        j3A     = jnrA * 3;
-            
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-            
+      
+      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+      
                        dx           = _mm_sub_sd(ix,jx);
                        dy           = _mm_sub_sd(iy,jy);
                        dz           = _mm_sub_sd(iz,jz);
             
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_sd(rinv,rinv);
-            
-                       /***********************************/
+      
+      /* The reason for zeroing these variables here is to fix bug 585.
+       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+       * and r1=0, but it should be r1=a[1]. 
+       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+       * To work around it, we zero these variables and use _mm_add_pd (**) instead.
+       * Note that the only variables that get affected are the energies, since
+       * the total sum needs to be correct.
+       */
+      vgb          = _mm_setzero_pd();
+      vcoul        = _mm_setzero_pd();
+      dvdatmp      = _mm_setzero_pd();
+      vvdw6        = _mm_setzero_pd();
+      vvdw12       = _mm_setzero_pd();
+
+      /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
                        GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
                        GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
             
-            /* Lennard-Jones */
-            tjA          = nti+2*type[jnrA];
-            
-            GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
+      
+      GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
                        
                        isaprod      = _mm_mul_sd(isai,isaj);
-                       qq           = _mm_mul_sd(iq,jq);            
+                       qq           = _mm_mul_sd(jq,iq);            
                        vcoul        = _mm_mul_sd(qq,rinv);
                        fscal        = _mm_mul_sd(vcoul,rinv);                                 
-            vctot        = _mm_add_sd(vctot,vcoul);
-            
-            /* Polarization interaction */
+      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
+      
+      /* Polarization interaction */
                        qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
                        gbscale      = _mm_mul_sd(isaprod,gbtabscale);
-            
+      
                        /* Calculate GB table index */
                        r            = _mm_mul_sd(rsq,rinv);
                        rtab         = _mm_mul_sd(r,gbscale);
@@ -341,47 +355,47 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
-            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
-            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
-            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
-            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
-            vgb     = _mm_mul_sd(Y, qq);           
-            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_sd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
+      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+      vgb     = _mm_mul_sd(Y, qq);           
+      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
                        
-            /* Calculate VDW table index */
+      /* Calculate VDW table index */
                        rtab    = _mm_mul_sd(r,tabscale);
                        n0      = _mm_cvttpd_epi32(rtab);
                        eps     = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
                        eps2    = _mm_mul_sd(eps,eps);
                        nnn     = _mm_slli_epi32(n0,3);
                        
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
+      /* Dispersion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
                        H       = _mm_mul_sd(H,eps2);
                        Fp      = _mm_add_sd(F,G);
                        Fp      = _mm_add_sd(Fp,H);
@@ -393,16 +407,16 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        
                        vvdw6   = _mm_mul_sd(c6,VV);
                        fijD    = _mm_mul_sd(c6,FF);
-            
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
+      
+      /* Repulsion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
                        H       = _mm_mul_sd(H,eps2);
                        Fp      = _mm_add_sd(F,G);
                        Fp      = _mm_add_sd(Fp,H);
@@ -416,7 +430,7 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        fijR    = _mm_mul_sd(c12,FF);
                        
                        vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
-                       vvdwtot = _mm_add_sd(vvdwtot,vvdwtmp);
+                       vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); /* (**) */
             
                        xmm1    = _mm_add_sd(fijD,fijR);
                        xmm1    = _mm_mul_sd(xmm1,tabscale);
@@ -425,34 +439,37 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        fscal   = _mm_mul_sd(xmm1,neg);
                        fscal   = _mm_mul_sd(fscal,rinv);
 
-            /***********************************/
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_sd(fscal,dx);
-            ty           = _mm_mul_sd(fscal,dy);
-            tz           = _mm_mul_sd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_sd(fix,tx);
-            fiy          = _mm_add_sd(fiy,ty);
-            fiz          = _mm_add_sd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_sd(fscal,dx);
+      ty           = _mm_mul_sd(fscal,dy);
+      tz           = _mm_mul_sd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_sd(fix,tx);
+      fiy          = _mm_add_sd(fiy,ty);
+      fiz          = _mm_add_sd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
-        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-        
-        ggid     = gid[n];         
-        
-        gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
-        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
-       }
+    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
     
+    ggid     = gid[n];         
+    
+    gmx_mm_update_1pot_pd(vctot,vc+ggid);
+    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+    gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
+    
+       }
+  
        *outeriter   = nri;            
-    *inneriter   = nj1;        
+  *inneriter   = nj1;  
 }
 
index 88ae4d32362d8b641b531f80e1fb0c5426ca1fa9..a84a67946b5e96c23a27f2226d0692ed581da88f 100644 (file)
@@ -715,7 +715,7 @@ nb_kernel_allvsallgb_sse2_double(t_forcerec *           fr,
         pmask1           = prologue_mask[i+1];
         emask0           = epilogue_mask[i];
         emask1           = epilogue_mask[i+1];
-        imask_SSE0        = _mm_load1_pd((double *)(imask+i));
+        imask_SSE0        = _mm_load1_pd((double *)(imask+2*i));
         imask_SSE1        = _mm_load1_pd((double *)(imask+2*i+2));
         
          for(j=nj0; j<nj1; j+=UNROLLJ)
index e6c64c70731d6934fc0d35b28f56fa0ebaec03c8..760860bde5ca75502dbe02466c2b1f082d8e27af 100644 (file)
@@ -18,7 +18,6 @@
 #include<math.h>
 #include<vec.h>
 
-
 #include <xmmintrin.h>
 #include <emmintrin.h>
 
 #include "../nb_kerneltype.h"
 
 
-
 void nb_kernel400_sse2_double(int *           p_nri,
-                                                       int *           iinr,
-                                                       int *           jindex,
-                                                       int *           jjnr,
-                                                       int *           shift,
-                                                       double *         shiftvec,
-                                                       double *         fshift,
-                                                       int *           gid,
-                                                       double *         pos,
-                                                       double *         faction,
-                                                       double *         charge,
-                                                       double *         p_facel,
-                                                       double *         p_krf,
-                                                       double *         p_crf,
-                                                       double *         Vc,
-                                                       int *           type,
-                                                       int *           p_ntype,
-                                                       double *         vdwparam,
-                                                       double *         Vvdw,
-                                                       double *         p_tabscale,
-                                                       double *         VFtab,
-                                                       double *         invsqrta,
-                                                       double *         dvda,
-                                                       double *         p_gbtabscale,
-                                                       double *         GBtab,
-                                                       int *           p_nthreads,
-                                                       int *           count,
-                                                       void *          mtx,
-                                                       int *           outeriter,
-                                                       int *           inneriter,
-                                                       double *         work)
+                              int *           iinr,
+                              int *           jindex,
+                              int *           jjnr,
+                              int *           shift,
+                              double *         shiftvec,
+                              double *         fshift,
+                              int *           gid,
+                              double *         pos,
+                              double *         faction,
+                              double *         charge,
+                              double *         p_facel,
+                              double *         p_krf,
+                              double *         p_crf,
+                              double *         vc,
+                              int *           type,
+                              int *           p_ntype,
+                              double *         vdwparam,
+                              double *         vvdw,
+                              double *         p_tabscale,
+                              double *         VFtab,
+                              double *         invsqrta,
+                              double *         dvda,
+                              double *         p_gbtabscale,
+                              double *         GBtab,
+                              int *           p_nthreads,
+                              int *           count,
+                              void *          mtx,
+                              int *           outeriter,
+                              int *           inneriter,
+                              double *         work)
 {
-       int           nri,ntype,nthreads,offset;
-       int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
-       double        facel,krf,crf,tabscl,gbtabscl,vct,vgbt;
-       double        shX,shY,shZ,isai_d,dva;
+  int           nri,nthreads;
+  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+  double        shX,shY,shZ;
+  int           jnrA,jnrB;
+  int           j3A,j3B;
        gmx_gbdata_t *gbdata;
-       float *        gpol;
-       
-       __m128d       ix,iy,iz,jx,jy,jz;
-       __m128d           dx,dy,dz,t1,t2,t3;
-       __m128d           fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
-       __m128d           q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
-       __m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
-       __m128d           xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
-       __m128d       fac,tabscale,gbtabscale;
-       __m128i       n0,nnn;
+       double *      gpol;
+    
+       __m128d  iq,qq,jq,isai;
+       __m128d  ix,iy,iz;
+       __m128d  jx,jy,jz;
+       __m128d  dx,dy,dz;
+       __m128d  vctot,vgbtot,dvdasum,gbfactor;
+       __m128d  fix,fiy,fiz,tx,ty,tz,rsq;
+       __m128d  rinv,isaj,isaprod;
+       __m128d  vcoul,fscal,gbscale;
+       __m128d  rinvsq,r,rtab;
+       __m128d  eps,Y,F,G,H;
+       __m128d  vgb,fijGB,dvdatmp;
+       __m128d  facel,gbtabscale,dvdaj;
+       __m128i  n0, nnn;
        
-       const __m128d neg    = {-1.0,-1.0};
-       const __m128d zero   = {0.0,0.0};
-       const __m128d half   = {0.5,0.5};
-       const __m128d two    = {2.0,2.0};
-       const __m128d three  = {3.0,3.0};
+       const __m128d neg        = _mm_set1_pd(-1.0);
+       const __m128d zero       = _mm_set1_pd(0.0);
+       const __m128d minushalf  = _mm_set1_pd(-0.5);
+       const __m128d two        = _mm_set1_pd(2.0);
        
        gbdata     = (gmx_gbdata_t *)work;
        gpol       = gbdata->gpol;
-       
+    
        nri        = *p_nri;
-       ntype      = *p_ntype;
-       nthreads   = *p_nthreads; 
-    facel      = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));       
-       krf        = *p_krf;
-       crf        = *p_crf;
-       tabscl     = *p_tabscale;
-       gbtabscl   = *p_gbtabscale;
-       nj1        = 0;
-       
-       /* Splat variables */
-       fac        = _mm_load1_pd(&facel);
-       tabscale   = _mm_load1_pd(&tabscl);
-       gbtabscale = _mm_load1_pd(&gbtabscl);
-       
-       /* Keep compiler happy */
-       dvdatmp = _mm_setzero_pd();
-       vgb     = _mm_setzero_pd();
-       dvdaj   = _mm_setzero_pd();
-       isaj    = _mm_setzero_pd();
-       vcoul   = _mm_setzero_pd();
-       t1      = _mm_setzero_pd();
-       t2      = _mm_setzero_pd();
-       t3      = _mm_setzero_pd();
-       
-       jnr1=jnr2=0;
-       j13=j23=0;
+    
+  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+  gbtabscale = _mm_load1_pd(p_gbtabscale);  
+  facel      = _mm_load1_pd(p_facel);
+  
+  nj1         = 0;
+  jnrA = jnrB = 0;
+  j3A = j3B   = 0;
+  jx          = _mm_setzero_pd();
+  jy          = _mm_setzero_pd();
+  jz          = _mm_setzero_pd();
        
        for(n=0;n<nri;n++)
        {
-               is3     = 3*shift[n];
-               shX     = shiftvec[is3];
-               shY     = shiftvec[is3+1];
-               shZ     = shiftvec[is3+2];
-               
-               nj0     = jindex[n];      
-        nj1     = jindex[n+1];  
-               offset  = (nj1-nj0)%2;
+    is3              = 3*shift[n];     
+    shX              = shiftvec[is3];  
+    shY              = shiftvec[is3+1];
+    shZ              = shiftvec[is3+2];
+    nj0              = jindex[n];      
+    nj1              = jindex[n+1];    
+    ii               = iinr[n];        
+    ii3              = 3*ii;           
                
-               ii      = iinr[n];
-               ii3     = ii*3;
-               
-               ix      = _mm_set1_pd(shX+pos[ii3+0]);
-               iy      = _mm_set1_pd(shX+pos[ii3+1]);
-               iz      = _mm_set1_pd(shX+pos[ii3+2]); 
-               q       = _mm_set1_pd(charge[ii]);
-               
-               iq      = _mm_mul_pd(fac,q); 
-               isai_d  = invsqrta[ii];
-               isai    = _mm_load1_pd(&isai_d);
-               
-               fix     = _mm_setzero_pd();
-               fiy     = _mm_setzero_pd();
-               fiz     = _mm_setzero_pd();
-               dvdasum = _mm_setzero_pd();
-               vctot   = _mm_setzero_pd();
-               vgbtot  = _mm_setzero_pd();
-               
-               for(k=nj0;k<nj1-offset; k+=2)
+               ix               = _mm_set1_pd(shX+pos[ii3+0]);
+               iy               = _mm_set1_pd(shY+pos[ii3+1]);
+               iz               = _mm_set1_pd(shZ+pos[ii3+2]);
+    
+               iq               = _mm_load1_pd(charge+ii);
+               iq               = _mm_mul_pd(iq,facel);
+    
+               isai             = _mm_load1_pd(invsqrta+ii);
+                       
+               vctot            = _mm_setzero_pd();
+               vgbtot           = _mm_setzero_pd();
+               dvdasum          = _mm_setzero_pd();
+               fix              = _mm_setzero_pd();
+               fiy              = _mm_setzero_pd();
+               fiz              = _mm_setzero_pd();
+                
+               for(k=nj0;k<nj1-1; k+=2)
                {
-                       jnr1    = jjnr[k];
-                       jnr2    = jjnr[k+1];
-                       
-                       j13     = jnr1 * 3;
-                       j23     = jnr2 * 3;
-                       
-                       /* Load coordinates */
-                       xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
-                       xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */
-                       
-                       xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
-                       xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */
-                       
-                       /* transpose */
-                       jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); 
-                       
-                       /* distances */
-                       dx      = _mm_sub_pd(ix,jx);
-                       dy              = _mm_sub_pd(iy,jy);
-                       dz              = _mm_sub_pd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
-                       
-                       /* Load invsqrta */
-                       isaj    = _mm_loadl_pd(isaj,invsqrta+jnr1);
-                       isaj    = _mm_loadh_pd(isaj,invsqrta+jnr2);
-                       isaprod = _mm_mul_pd(isai,isaj);
-                       
-                       /* Load charges */
-                       q               = _mm_loadl_pd(q,charge+jnr1);
-                       q               = _mm_loadh_pd(q,charge+jnr2);
-                       qq              = _mm_mul_pd(iq,q);
-                       
-                       vcoul   = _mm_mul_pd(qq,rinv);
-                       fscal   = _mm_mul_pd(vcoul,rinv);
-                       qq              = _mm_mul_pd(isaprod,qq);
-                       qq              = _mm_mul_pd(qq,neg);
-                       gbscale = _mm_mul_pd(isaprod,gbtabscale);
-                       
-                       /* Load dvdaj */
-                       dvdaj   = _mm_loadl_pd(dvdaj, dvda+jnr1);
-                       dvdaj   = _mm_loadh_pd(dvdaj, dvda+jnr2);
-                       
-                       r               = _mm_mul_pd(rsq11,rinv);
-                       rt              = _mm_mul_pd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_pd(rt,n0d);
-                       eps2    = _mm_mul_pd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G               = _mm_mul_pd(G,eps);
-                       H               = _mm_mul_pd(H,eps2);
-                       Fp              = _mm_add_pd(F,G);
-                       Fp              = _mm_add_pd(Fp,H);
-                       VV              = _mm_mul_pd(Fp,eps);
-                       VV              = _mm_add_pd(Y,VV);
-                       H               = _mm_mul_pd(two,H);
-                       FF              = _mm_add_pd(Fp,G);
-                       FF              = _mm_add_pd(FF,H);
-                       vgb             = _mm_mul_pd(qq,VV);
-                       fijC    = _mm_mul_pd(qq,FF);
-                       fijC    = _mm_mul_pd(fijC,gbscale);
-                       
-                       dvdatmp = _mm_mul_pd(fijC,r);
-                       dvdatmp = _mm_add_pd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_pd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_pd(dvdatmp,half);
-                       dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_pd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_pd(xmm1,isaj);
-                       dvdaj   = _mm_add_pd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       _mm_storeh_pd(dvda+jnr2,dvdaj);
-                       
-                       vctot   = _mm_add_pd(vctot,vcoul);
-                       vgbtot  = _mm_add_pd(vgbtot,vgb);
-                       
-                       fscal   = _mm_sub_pd(fijC,fscal);
-                       fscal   = _mm_mul_pd(fscal,neg);
-                       fscal   = _mm_mul_pd(fscal,rinv);
-                       
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_pd(fscal,dx);
-                       t2              = _mm_mul_pd(fscal,dy);
-                       t3              = _mm_mul_pd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_pd(fix,t1);
-                       fiy             = _mm_add_pd(fiy,t2);
-                       fiz             = _mm_add_pd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm1    = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
-                       xmm2    = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-                       
-                       xmm5    = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
-                       xmm6    = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-                       
-                       /* transpose */
-                       xmm7    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
-                       xmm5    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
-                       xmm6    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_pd(xmm5,t1);
-                       xmm6    = _mm_sub_pd(xmm6,t2);
-                       xmm7    = _mm_sub_pd(xmm7,t3);
-                       
-                       xmm1    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
-                       xmm2    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* store fx and fy */
-                       _mm_storeu_pd(faction+j13,xmm1);
-                       _mm_storeu_pd(faction+j23,xmm2);
-                       
-                       /* .. then fz */
-                       _mm_storel_pd(faction+j13+2,xmm7);
-                       _mm_storeh_pd(faction+j23+2,xmm7);
+                       jnrA    = jjnr[k];
+                       jnrB    = jjnr[k+1];
+                       
+                       j3A     = jnrA * 3;
+                       j3B     = jnrB * 3;
+      
+      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+            
+                       dx           = _mm_sub_pd(ix,jx);
+                       dy           = _mm_sub_pd(iy,jy);
+                       dz           = _mm_sub_pd(iz,jz);
+            
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_pd(rinv,rinv);
+      
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+                       GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+            
+                       isaprod      = _mm_mul_pd(isai,isaj);
+                       qq           = _mm_mul_pd(iq,jq);            
+                       vcoul        = _mm_mul_pd(qq,rinv);
+                       fscal        = _mm_mul_pd(vcoul,rinv);                                 
+      vctot        = _mm_add_pd(vctot,vcoul);
+            
+            /* Polarization interaction */
+                       qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_pd(isaprod,gbtabscale);
+            
+                       /* Calculate GB table index */
+                       r            = _mm_mul_pd(rsq,rinv);
+                       rtab         = _mm_mul_pd(r,gbscale);
+                       
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
+      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+      vgb     = _mm_mul_pd(Y, qq);           
+      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb);
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+      
+      fscal        = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
+      
+      /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_pd(fscal,dx);
+      ty           = _mm_mul_pd(fscal,dy);
+      tz           = _mm_mul_pd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_pd(fix,tx);
+      fiy          = _mm_add_pd(fiy,ty);
+      fiz          = _mm_add_pd(fiz,tz);
+      
+      /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
                
                /* In double precision, offset can only be either 0 or 1 */
-               if(offset!=0)
+               if(k<nj1)
                {
-                       jnr1    = jjnr[k];
-                       j13             = jnr1*3;
-                       
-                       jx      = _mm_load_sd(pos+j13);
-                       jy      = _mm_load_sd(pos+j13+1);
-                       jz      = _mm_load_sd(pos+j13+2);
-                       
-                       isaj    = _mm_load_sd(invsqrta+jnr1);
-                       isaprod = _mm_mul_sd(isai,isaj);
-                       dvdaj   = _mm_load_sd(dvda+jnr1);
-                       q               = _mm_load_sd(charge+jnr1);
-                       qq      = _mm_mul_sd(iq,q);
-                       
-                       dx      = _mm_sub_sd(ix,jx);
-                       dy              = _mm_sub_sd(iy,jy);
-                       dz              = _mm_sub_sd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
-                       
-                       vcoul   = _mm_mul_sd(qq,rinv);
-                       fscal   = _mm_mul_sd(vcoul,rinv);
-                       qq              = _mm_mul_sd(isaprod,qq);
-                       qq              = _mm_mul_sd(qq,neg);
-                       gbscale = _mm_mul_sd(isaprod,gbtabscale);
-                       
-                       r               = _mm_mul_sd(rsq11,rinv);
-                       rt              = _mm_mul_sd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_sd(rt,n0d);
-                       eps2    = _mm_mul_sd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); 
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); 
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); 
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); 
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); 
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); 
-                       
-                       G               = _mm_mul_sd(G,eps);
-                       H               = _mm_mul_sd(H,eps2);
-                       Fp              = _mm_add_sd(F,G);
-                       Fp              = _mm_add_sd(Fp,H);
-                       VV              = _mm_mul_sd(Fp,eps);
-                       VV              = _mm_add_sd(Y,VV);
-                       H               = _mm_mul_sd(two,H);
-                       FF              = _mm_add_sd(Fp,G);
-                       FF              = _mm_add_sd(FF,H);
-                       vgb             = _mm_mul_sd(qq,VV);
-                       fijC    = _mm_mul_sd(qq,FF);
-                       fijC    = _mm_mul_sd(fijC,gbscale);
-                       
-                       dvdatmp = _mm_mul_sd(fijC,r);
-                       dvdatmp = _mm_add_sd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_sd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_sd(dvdatmp,half);
-                       dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_sd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_sd(xmm1,isaj);
-                       dvdaj   = _mm_add_sd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       
-                       vctot   = _mm_add_sd(vctot,vcoul);
-                       vgbtot  = _mm_add_sd(vgbtot,vgb);
-                       
-                       fscal   = _mm_sub_sd(fijC,fscal);
-                       fscal   = _mm_mul_sd(fscal,neg);
-                       fscal   = _mm_mul_sd(fscal,rinv);
-                       
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_sd(fscal,dx);
-                       t2              = _mm_mul_sd(fscal,dy);
-                       t3              = _mm_mul_sd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_sd(fix,t1);
-                       fiy             = _mm_add_sd(fiy,t2);
-                       fiz             = _mm_add_sd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm5    = _mm_load_sd(faction+j13);   /* fx */
-                       xmm6    = _mm_load_sd(faction+j13+1); /* fy */
-                       xmm7    = _mm_load_sd(faction+j13+2); /* fz */
-                       
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_sd(xmm5,t1);
-                       xmm6    = _mm_sub_sd(xmm6,t2);
-                       xmm7    = _mm_sub_sd(xmm7,t3);
-                       
-                       /* store forces */
-                       _mm_store_sd(faction+j13,xmm5);
-                       _mm_store_sd(faction+j13+1,xmm6);
-                       _mm_store_sd(faction+j13+2,xmm7);
+                       jnrA    = jjnr[k];
+                       j3A     = jnrA * 3;
+      
+      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+      
+                       dx           = _mm_sub_sd(ix,jx);
+                       dy           = _mm_sub_sd(iy,jy);
+                       dz           = _mm_sub_sd(iz,jz);
+      
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_sd(rinv,rinv);
+      
+      /* The reason for zeroing these variables here is to fix bug 585.
+       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0]
+       * and r1=0, but it should be r1=a[1].
+       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+       * To work around it, we zero these variables and use _mm_add_pd (**) instead.
+       * Note that the only variables that get affected are the energies, since
+       * the total sum needs to be correct.
+       */
+      vgb          = _mm_setzero_pd();
+      vcoul        = _mm_setzero_pd();
+      dvdatmp      = _mm_setzero_pd();
+      
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+                       GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+      
+                       isaprod      = _mm_mul_sd(isai,isaj);
+                       qq           = _mm_mul_sd(jq,iq);            
+                       vcoul        = _mm_mul_sd(qq,rinv);
+                       fscal        = _mm_mul_sd(vcoul,rinv);                                 
+      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
+      
+      /* Polarization interaction */
+                       qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_sd(isaprod,gbtabscale);
+      
+                       /* Calculate GB table index */
+                       r            = _mm_mul_sd(rsq,rinv);
+                       rtab         = _mm_mul_sd(r,gbscale);
+      
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
+      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+      vgb     = _mm_mul_sd(Y, qq);           
+      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+                       
+      fscal        = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
+      
+      /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_sd(fscal,dx);
+      ty           = _mm_mul_sd(fscal,dy);
+      tz           = _mm_mul_sd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_sd(fix,tx);
+      fiy          = _mm_add_sd(fiy,ty);
+      fiz          = _mm_add_sd(fiz,tz);
+      
+      /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-               /* fix/fiy/fiz now contain four partial terms, that all should be
-                * added to the i particle forces
-                */
-               t1               = _mm_unpacklo_pd(t1,fix);
-               t2               = _mm_unpacklo_pd(t2,fiy);
-               t3               = _mm_unpacklo_pd(t3,fiz);
-               
-               fix              = _mm_add_pd(fix,t1);
-               fiy              = _mm_add_pd(fiy,t2);
-               fiz              = _mm_add_pd(fiz,t3);
-               
-               fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
-               fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
-               fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-               
-               /* Load i forces from memory */
-               xmm1     = _mm_load_sd(faction+ii3);
-               xmm2     = _mm_load_sd(faction+ii3+1);
-               xmm3     = _mm_load_sd(faction+ii3+2);
-               
-               /* Add to i force */
-               fix      = _mm_add_sd(fix,xmm1);
-               fiy      = _mm_add_sd(fiy,xmm2);
-               fiz      = _mm_add_sd(fiz,xmm3);
-               
-               /* store i forces to memory */
-               _mm_store_sd(faction+ii3,fix);
-               _mm_store_sd(faction+ii3+1,fiy);
-               _mm_store_sd(faction+ii3+2,fiz);
-               
-               /* now do dvda */
-               dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
-               dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
-               _mm_storeh_pd(&dva,dvdasum);
-               dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-               
-               ggid     = gid[n];
-               
-               /* Coulomb potential */
-               vcoul    = _mm_unpacklo_pd(vcoul,vctot);
-               vctot    = _mm_add_pd(vctot,vcoul);
-               _mm_storeh_pd(&vct,vctot);
-               Vc[ggid] = Vc[ggid] + vct;
-               
-               /* GB potential */
-               vgb      = _mm_unpacklo_pd(vgb,vgbtot);
-               vgbtot   = _mm_add_pd(vgbtot,vgb);
-               _mm_storeh_pd(&vgbt,vgbtot);
-               gpol[ggid] = gpol[ggid] + vgbt;
-       }
-       
-       *outeriter   = nri;            
-    *inneriter   = nj1;        
-}
-
-
-/*
- * Gromacs nonbonded kernel nb_kernel400nf
- * Coulomb interaction:     Generalized-Born
- * VdW interaction:         Not calculated
- * water optimization:      No
- * Calculate forces:        no
- */
-void nb_kernel400nf_sse2_double(
-                    int *           p_nri,
-                    int *           iinr,
-                    int *           jindex,
-                    int *           jjnr,
-                    int *           shift,
-                    double *         shiftvec,
-                    double *         fshift,
-                    int *           gid,
-                    double *         pos,
-                    double *         faction,
-                    double *         charge,
-                    double *         p_facel,
-                    double *         p_krf,
-                    double *         p_crf,
-                    double *         Vc,
-                    int *           type,
-                    int *           p_ntype,
-                    double *         vdwparam,
-                    double *         Vvdw,
-                    double *         p_tabscale,
-                    double *         VFtab,
-                    double *         invsqrta,
-                    double *         dvda,
-                    double *         p_gbtabscale,
-                    double *         GBtab,
-                    int *           p_nthreads,
-                    int *           count,
-                    void *          mtx,
-                    int *           outeriter,
-                    int *           inneriter,
-                    double *         work)
-{
-    int           nri,ntype,nthreads;
-    double         facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
-    int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
-    double         shX,shY,shZ;
-    double         iq;
-    double         qq,vcoul,vctot;
-    double         r,rt,eps,eps2;
-    int           n0,nnn;
-    double         Y,F,Geps,Heps2,Fp,VV;
-    double         isai,isaj,isaprod,gbscale;
-    double         ix1,iy1,iz1;
-    double         jx1,jy1,jz1;
-    double         dx11,dy11,dz11,rsq11,rinv11;
-    const int     fractshift = 12;
-    const int     fractmask = 8388607;
-    const int     expshift = 23;
-    const int     expmask = 2139095040;
-    const int     explsb = 8388608;
-    double         lu;
-    int           iexp,addr;
-    union { unsigned int bval; double fval; } bitpattern,result;
-       
-    nri              = *p_nri;         
-    ntype            = *p_ntype;       
-    nthreads         = *p_nthreads;    
-    facel            = *p_facel;       
-    krf              = *p_krf;         
-    crf              = *p_crf;         
-    tabscale         = *p_tabscale;    
-    gbtabscale       = *p_gbtabscale;  
-    nj1              = 0;              
+    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
     
-    for(n=0; (n<nri); n++)
-    {
-        is3              = 3*shift[n];     
-        shX              = shiftvec[is3];  
-        shY              = shiftvec[is3+1];
-        shZ              = shiftvec[is3+2];
-        nj0              = jindex[n];      
-        nj1              = jindex[n+1];    
-        ii               = iinr[n];        
-        ii3              = 3*ii;           
-        ix1              = shX + pos[ii3+0];
-        iy1              = shY + pos[ii3+1];
-        iz1              = shZ + pos[ii3+2];
-        iq               = facel*charge[ii];
-        isai             = invsqrta[ii];   
-        vctot            = 0;              
-        
-        for(k=nj0; (k<nj1); k++)
-        {
-            jnr              = jjnr[k];        
-            j3               = 3*jnr;          
-            jx1              = pos[j3+0];      
-            jy1              = pos[j3+1];      
-            jz1              = pos[j3+2];      
-            dx11             = ix1 - jx1;      
-            dy11             = iy1 - jy1;      
-            dz11             = iz1 - jz1;      
-            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
-            bitpattern.fval  = rsq11;          
-            iexp             = (((bitpattern.bval)&expmask)>>expshift);
-            addr             = (((bitpattern.bval)&(fractmask|explsb))>>fractshift);
-            result.bval      = gmx_invsqrt_exptab[iexp] | gmx_invsqrt_fracttab[addr];
-            lu               = result.fval;    
-            rinv11           = (0.5*lu*(3.0-((rsq11*lu)*lu)));
-            isaj             = invsqrta[jnr];  
-            isaprod          = isai*isaj;      
-            qq               = iq*charge[jnr]; 
-            vcoul            = qq*rinv11;      
-            qq               = isaprod*(-qq);  
-            gbscale          = isaprod*gbtabscale;
-            r                = rsq11*rinv11;   
-            rt               = r*gbscale;      
-            n0               = rt;             
-            eps              = rt-n0;          
-            eps2             = eps*eps;        
-            nnn              = 4*n0;           
-            Y                = GBtab[nnn];     
-            F                = GBtab[nnn+1];   
-            Geps             = eps*GBtab[nnn+2];
-            Heps2            = eps2*GBtab[nnn+3];
-            Fp               = F+Geps+Heps2;   
-            VV               = Y+eps*Fp;       
-            vgb              = qq*VV;          
-            vctot            = vctot + vcoul;  
-        }
-        
-        ggid             = gid[n];         
-        Vc[ggid]         = Vc[ggid] + vctot;
-    }
+    ggid     = gid[n];         
     
-    *outeriter       = nri;            
-    *inneriter       = nj1;            
+    gmx_mm_update_1pot_pd(vctot,vc+ggid);
+    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+  }
+  
+       *outeriter   = nri;            
+  *inneriter   = nj1;  
 }
-
-
index d5e17e7f790598ec1b8edf8c7a4c4468e898ea29..2a65f5208be7cf501a0f9b338e2b8a7375cfa64b 100644 (file)
 #include "../nb_kerneltype.h"
 
 
+
 void nb_kernel410_sse2_double(int *           p_nri,
-                    int *           iinr,
-                    int *           jindex,
-                    int *           jjnr,
-                    int *           shift,
-                    double *         shiftvec,
-                    double *         fshift,
-                    int *           gid,
-                    double *         pos,
-                    double *         faction,
-                    double *         charge,
-                    double *         p_facel,
-                    double *         p_krf,
-                    double *         p_crf,
-                    double *         Vc,
-                    int *           type,
-                    int *           p_ntype,
-                    double *         vdwparam,
-                    double *         Vvdw,
-                    double *         p_tabscale,
-                    double *         VFtab,
-                    double *         invsqrta,
-                    double *         dvda,
-                    double *         p_gbtabscale,
-                    double *         GBtab,
-                    int *           p_nthreads,
-                    int *           count,
-                    void *          mtx,
-                    int *           outeriter,
-                    int *           inneriter,
-                    double *         work)
+                                                       int *           iinr,
+                                                       int *           jindex,
+                                                       int *           jjnr,
+                                                       int *           shift,
+                                                       double *         shiftvec,
+                                                       double *         fshift,
+                                                       int *           gid,
+                                                       double *         pos,
+                                                       double *         faction,
+                                                       double *         charge,
+                                                       double *         p_facel,
+                                                       double *         p_krf,
+                                                       double *         p_crf,
+                                                       double *         vc,
+                                                       int *           type,
+                                                       int *           p_ntype,
+                                                       double *         vdwparam,
+                                                       double *         vvdw,
+                                                       double *         p_tabscale,
+                                                       double *         VFtab,
+                                                       double *         invsqrta,
+                                                       double *         dvda,
+                                                       double *         p_gbtabscale,
+                                                       double *         GBtab,
+                                                       int *           p_nthreads,
+                                                       int *           count,
+                                                       void *          mtx,
+                                                       int *           outeriter,
+                                                       int *           inneriter,
+                                                       double *         work)
 {
-       int           nri,ntype,nthreads,offset,tj,tj2,nti;
-       int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
-       double        facel,krf,crf,tabscl,gbtabscl,vct,vdwt,nt1,nt2;
-       double        shX,shY,shZ,isai_d,dva,vgbt;
+  int           nri,ntype,nthreads;
+  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+  double        shX,shY,shZ;
+       int                       offset,nti;
+  int           jnrA,jnrB;
+  int           j3A,j3B;
+       int           tjA,tjB;
        gmx_gbdata_t *gbdata;
-       float *        gpol;
-
-       __m128d       ix,iy,iz,jx,jy,jz;
-       __m128d           dx,dy,dz,t1,t2,t3;
-       __m128d           fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
-       __m128d           q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
-       __m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
-       __m128d           xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
-       __m128d       c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,rinvsq,rinvsix;
-       __m128d       fac,tabscale,gbtabscale;
-       __m128i       n0,nnn;
+       double *      gpol;
+    
+       __m128d  iq,qq,jq,isai;
+       __m128d  ix,iy,iz;
+       __m128d  jx,jy,jz;
+       __m128d  dx,dy,dz;
+       __m128d  vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
+       __m128d  fix,fiy,fiz,tx,ty,tz,rsq;
+       __m128d  rinv,isaj,isaprod;
+       __m128d  vcoul,fscal,gbscale,c6,c12;
+       __m128d  rinvsq,r,rtab;
+       __m128d  eps,Y,F,G,H;
+       __m128d  vgb,fijGB,dvdatmp;
+       __m128d  rinvsix,vvdw6,vvdw12;
+       __m128d  facel,gbtabscale,dvdaj;
+       __m128i  n0, nnn;
        
-       const __m128d neg    = {-1.0,-1.0};
-       const __m128d zero   = {0.0,0.0};
-       const __m128d half   = {0.5,0.5};
-       const __m128d two    = {2.0,2.0};
-       const __m128d three  = {3.0,3.0};
-       const __m128d six    = {6.0,6.0};
-       const __m128d twelwe = {12.0,12.0};
+       const __m128d neg        = _mm_set1_pd(-1.0);
+       const __m128d zero       = _mm_set1_pd(0.0);
+       const __m128d minushalf  = _mm_set1_pd(-0.5);
+       const __m128d two        = _mm_set1_pd(2.0);
+       const __m128d six        = _mm_set1_pd(6.0);
+       const __m128d twelve     = _mm_set1_pd(12.0);
        
        gbdata     = (gmx_gbdata_t *)work;
        gpol       = gbdata->gpol;
-       
+
        nri        = *p_nri;
        ntype      = *p_ntype;
-       nthreads   = *p_nthreads; 
-    facel      = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));       
-       krf        = *p_krf;
-       crf        = *p_crf;
-       tabscl     = *p_tabscale;
-       gbtabscl   = *p_gbtabscale;
-       nj1        = 0;
-       
-       /* Splat variables */
-       fac        = _mm_load1_pd(&facel);
-       tabscale   = _mm_load1_pd(&tabscl);
-       gbtabscale = _mm_load1_pd(&gbtabscl);
-       
-       /* Keep compiler happy */
-       Vvdwtmp = _mm_setzero_pd();
-       Vvdwtot = _mm_setzero_pd();
-       dvdatmp = _mm_setzero_pd();
-       dvdaj   = _mm_setzero_pd();
-       isaj    = _mm_setzero_pd();
-       vcoul   = _mm_setzero_pd();
-       vgb     = _mm_setzero_pd();
-       t1      = _mm_setzero_pd();
-       t2      = _mm_setzero_pd();
-       t3      = _mm_setzero_pd();
-       xmm1    = _mm_setzero_pd();
-       xmm2    = _mm_setzero_pd();
-       xmm3    = _mm_setzero_pd();
-       xmm4    = _mm_setzero_pd();
-       jnr1    = jnr2 = 0;
-       j13     = j23  = 0;
+    
+  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+  gbtabscale = _mm_load1_pd(p_gbtabscale);  
+  facel      = _mm_load1_pd(p_facel);
+  
+  nj1         = 0;
+  jnrA = jnrB = 0;
+  j3A = j3B   = 0;
+  jx          = _mm_setzero_pd();
+  jy          = _mm_setzero_pd();
+  jz          = _mm_setzero_pd();
+  c6          = _mm_setzero_pd();
+  c12         = _mm_setzero_pd();
        
        for(n=0;n<nri;n++)
        {
-               is3     = 3*shift[n];
-               shX     = shiftvec[is3];
-               shY     = shiftvec[is3+1];
-               shZ     = shiftvec[is3+2];
-               
-               nj0     = jindex[n];      
-        nj1     = jindex[n+1];  
-               offset  = (nj1-nj0)%2;
-               
-               ii      = iinr[n];
-               ii3     = ii*3;
-               
-               ix      = _mm_set1_pd(shX+pos[ii3+0]);
-               iy      = _mm_set1_pd(shX+pos[ii3+1]);
-               iz      = _mm_set1_pd(shX+pos[ii3+2]); 
-               q       = _mm_set1_pd(charge[ii]);
+    is3              = 3*shift[n];     
+    shX              = shiftvec[is3];  
+    shY              = shiftvec[is3+1];
+    shZ              = shiftvec[is3+2];
+    nj0              = jindex[n];      
+    nj1              = jindex[n+1];    
+    ii               = iinr[n];        
+    ii3              = 3*ii;           
                
-               iq      = _mm_mul_pd(fac,q); 
-               isai_d  = invsqrta[ii];
-               isai    = _mm_load1_pd(&isai_d);
-               
-               nti      = 2*ntype*type[ii];
-               
-               fix     = _mm_setzero_pd();
-               fiy     = _mm_setzero_pd();
-               fiz     = _mm_setzero_pd();
-               dvdasum = _mm_setzero_pd();
-               vctot   = _mm_setzero_pd();
-               vgbtot  = _mm_setzero_pd();
-               Vvdwtot = _mm_setzero_pd();
+               ix               = _mm_set1_pd(shX+pos[ii3+0]);
+               iy               = _mm_set1_pd(shY+pos[ii3+1]);
+               iz               = _mm_set1_pd(shZ+pos[ii3+2]);
+    
+               iq               = _mm_load1_pd(charge+ii);
+               iq               = _mm_mul_pd(iq,facel);
+    
+               isai             = _mm_load1_pd(invsqrta+ii);
+        
+               nti              = 2*ntype*type[ii];
                
-               for(k=nj0;k<nj1-offset; k+=2)
+               vctot            = _mm_setzero_pd();
+               vvdwtot          = _mm_setzero_pd();
+               vgbtot           = _mm_setzero_pd();
+               dvdasum          = _mm_setzero_pd();
+               fix              = _mm_setzero_pd();
+               fiy              = _mm_setzero_pd();
+               fiz              = _mm_setzero_pd();
+        
+               for(k=nj0;k<nj1-1; k+=2)
                {
-                       jnr1    = jjnr[k];
-                       jnr2    = jjnr[k+1];
-                       
-                       j13     = jnr1 * 3;
-                       j23     = jnr2 * 3;
-                       
-                       /* Load coordinates */
-                       xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
-                       xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */
-                       
-                       xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
-                       xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */
-                       
-                       /* transpose */
-                       jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); 
-                       
-                       /* distances */
-                       dx      = _mm_sub_pd(ix,jx);
-                       dy              = _mm_sub_pd(iy,jy);
-                       dz              = _mm_sub_pd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
-                       
-                       /* Load invsqrta */
-                       isaj    = _mm_loadl_pd(isaj,invsqrta+jnr1);
-                       isaj    = _mm_loadh_pd(isaj,invsqrta+jnr2);
-                       isaprod = _mm_mul_pd(isai,isaj);
-                       
-                       /* Load charges */
-                       q               = _mm_loadl_pd(q,charge+jnr1);
-                       q               = _mm_loadh_pd(q,charge+jnr2);
-                       qq              = _mm_mul_pd(iq,q);
-                       
-                       vcoul   = _mm_mul_pd(qq,rinv);
-                       fscal   = _mm_mul_pd(vcoul,rinv);
-                       qq              = _mm_mul_pd(isaprod,qq);
-                       qq              = _mm_mul_pd(qq,neg);
-                       gbscale = _mm_mul_pd(isaprod,gbtabscale);
-                       
-                       /* Load VdW parameters */
-                       tj      = nti+2*type[jnr1];
-                       tj2     = nti+2*type[jnr2];
+                       jnrA    = jjnr[k];
+                       jnrB    = jjnr[k+1];
                        
-                       xmm1      = _mm_loadu_pd(vdwparam+tj);
-                       xmm2     = _mm_loadu_pd(vdwparam+tj2);
-                       c6      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
-                       c12     = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
-                       
-                       rinvsq  = _mm_mul_pd(rinv,rinv);
-                       
-                       /* Load dvdaj */
-                       dvdaj   = _mm_loadl_pd(dvdaj, dvda+jnr1);
-                       dvdaj   = _mm_loadh_pd(dvdaj, dvda+jnr2);
-                       
-                       r               = _mm_mul_pd(rsq11,rinv);
-                       rt              = _mm_mul_pd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_pd(rt,n0d);
-                       eps2    = _mm_mul_pd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G               = _mm_mul_pd(G,eps);
-                       H               = _mm_mul_pd(H,eps2);
-                       Fp              = _mm_add_pd(F,G);
-                       Fp              = _mm_add_pd(Fp,H);
-                       VV              = _mm_mul_pd(Fp,eps);
-                       VV              = _mm_add_pd(Y,VV);
-                       H               = _mm_mul_pd(two,H);
-                       FF              = _mm_add_pd(Fp,G);
-                       FF              = _mm_add_pd(FF,H);
-                       vgb             = _mm_mul_pd(qq,VV);
-                       fijC    = _mm_mul_pd(qq,FF);
-                       fijC    = _mm_mul_pd(fijC,gbscale);
-                       
-                       dvdatmp = _mm_mul_pd(fijC,r);
-                       dvdatmp = _mm_add_pd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_pd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_pd(dvdatmp,half);
-                       dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_pd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_pd(xmm1,isaj);
-                       dvdaj   = _mm_add_pd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       _mm_storeh_pd(dvda+jnr2,dvdaj);
-                       
-                       vctot   = _mm_add_pd(vctot,vcoul);
-                       vgbtot  = _mm_add_pd(vgbtot,vgb);
-                       
-                       /* VdW interaction */
-                       rinvsix = _mm_mul_pd(rinvsq,rinvsq);
-                       rinvsix = _mm_mul_pd(rinvsix,rinvsq);
-                       
-                       Vvdw6   = _mm_mul_pd(c6,rinvsix);
-                       Vvdw12  = _mm_mul_pd(c12,rinvsix);
-                       Vvdw12  = _mm_mul_pd(Vvdw12,rinvsix);
-                       Vvdwtmp = _mm_sub_pd(Vvdw12,Vvdw6);
-                       Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
-                       
-                       xmm1    = _mm_mul_pd(twelwe,Vvdw12);
-                       xmm2    = _mm_mul_pd(six,Vvdw6);
-                       xmm1    = _mm_sub_pd(xmm1,xmm2);
-                       xmm1    = _mm_mul_pd(xmm1,rinvsq);
-                       
-                       /* Scalar force */
-                       fscal   = _mm_sub_pd(fijC,fscal);
-                       fscal   = _mm_mul_pd(fscal,rinv);
-                       fscal   = _mm_sub_pd(xmm1,fscal);
-                       
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_pd(fscal,dx);
-                       t2              = _mm_mul_pd(fscal,dy);
-                       t3              = _mm_mul_pd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_pd(fix,t1);
-                       fiy             = _mm_add_pd(fiy,t2);
-                       fiz             = _mm_add_pd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm1    = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
-                       xmm2    = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-                       
-                       xmm5    = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
-                       xmm6    = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-                       
-                       /* transpose */
-                       xmm7    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
-                       xmm5    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
-                       xmm6    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_pd(xmm5,t1);
-                       xmm6    = _mm_sub_pd(xmm6,t2);
-                       xmm7    = _mm_sub_pd(xmm7,t3);
-                       
-                       xmm1    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
-                       xmm2    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* store fx and fy */
-                       _mm_storeu_pd(faction+j13,xmm1);
-                       _mm_storeu_pd(faction+j23,xmm2);
-                       
-                       /* .. then fz */
-                       _mm_storel_pd(faction+j13+2,xmm7);
-                       _mm_storeh_pd(faction+j23+2,xmm7);
+                       j3A     = jnrA * 3;
+                       j3B     = jnrB * 3;
+
+      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+
+                       dx           = _mm_sub_pd(ix,jx);
+                       dy           = _mm_sub_pd(iy,jy);
+                       dz           = _mm_sub_pd(iz,jz);
+
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_pd(rinv,rinv);
+      
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+                       GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+            
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
+                       tjB          = nti+2*type[jnrB];
+      
+      GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+                       
+                       isaprod      = _mm_mul_pd(isai,isaj);
+                       qq           = _mm_mul_pd(iq,jq);            
+                       vcoul        = _mm_mul_pd(qq,rinv);
+                       fscal        = _mm_mul_pd(vcoul,rinv);                                 
+      vctot        = _mm_add_pd(vctot,vcoul);
+            
+      /* Polarization interaction */
+                       qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_pd(isaprod,gbtabscale);
+      
+                       /* Calculate GB table index */
+                       r            = _mm_mul_pd(rsq,rinv);
+                       rtab         = _mm_mul_pd(r,gbscale);
+                       
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
+      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+      vgb     = _mm_mul_pd(Y, qq);           
+      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb);
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+                       
+                       rinvsix      = _mm_mul_pd(rinvsq,rinvsq);
+                       rinvsix      = _mm_mul_pd(rinvsix,rinvsq);
+                       
+                       vvdw6        = _mm_mul_pd(c6,rinvsix);
+                       vvdw12       = _mm_mul_pd(c12, _mm_mul_pd(rinvsix,rinvsix));
+                       vvdwtot      = _mm_add_pd(vvdwtot,_mm_sub_pd(vvdw12,vvdw6));
+            
+      fscal        = _mm_sub_pd(_mm_mul_pd(rinvsq, 
+                                           _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
+                                                      _mm_mul_pd(six,vvdw6))),
+                                _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
+      
+      /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_pd(fscal,dx);
+      ty           = _mm_mul_pd(fscal,dy);
+      tz           = _mm_mul_pd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_pd(fix,tx);
+      fiy          = _mm_add_pd(fiy,ty);
+      fiz          = _mm_add_pd(fiz,tz);
+      
+      /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
                
                /* In double precision, offset can only be either 0 or 1 */
-               if(offset!=0)
+               if(k<nj1)
                {
-                       jnr1    = jjnr[k];
-                       j13             = jnr1*3;
-                       
-                       jx      = _mm_load_sd(pos+j13);
-                       jy      = _mm_load_sd(pos+j13+1);
-                       jz      = _mm_load_sd(pos+j13+2);
-                       
-                       isaj    = _mm_load_sd(invsqrta+jnr1);
-                       isaprod = _mm_mul_sd(isai,isaj);
-                       dvdaj   = _mm_load_sd(dvda+jnr1);
-                       q               = _mm_load_sd(charge+jnr1);
-                       qq      = _mm_mul_sd(iq,q);
-                       
-                       dx      = _mm_sub_sd(ix,jx);
-                       dy              = _mm_sub_sd(iy,jy);
-                       dz              = _mm_sub_sd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
-                       
-                       vcoul   = _mm_mul_sd(qq,rinv);
-                       fscal   = _mm_mul_sd(vcoul,rinv);
-                       qq              = _mm_mul_sd(isaprod,qq);
-                       qq              = _mm_mul_sd(qq,neg);
-                       gbscale = _mm_mul_sd(isaprod,gbtabscale);
-                       
-                       /* Load VdW parameters */
-                       tj      = nti+2*type[jnr1];
-                       
-                       c6      = _mm_load_sd(vdwparam+tj);
-                       c12     = _mm_load_sd(vdwparam+tj+1);
-                       
-                       rinvsq  = _mm_mul_sd(rinv,rinv);
-                       
-                       r               = _mm_mul_sd(rsq11,rinv);
-                       rt              = _mm_mul_sd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_sd(rt,n0d);
-                       eps2    = _mm_mul_sd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); 
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); 
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); 
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); 
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); 
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); 
-                       
-                       G               = _mm_mul_sd(G,eps);
-                       H               = _mm_mul_sd(H,eps2);
-                       Fp              = _mm_add_sd(F,G);
-                       Fp              = _mm_add_sd(Fp,H);
-                       VV              = _mm_mul_sd(Fp,eps);
-                       VV              = _mm_add_sd(Y,VV);
-                       H               = _mm_mul_sd(two,H);
-                       FF              = _mm_add_sd(Fp,G);
-                       FF              = _mm_add_sd(FF,H);
-                       vgb             = _mm_mul_sd(qq,VV);
-                       fijC    = _mm_mul_sd(qq,FF);
-                       fijC    = _mm_mul_sd(fijC,gbscale);
-                       
-                       dvdatmp = _mm_mul_sd(fijC,r);
-                       dvdatmp = _mm_add_sd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_sd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_sd(dvdatmp,half);
-                       dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_sd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_sd(xmm1,isaj);
-                       dvdaj   = _mm_add_sd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       
-                       vctot   = _mm_add_sd(vctot,vcoul);
-                       vgbtot  = _mm_add_sd(vgbtot,vgb);
-                       
-                       /* VdW interaction */
-                       rinvsix = _mm_mul_sd(rinvsq,rinvsq);
-                       rinvsix = _mm_mul_sd(rinvsix,rinvsq);
-                       
-                       Vvdw6   = _mm_mul_sd(c6,rinvsix);
-                       Vvdw12  = _mm_mul_sd(c12,rinvsix);
-                       Vvdw12  = _mm_mul_sd(Vvdw12,rinvsix);
-                       Vvdwtmp = _mm_sub_sd(Vvdw12,Vvdw6);
-                       Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);
-                       
-                       xmm1    = _mm_mul_sd(twelwe,Vvdw12);
-                       xmm2    = _mm_mul_sd(six,Vvdw6);
-                       xmm1    = _mm_sub_sd(xmm1,xmm2);
-                       xmm1    = _mm_mul_sd(xmm1,rinvsq);
-                       
-                       /* Scalar force */
-                       fscal   = _mm_sub_sd(fijC,fscal);
-                       fscal   = _mm_mul_sd(fscal,rinv);
-                       fscal   = _mm_sub_sd(xmm1,fscal);
-                       
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_sd(fscal,dx);
-                       t2              = _mm_mul_sd(fscal,dy);
-                       t3              = _mm_mul_sd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_sd(fix,t1);
-                       fiy             = _mm_add_sd(fiy,t2);
-                       fiz             = _mm_add_sd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm5    = _mm_load_sd(faction+j13);   /* fx */
-                       xmm6    = _mm_load_sd(faction+j13+1); /* fy */
-                       xmm7    = _mm_load_sd(faction+j13+2); /* fz */
-                       
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_sd(xmm5,t1);
-                       xmm6    = _mm_sub_sd(xmm6,t2);
-                       xmm7    = _mm_sub_sd(xmm7,t3);
-                       
-                       /* store forces */
-                       _mm_store_sd(faction+j13,xmm5);
-                       _mm_store_sd(faction+j13+1,xmm6);
-                       _mm_store_sd(faction+j13+2,xmm7);
+                       jnrA    = jjnr[k];
+                       
+                       j3A     = jnrA * 3;
+      
+      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+            
+                       dx           = _mm_sub_sd(ix,jx);
+                       dy           = _mm_sub_sd(iy,jy);
+                       dz           = _mm_sub_sd(iz,jz);
+            
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_sd(rinv,rinv);
+      
+      /* The reason for zeroing these variables here is to fix bug 585.
+       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+       * and r1=0, but it should be r1=a[1]. 
+       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+       * To work around it, we zero these variables and use _mm_add_pd (**) instead
+       * Note that the only variables that get affected are the energies since
+       * the total sum needs to be correct 
+       */
+      vgb          = _mm_setzero_pd();
+      vcoul        = _mm_setzero_pd();
+      dvdatmp      = _mm_setzero_pd();
+      vvdw6        = _mm_setzero_pd();
+      vvdw12       = _mm_setzero_pd();
+      
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+                       GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+      
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
+      
+      GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+                       
+                       isaprod      = _mm_mul_sd(isai,isaj);
+                       qq           = _mm_mul_sd(jq,iq);            
+                       vcoul        = _mm_mul_sd(qq,rinv);
+                       fscal        = _mm_mul_sd(vcoul,rinv);                                 
+      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
+      
+      /* Polarization interaction */
+                       qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_sd(isaprod,gbtabscale);
+      
+                       /* Calculate GB table index */
+                       r            = _mm_mul_sd(rsq,rinv);
+                       rtab         = _mm_mul_sd(r,gbscale);
+                       
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
+      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+      vgb     = _mm_mul_sd(Y, qq);           
+      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+                       
+                       rinvsix      = _mm_mul_sd(rinvsq,rinvsq);
+                       rinvsix      = _mm_mul_sd(rinvsix,rinvsq);
+                       
+                       vvdw6        = _mm_mul_sd(c6,rinvsix);
+                       vvdw12       = _mm_mul_sd(c12, _mm_mul_sd(rinvsix,rinvsix));
+                       vvdwtot      = _mm_add_pd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6)); /* (**) */
+      
+      fscal        = _mm_sub_sd(_mm_mul_sd(rinvsq, 
+                                           _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
+                                                      _mm_mul_sd(six,vvdw6))),
+                                _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
+      
+      /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_sd(fscal,dx);
+      ty           = _mm_mul_sd(fscal,dy);
+      tz           = _mm_mul_sd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_sd(fix,tx);
+      fiy          = _mm_add_sd(fiy,ty);
+      fiz          = _mm_add_sd(fiz,tz);
+      
+      /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-               /* fix/fiy/fiz now contain four partial terms, that all should be
-                * added to the i particle forces
-                */
-               t1               = _mm_unpacklo_pd(t1,fix);
-               t2               = _mm_unpacklo_pd(t2,fiy);
-               t3               = _mm_unpacklo_pd(t3,fiz);
-               
-               fix              = _mm_add_pd(fix,t1);
-               fiy              = _mm_add_pd(fiy,t2);
-               fiz              = _mm_add_pd(fiz,t3);
-               
-               fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
-               fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
-               fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-               
-               /* Load i forces from memory */
-               xmm1     = _mm_load_sd(faction+ii3);
-               xmm2     = _mm_load_sd(faction+ii3+1);
-               xmm3     = _mm_load_sd(faction+ii3+2);
-               
-               /* Add to i force */
-               fix      = _mm_add_sd(fix,xmm1);
-               fiy      = _mm_add_sd(fiy,xmm2);
-               fiz      = _mm_add_sd(fiz,xmm3);
-               
-               /* store i forces to memory */
-               _mm_store_sd(faction+ii3,fix);
-               _mm_store_sd(faction+ii3+1,fiy);
-               _mm_store_sd(faction+ii3+2,fiz);
-               
-               /* now do dvda */
-               dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
-               dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
-               _mm_storeh_pd(&dva,dvdasum);
-               dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-               
-               ggid     = gid[n];
-               
-               /* Coulomb potential */
-               vcoul    = _mm_unpacklo_pd(vcoul,vctot);
-               vctot    = _mm_add_pd(vctot,vcoul);
-               _mm_storeh_pd(&vct,vctot);
-               Vc[ggid] = Vc[ggid] + vct;
-               
-               /* VdW potential */
-               Vvdwtmp  = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
-               Vvdwtot  = _mm_add_pd(Vvdwtot,Vvdwtmp);
-               _mm_storeh_pd(&vdwt,Vvdwtot);
-               Vvdw[ggid] = Vvdw[ggid] + vdwt;
-               
-               /* GB potential */
-               vgb      = _mm_unpacklo_pd(vgb,vgbtot);
-               vgbtot   = _mm_add_pd(vgbtot,vgb);
-               _mm_storeh_pd(&vgbt,vgbtot);
-               gpol[ggid] = gpol[ggid] + vgbt;
+    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+    
+    ggid     = gid[n];         
+    
+    gmx_mm_update_1pot_pd(vctot,vc+ggid);
+    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+    gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
+    
        }
-       
+  
        *outeriter   = nri;            
-    *inneriter   = nj1;        
+  *inneriter   = nj1;  
 }
-
-
-
-/*
- * Gromacs nonbonded kernel nb_kernel410nf
- * Coulomb interaction:     Generalized-Born
- * VdW interaction:         Lennard-Jones
- * water optimization:      No
- * Calculate forces:        no
- */
-void nb_kernel410nf_sse2_double(
-                    int *           p_nri,
-                    int *           iinr,
-                    int *           jindex,
-                    int *           jjnr,
-                    int *           shift,
-                    double *         shiftvec,
-                    double *         fshift,
-                    int *           gid,
-                    double *         pos,
-                    double *         faction,
-                    double *         charge,
-                    double *         p_facel,
-                    double *         p_krf,
-                    double *         p_crf,
-                    double *         Vc,
-                    int *           type,
-                    int *           p_ntype,
-                    double *         vdwparam,
-                    double *         Vvdw,
-                    double *         p_tabscale,
-                    double *         VFtab,
-                    double *         invsqrta,
-                    double *         dvda,
-                    double *         p_gbtabscale,
-                    double *         GBtab,
-                    int *           p_nthreads,
-                    int *           count,
-                    void *          mtx,
-                    int *           outeriter,
-                    int *           inneriter,
-                    double *         work)
-{
-    int           nri,ntype,nthreads;
-    double         facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
-    int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
-    double         shX,shY,shZ;
-    double         rinvsq;
-    double         iq;
-    double         qq,vcoul,vctot;
-    int           nti;
-    int           tj;
-    double         rinvsix;
-    double         Vvdw6,Vvdwtot;
-    double         Vvdw12;
-    double         r,rt,eps,eps2;
-    int           n0,nnn;
-    double         Y,F,Geps,Heps2,Fp,VV;
-    double         isai,isaj,isaprod,gbscale;
-    double         ix1,iy1,iz1;
-    double         jx1,jy1,jz1;
-    double         dx11,dy11,dz11,rsq11,rinv11;
-    double         c6,c12;
-    const int     fractshift = 12;
-    const int     fractmask = 8388607;
-    const int     expshift = 23;
-    const int     expmask = 2139095040;
-    const int     explsb = 8388608;
-    double         lu;
-    int           iexp,addr;
-    union { unsigned int bval; double fval; } bitpattern,result;
-
-    nri              = *p_nri;         
-    ntype            = *p_ntype;       
-    nthreads         = *p_nthreads;    
-    facel            = *p_facel;       
-    krf              = *p_krf;         
-    crf              = *p_crf;         
-    tabscale         = *p_tabscale;    
-    gbtabscale       = *p_gbtabscale;  
-    nj1              = 0;              
-
-    for(n=0; (n<nri); n++)
-    {
-        is3              = 3*shift[n];     
-        shX              = shiftvec[is3];  
-        shY              = shiftvec[is3+1];
-        shZ              = shiftvec[is3+2];
-        nj0              = jindex[n];      
-        nj1              = jindex[n+1];    
-        ii               = iinr[n];        
-        ii3              = 3*ii;           
-        ix1              = shX + pos[ii3+0];
-        iy1              = shY + pos[ii3+1];
-        iz1              = shZ + pos[ii3+2];
-        iq               = facel*charge[ii];
-        isai             = invsqrta[ii];   
-        nti              = 2*ntype*type[ii];
-        vctot            = 0;              
-        Vvdwtot          = 0;              
-        
-        for(k=nj0; (k<nj1); k++)
-        {
-            jnr              = jjnr[k];        
-            j3               = 3*jnr;          
-            jx1              = pos[j3+0];      
-            jy1              = pos[j3+1];      
-            jz1              = pos[j3+2];      
-            dx11             = ix1 - jx1;      
-            dy11             = iy1 - jy1;      
-            dz11             = iz1 - jz1;      
-            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
-            bitpattern.fval  = rsq11;          
-            iexp             = (((bitpattern.bval)&expmask)>>expshift);
-            addr             = (((bitpattern.bval)&(fractmask|explsb))>>fractshift);
-            result.bval      = gmx_invsqrt_exptab[iexp] | gmx_invsqrt_fracttab[addr];
-            lu               = result.fval;    
-            rinv11           = (0.5*lu*(3.0-((rsq11*lu)*lu)));
-            isaj             = invsqrta[jnr];  
-            isaprod          = isai*isaj;      
-            qq               = iq*charge[jnr]; 
-            vcoul            = qq*rinv11;      
-            qq               = isaprod*(-qq);  
-            gbscale          = isaprod*gbtabscale;
-            tj               = nti+2*type[jnr];
-            c6               = vdwparam[tj];   
-            c12              = vdwparam[tj+1]; 
-            rinvsq           = rinv11*rinv11;  
-            r                = rsq11*rinv11;   
-            rt               = r*gbscale;      
-            n0               = rt;             
-            eps              = rt-n0;          
-            eps2             = eps*eps;        
-            nnn              = 4*n0;           
-            Y                = GBtab[nnn];     
-            F                = GBtab[nnn+1];   
-            Geps             = eps*GBtab[nnn+2];
-            Heps2            = eps2*GBtab[nnn+3];
-            Fp               = F+Geps+Heps2;   
-            VV               = Y+eps*Fp;       
-            vgb              = qq*VV;          
-            vctot            = vctot + vcoul;  
-            rinvsix          = rinvsq*rinvsq*rinvsq;
-            Vvdw6            = c6*rinvsix;     
-            Vvdw12           = c12*rinvsix*rinvsix;
-            Vvdwtot          = Vvdwtot+Vvdw12-Vvdw6;
-        }
-        
-        ggid             = gid[n];         
-        Vc[ggid]         = Vc[ggid] + vctot;
-        Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
-    }
-    
-    *outeriter       = nri;            
-    *inneriter       = nj1;            
-}
-
-
index 2532a8576650ce79d794a1e51784a8489388a831..2df2ed04c32857d9fe973dabe5cfdb941c071fce 100644 (file)
@@ -18,7 +18,6 @@
 #include<math.h>
 #include<vec.h>
 
-
 #include <xmmintrin.h>
 #include <emmintrin.h>
 
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
 
-
+#include "nb_kernel430_x86_64_sse2.h"
 
 void nb_kernel430_sse2_double(int *           p_nri,
-                                                       int *           iinr,
-                                                       int *           jindex,
-                                                       int *           jjnr,
-                                                       int *           shift,
-                                                       double *         shiftvec,
-                                                       double *         fshift,
-                                                       int *           gid,
-                                                       double *         pos,
-                                                       double *         faction,
-                                                       double *         charge,
-                                                       double *         p_facel,
-                                                       double *         p_krf,
-                                                       double *         p_crf,
-                                                       double *         Vc,
-                                                       int *           type,
-                                                       int *           p_ntype,
-                                                       double *         vdwparam,
-                                                       double *         Vvdw,
-                                                       double *         p_tabscale,
-                                                       double *         VFtab,
-                                                       double *         invsqrta,
-                                                       double *         dvda,
-                                                       double *         p_gbtabscale,
-                                                       double *         GBtab,
-                                                       int *           p_nthreads,
-                                                       int *           count,
-                                                       void *          mtx,
-                                                       int *           outeriter,
-                                                       int *           inneriter,
-                                                       double *         work)
+                              int *           iinr,
+                              int *           jindex,
+                              int *           jjnr,
+                              int *           shift,
+                              double *         shiftvec,
+                              double *         fshift,
+                              int *           gid,
+                              double *         pos,
+                              double *         faction,
+                              double *         charge,
+                              double *         p_facel,
+                              double *         p_krf,
+                              double *         p_crf,
+                              double *         vc,
+                              int *           type,
+                              int *           p_ntype,
+                              double *         vdwparam,
+                              double *         vvdw,
+                              double *         p_tabscale,
+                              double *         VFtab,
+                              double *         invsqrta,
+                              double *         dvda,
+                              double *         p_gbtabscale,
+                              double *         GBtab,
+                              int *           p_nthreads,
+                              int *           count,
+                              void *          mtx,
+                              int *           outeriter,
+                              int *           inneriter,
+                              double *         work)
 {
-       int           nri,ntype,nthreads,offset,tj,tj2,nti;
-       int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
-       double        facel,krf,crf,tabscl,gbtabscl,vct,vdwt,vgbt,nt1,nt2;
-       double        shX,shY,shZ,isai_d,dva;
+  int           nri,ntype,nthreads;
+  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+  double        shX,shY,shZ;
+       int                       offset,nti;
+  int           jnrA,jnrB;
+  int           j3A,j3B;
+       int           tjA,tjB;
        gmx_gbdata_t *gbdata;
-       float *        gpol;
-
-       __m128d       ix,iy,iz,jx,jy,jz;
-       __m128d           dx,dy,dz,t1,t2,t3;
-       __m128d           fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
-       __m128d           q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
-       __m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp,dvdasum,vctot,n0d;
-       __m128d           xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
-       __m128d       c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,vgbtot,rinvsq,rinvsix;
-       __m128d       fac,tabscale,gbtabscale;
-       __m128i       n0,nnn;
-       
-       const __m128d neg    = {-1.0,-1.0};
-       const __m128d zero   = {0.0,0.0};
-       const __m128d half   = {0.5,0.5};
-       const __m128d two    = {2.0,2.0};
-       const __m128d three  = {3.0,3.0};
-       const __m128d six    = {6.0,6.0};
-       const __m128d twelwe = {12.0,12.0};
+       double *      gpol;
+    
+       __m128d  iq,qq,jq,isai;
+       __m128d  ix,iy,iz;
+       __m128d  jx,jy,jz;
+       __m128d  dx,dy,dz;
+       __m128d  vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
+       __m128d  fix,fiy,fiz,tx,ty,tz,rsq;
+       __m128d  rinv,isaj,isaprod;
+       __m128d  vcoul,fscal,gbscale,c6,c12;
+       __m128d  rinvsq,r,rtab;
+       __m128d  eps,Y,F,G,H;
+  __m128d  VV,FF,Fp;
+       __m128d  vgb,fijGB,dvdatmp;
+       __m128d  rinvsix,vvdw6,vvdw12,vvdwtmp;
+       __m128d  facel,gbtabscale,dvdaj;
+  __m128d  fijD,fijR;
+  __m128d  xmm1,tabscale,eps2;
+       __m128i  n0, nnn;
+    
        
-       const __m128i four   = _mm_set_epi32(4,4,4,4);
+       const __m128d neg        = _mm_set1_pd(-1.0);
+       const __m128d zero       = _mm_set1_pd(0.0);
+       const __m128d minushalf  = _mm_set1_pd(-0.5);
+       const __m128d two        = _mm_set1_pd(2.0);
        
        gbdata     = (gmx_gbdata_t *)work;
        gpol       = gbdata->gpol;
-       
+    
        nri        = *p_nri;
        ntype      = *p_ntype;
-       nthreads   = *p_nthreads; 
-    facel      = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));       
-       krf        = *p_krf;
-       crf        = *p_crf;
-       tabscl     = *p_tabscale;
-       gbtabscl   = *p_gbtabscale;
-       nj1        = 0;
-       
-       /* Splat variables */
-       fac        = _mm_load1_pd(&facel);
-       tabscale   = _mm_load1_pd(&tabscl);
-       gbtabscale = _mm_load1_pd(&gbtabscl);
-       
-       /* Keep compiler happy */
-       Vvdwtmp = _mm_setzero_pd();
-       Vvdwtot = _mm_setzero_pd();
-       dvdatmp = _mm_setzero_pd();
-       dvdaj   = _mm_setzero_pd();
-       isaj    = _mm_setzero_pd();
-       vcoul   = _mm_setzero_pd();
-       vgb     = _mm_setzero_pd();
-       t1      = _mm_setzero_pd();
-       t2      = _mm_setzero_pd();
-       t3      = _mm_setzero_pd();
-       xmm1    = _mm_setzero_pd();
-       xmm2    = _mm_setzero_pd();
-       xmm3    = _mm_setzero_pd();
-       xmm4    = _mm_setzero_pd();
-       jnr1    = jnr2 = 0;
-       j13     = j23  = 0;
+    
+  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+  gbtabscale = _mm_load1_pd(p_gbtabscale);  
+  facel      = _mm_load1_pd(p_facel);
+  tabscale   = _mm_load1_pd(p_tabscale);
+  
+  nj1         = 0;
+  jnrA = jnrB = 0;
+  j3A = j3B   = 0;
+  jx          = _mm_setzero_pd();
+  jy          = _mm_setzero_pd();
+  jz          = _mm_setzero_pd();
+  c6          = _mm_setzero_pd();
+  c12         = _mm_setzero_pd();
        
        for(n=0;n<nri;n++)
        {
-               is3     = 3*shift[n];
-               shX     = shiftvec[is3];
-               shY     = shiftvec[is3+1];
-               shZ     = shiftvec[is3+2];
-               
-               nj0     = jindex[n];      
-        nj1     = jindex[n+1];  
-               offset  = (nj1-nj0)%2;
-               
-               ii      = iinr[n];
-               ii3     = ii*3;
-               
-               ix      = _mm_set1_pd(shX+pos[ii3+0]);
-               iy      = _mm_set1_pd(shX+pos[ii3+1]);
-               iz      = _mm_set1_pd(shX+pos[ii3+2]); 
-               q       = _mm_set1_pd(charge[ii]);
-               
-               iq      = _mm_mul_pd(fac,q); 
-               isai_d  = invsqrta[ii];
-               isai    = _mm_load1_pd(&isai_d);
+    is3              = 3*shift[n];     
+    shX              = shiftvec[is3];  
+    shY              = shiftvec[is3+1];
+    shZ              = shiftvec[is3+2];
+    nj0              = jindex[n];      
+    nj1              = jindex[n+1];    
+    ii               = iinr[n];        
+    ii3              = 3*ii;           
                
-               nti      = 2*ntype*type[ii];
-               
-               fix     = _mm_setzero_pd();
-               fiy     = _mm_setzero_pd();
-               fiz     = _mm_setzero_pd();
-               dvdasum = _mm_setzero_pd();
-               vctot   = _mm_setzero_pd();
-               vgbtot  = _mm_setzero_pd();
-               Vvdwtot = _mm_setzero_pd();
+               ix               = _mm_set1_pd(shX+pos[ii3+0]);
+               iy               = _mm_set1_pd(shY+pos[ii3+1]);
+               iz               = _mm_set1_pd(shZ+pos[ii3+2]);
+    
+               iq               = _mm_load1_pd(charge+ii);
+               iq               = _mm_mul_pd(iq,facel);
+    
+               isai             = _mm_load1_pd(invsqrta+ii);
+    
+               nti              = 2*ntype*type[ii];
                
-               for(k=nj0;k<nj1-offset; k+=2)
+               vctot            = _mm_setzero_pd();
+               vvdwtot          = _mm_setzero_pd();
+               vgbtot           = _mm_setzero_pd();
+               dvdasum          = _mm_setzero_pd();
+               fix              = _mm_setzero_pd();
+               fiy              = _mm_setzero_pd();
+               fiz              = _mm_setzero_pd();
+        
+               for(k=nj0;k<nj1-1; k+=2)
                {
-                       jnr1    = jjnr[k];
-                       jnr2    = jjnr[k+1];
-                       
-                       j13     = jnr1 * 3;
-                       j23     = jnr2 * 3;
-                       
-                       /* Load coordinates */
-                       xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
-                       xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */
-                       
-                       xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
-                       xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */
-                       
-                       /* transpose */
-                       jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); 
-                       
-                       /* distances */
-                       dx      = _mm_sub_pd(ix,jx);
-                       dy              = _mm_sub_pd(iy,jy);
-                       dz              = _mm_sub_pd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
-                       
-                       /* Load invsqrta */
-                       isaj    = _mm_loadl_pd(isaj,invsqrta+jnr1);
-                       isaj    = _mm_loadh_pd(isaj,invsqrta+jnr2);
-                       isaprod = _mm_mul_pd(isai,isaj);
-                       
-                       /* Load charges */
-                       q               = _mm_loadl_pd(q,charge+jnr1);
-                       q               = _mm_loadh_pd(q,charge+jnr2);
-                       qq              = _mm_mul_pd(iq,q);
-                       
-                       vcoul   = _mm_mul_pd(qq,rinv);
-                       fscal   = _mm_mul_pd(vcoul,rinv);
-                       qq              = _mm_mul_pd(isaprod,qq);
-                       qq              = _mm_mul_pd(qq,neg);
-                       gbscale = _mm_mul_pd(isaprod,gbtabscale);
-                       
-                       /* Load VdW parameters */
-                       tj      = nti+2*type[jnr1];
-                       tj2     = nti+2*type[jnr2];
-                       
-                       xmm1      = _mm_loadu_pd(vdwparam+tj);
-                       xmm2     = _mm_loadu_pd(vdwparam+tj2);
-                       c6      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
-                       c12     = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
-                       
-                       /* Load dvdaj */
-                       dvdaj   = _mm_loadl_pd(dvdaj, dvda+jnr1);
-                       dvdaj   = _mm_loadh_pd(dvdaj, dvda+jnr2);
-                       
-                       /* Calculate GB table index */
-                       r               = _mm_mul_pd(rsq11,rinv);
-                       rt              = _mm_mul_pd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_pd(rt,n0d);
-                       eps2    = _mm_mul_pd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G               = _mm_mul_pd(G,eps);
-                       H               = _mm_mul_pd(H,eps2);
-                       Fp              = _mm_add_pd(F,G);
-                       Fp              = _mm_add_pd(Fp,H);
-                       VV              = _mm_mul_pd(Fp,eps);
-                       VV              = _mm_add_pd(Y,VV);
-                       H               = _mm_mul_pd(two,H);
-                       FF              = _mm_add_pd(Fp,G);
-                       FF              = _mm_add_pd(FF,H);
-                       vgb             = _mm_mul_pd(qq,VV);
-                       fijC    = _mm_mul_pd(qq,FF);
-                       fijC    = _mm_mul_pd(fijC,gbscale);
-                       
-                       dvdatmp = _mm_mul_pd(fijC,r);
-                       dvdatmp = _mm_add_pd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_pd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_pd(dvdatmp,half);
-                       dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_pd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_pd(xmm1,isaj);
-                       dvdaj   = _mm_add_pd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       _mm_storeh_pd(dvda+jnr2,dvdaj);
-                       
-                       vctot   = _mm_add_pd(vctot,vcoul);
-                       vgbtot  = _mm_add_pd(vgbtot,vgb);
-                       
-                       /* Calculate VDW table index */
-                       rt      = _mm_mul_pd(r,tabscale);
-                       n0      = _mm_cvttpd_epi32(rt);
-                       n0d     = _mm_cvtepi32_pd(n0);
-                       eps     = _mm_sub_pd(rt,n0d);
+                       jnrA    = jjnr[k];
+                       jnrB    = jjnr[k+1];
+                       
+                       j3A     = jnrA * 3;
+                       j3B     = jnrB * 3;
+            
+      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+            
+                       dx           = _mm_sub_pd(ix,jx);
+                       dy           = _mm_sub_pd(iy,jy);
+                       dz           = _mm_sub_pd(iz,jz);
+            
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_pd(rinv,rinv);
+      
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+                       GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+            
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
+                       tjB          = nti+2*type[jnrB];
+      
+      GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+                       
+                       isaprod      = _mm_mul_pd(isai,isaj);
+                       qq           = _mm_mul_pd(iq,jq);            
+                       vcoul        = _mm_mul_pd(qq,rinv);
+                       fscal        = _mm_mul_pd(vcoul,rinv);                                 
+      vctot        = _mm_add_pd(vctot,vcoul);
+      
+      /* Polarization interaction */
+                       qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_pd(isaprod,gbtabscale);
+      
+                       /* Calculate GB table index */
+                       r            = _mm_mul_pd(rsq,rinv);
+                       rtab         = _mm_mul_pd(r,gbscale);
+                       
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
+      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+      vgb     = _mm_mul_pd(Y, qq);           
+      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb);
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+                       
+      /* Calculate VDW table index */
+                       rtab    = _mm_mul_pd(r,tabscale);
+                       n0      = _mm_cvttpd_epi32(rtab);
+                       eps     = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
                        eps2    = _mm_mul_pd(eps,eps);
                        nnn     = _mm_slli_epi32(n0,3);
                        
-                       /* Tabulated VdW interaction - dispersion */
-                       xmm1    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G       = _mm_mul_pd(G,eps);
+      /* Dispersion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
                        H       = _mm_mul_pd(H,eps2);
                        Fp      = _mm_add_pd(F,G);
                        Fp      = _mm_add_pd(Fp,H);
@@ -293,23 +243,18 @@ void nb_kernel430_sse2_double(int *           p_nri,
                        FF      = _mm_add_pd(Fp,G);
                        FF      = _mm_add_pd(FF,xmm1);
                        
-                       Vvdw6   = _mm_mul_pd(c6,VV);
+                       vvdw6   = _mm_mul_pd(c6,VV);
                        fijD    = _mm_mul_pd(c6,FF);
-                       
-                       /* Tabulated VdW interaction - repulsion */
-                       nnn     = _mm_add_epi32(nnn,four);
-                       
-                       xmm1    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G       = _mm_mul_pd(G,eps);
+      
+      /* Repulsion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
+      F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
+      H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
                        H       = _mm_mul_pd(H,eps2);
                        Fp      = _mm_add_pd(F,G);
                        Fp      = _mm_add_pd(Fp,H);
@@ -319,162 +264,138 @@ void nb_kernel430_sse2_double(int *           p_nri,
                        FF      = _mm_add_pd(Fp,G);
                        FF      = _mm_add_pd(FF,xmm1);
                        
-                       Vvdw12  = _mm_mul_pd(c12,VV);
+                       vvdw12  = _mm_mul_pd(c12,VV);
                        fijR    = _mm_mul_pd(c12,FF);
                        
-                       Vvdwtmp = _mm_add_pd(Vvdw12,Vvdw6);
-                       Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
-                       
+                       vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
+                       vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
+      
                        xmm1    = _mm_add_pd(fijD,fijR);
                        xmm1    = _mm_mul_pd(xmm1,tabscale);
-                       xmm1    = _mm_add_pd(xmm1,fijC);
+                       xmm1    = _mm_add_pd(xmm1,fijGB);
                        xmm1    = _mm_sub_pd(xmm1,fscal);
                        fscal   = _mm_mul_pd(xmm1,neg);
                        fscal   = _mm_mul_pd(fscal,rinv);
-                       
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_pd(fscal,dx);
-                       t2              = _mm_mul_pd(fscal,dy);
-                       t3              = _mm_mul_pd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_pd(fix,t1);
-                       fiy             = _mm_add_pd(fiy,t2);
-                       fiz             = _mm_add_pd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm1    = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
-                       xmm2    = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-                       
-                       xmm5    = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
-                       xmm6    = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-                       
-                       /* transpose */
-                       xmm7    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
-                       xmm5    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
-                       xmm6    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_pd(xmm5,t1);
-                       xmm6    = _mm_sub_pd(xmm6,t2);
-                       xmm7    = _mm_sub_pd(xmm7,t3);
-                       
-                       xmm1    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
-                       xmm2    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* store fx and fy */
-                       _mm_storeu_pd(faction+j13,xmm1);
-                       _mm_storeu_pd(faction+j23,xmm2);
-                       
-                       /* .. then fz */
-                       _mm_storel_pd(faction+j13+2,xmm7);
-                       _mm_storel_pd(faction+j23+2,xmm7);
+      
+      /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_pd(fscal,dx);
+      ty           = _mm_mul_pd(fscal,dy);
+      tz           = _mm_mul_pd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_pd(fix,tx);
+      fiy          = _mm_add_pd(fiy,ty);
+      fiz          = _mm_add_pd(fiz,tz);
+      
+      /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
                
                /* In double precision, offset can only be either 0 or 1 */
-               if(offset!=0)
+               if(k<nj1)
                {
-                       jnr1    = jjnr[k];
-                       j13             = jnr1*3;
-                       
-                       jx      = _mm_load_sd(pos+j13);
-                       jy      = _mm_load_sd(pos+j13+1);
-                       jz      = _mm_load_sd(pos+j13+2);
-                       
-                       isaj    = _mm_load_sd(invsqrta+jnr1);
-                       isaprod = _mm_mul_sd(isai,isaj);
-                       dvdaj   = _mm_load_sd(dvda+jnr1);
-                       q               = _mm_load_sd(charge+jnr1);
-                       qq      = _mm_mul_sd(iq,q);
-                       
-                       dx      = _mm_sub_sd(ix,jx);
-                       dy              = _mm_sub_sd(iy,jy);
-                       dz              = _mm_sub_sd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
-                       
-                       vcoul   = _mm_mul_sd(qq,rinv);
-                       fscal   = _mm_mul_sd(vcoul,rinv);
-                       qq              = _mm_mul_sd(isaprod,qq);
-                       qq              = _mm_mul_sd(qq,neg);
-                       gbscale = _mm_mul_sd(isaprod,gbtabscale);
-                       
-                       /* Load VdW parameters */
-                       tj      = nti+2*type[jnr1];
-                       
-                       c6      = _mm_load_sd(vdwparam+tj);
-                       c12     = _mm_load_sd(vdwparam+tj+1);
-                       
-                       /* Calculate GB table index */
-                       r               = _mm_mul_sd(rsq11,rinv);
-                       rt              = _mm_mul_sd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_sd(rt,n0d);
-                       eps2    = _mm_mul_sd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); 
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); 
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); 
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); 
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); 
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); 
-                       
-                       G               = _mm_mul_sd(G,eps);
-                       H               = _mm_mul_sd(H,eps2);
-                       Fp              = _mm_add_sd(F,G);
-                       Fp              = _mm_add_sd(Fp,H);
-                       VV              = _mm_mul_sd(Fp,eps);
-                       VV              = _mm_add_sd(Y,VV);
-                       H               = _mm_mul_sd(two,H);
-                       FF              = _mm_add_sd(Fp,G);
-                       FF              = _mm_add_sd(FF,H);
-                       vgb             = _mm_mul_sd(qq,VV);
-                       fijC    = _mm_mul_sd(qq,FF);
-                       fijC    = _mm_mul_sd(fijC,gbscale);
-                       
-                       dvdatmp = _mm_mul_sd(fijC,r);
-                       dvdatmp = _mm_add_sd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_sd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_sd(dvdatmp,half);
-                       dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_sd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_sd(xmm1,isaj);
-                       dvdaj   = _mm_add_sd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       
-                       vctot   = _mm_add_sd(vctot,vcoul);
-                       vgbtot  = _mm_add_sd(vgbtot,vgb);
-                       
-                       /* Calculate VDW table index */
-                       rt      = _mm_mul_sd(r,tabscale);
-                       n0      = _mm_cvttpd_epi32(rt);
-                       n0d     = _mm_cvtepi32_pd(n0);
-                       eps     = _mm_sub_sd(rt,n0d);
+                       jnrA    = jjnr[k];
+                       j3A     = jnrA * 3;
+      
+      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+      
+                       dx           = _mm_sub_sd(ix,jx);
+                       dy           = _mm_sub_sd(iy,jy);
+                       dz           = _mm_sub_sd(iz,jz);
+            
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_sd(rinv,rinv);
+      
+      /* The reason for zeroing these variables here is to fix bug 585.
+       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+       * and r1=0, but it should be r1=a[1].
+       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+       * To work around it, we zero these variables and use _mm_add_pd (**) instead.
+       * Note that the only variables that get affected are the energies, since
+       * the total sum needs to be correct.
+       */
+      vgb          = _mm_setzero_pd();
+      vcoul        = _mm_setzero_pd();
+      dvdatmp      = _mm_setzero_pd();
+      vvdw6        = _mm_setzero_pd();
+      vvdw12       = _mm_setzero_pd();
+
+      /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+                       GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+            
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
+      
+      GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+                       
+                       isaprod      = _mm_mul_sd(isai,isaj);
+                       qq           = _mm_mul_sd(jq,iq);            
+                       vcoul        = _mm_mul_sd(qq,rinv);
+                       fscal        = _mm_mul_sd(vcoul,rinv);                                 
+      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
+      
+      /* Polarization interaction */
+                       qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_sd(isaprod,gbtabscale);
+      
+                       /* Calculate GB table index */
+                       r            = _mm_mul_sd(rsq,rinv);
+                       rtab         = _mm_mul_sd(r,gbscale);
+                       
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
+      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+      vgb     = _mm_mul_sd(Y, qq);           
+      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+                       
+      /* Calculate VDW table index */
+                       rtab    = _mm_mul_sd(r,tabscale);
+                       n0      = _mm_cvttpd_epi32(rtab);
+                       eps     = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
                        eps2    = _mm_mul_sd(eps,eps);
                        nnn     = _mm_slli_epi32(n0,3);
                        
-                       /* Tabulated VdW interaction - dispersion */
-                       xmm1    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G       = _mm_mul_sd(G,eps);
+      /* Dispersion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
                        H       = _mm_mul_sd(H,eps2);
                        Fp      = _mm_add_sd(F,G);
                        Fp      = _mm_add_sd(Fp,H);
@@ -484,23 +405,18 @@ void nb_kernel430_sse2_double(int *           p_nri,
                        FF      = _mm_add_sd(Fp,G);
                        FF      = _mm_add_sd(FF,xmm1);
                        
-                       Vvdw6   = _mm_mul_sd(c6,VV);
+                       vvdw6   = _mm_mul_sd(c6,VV);
                        fijD    = _mm_mul_sd(c6,FF);
-                       
-                       /* Tabulated VdW interaction - repulsion */
-                       nnn     = _mm_add_epi32(nnn,four);
-                       
-                       xmm1    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G       = _mm_mul_sd(G,eps);
+      
+      /* Repulsion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
                        H       = _mm_mul_sd(H,eps2);
                        Fp      = _mm_add_sd(F,G);
                        Fp      = _mm_add_sd(Fp,H);
@@ -510,261 +426,50 @@ void nb_kernel430_sse2_double(int *           p_nri,
                        FF      = _mm_add_sd(Fp,G);
                        FF      = _mm_add_sd(FF,xmm1);
                        
-                       Vvdw12  = _mm_mul_sd(c12,VV);
+                       vvdw12  = _mm_mul_sd(c12,VV);
                        fijR    = _mm_mul_sd(c12,FF);
                        
-                       Vvdwtmp = _mm_add_sd(Vvdw12,Vvdw6);
-                       Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);
-                       
+                       vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
+                       vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); /* (**) */
+            
                        xmm1    = _mm_add_sd(fijD,fijR);
                        xmm1    = _mm_mul_sd(xmm1,tabscale);
-                       xmm1    = _mm_add_sd(xmm1,fijC);
+                       xmm1    = _mm_add_sd(xmm1,fijGB);
                        xmm1    = _mm_sub_sd(xmm1,fscal);
                        fscal   = _mm_mul_sd(xmm1,neg);
                        fscal   = _mm_mul_sd(fscal,rinv);
-                       
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_sd(fscal,dx);
-                       t2              = _mm_mul_sd(fscal,dy);
-                       t3              = _mm_mul_sd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_sd(fix,t1);
-                       fiy             = _mm_add_sd(fiy,t2);
-                       fiz             = _mm_add_sd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm5    = _mm_load_sd(faction+j13);   /* fx */
-                       xmm6    = _mm_load_sd(faction+j13+1); /* fy */
-                       xmm7    = _mm_load_sd(faction+j13+2); /* fz */
-                       
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_sd(xmm5,t1);
-                       xmm6    = _mm_sub_sd(xmm6,t2);
-                       xmm7    = _mm_sub_sd(xmm7,t3);
-                       
-                       /* store forces */
-                       _mm_store_sd(faction+j13,xmm5);
-                       _mm_store_sd(faction+j13+1,xmm6);
-                       _mm_store_sd(faction+j13+2,xmm7);
+
+      /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_sd(fscal,dx);
+      ty           = _mm_mul_sd(fscal,dy);
+      tz           = _mm_mul_sd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_sd(fix,tx);
+      fiy          = _mm_add_sd(fiy,ty);
+      fiz          = _mm_add_sd(fiz,tz);
+      
+      /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-               /* fix/fiy/fiz now contain four partial terms, that all should be
-                * added to the i particle forces
-                */
-               t1               = _mm_unpacklo_pd(t1,fix);
-               t2               = _mm_unpacklo_pd(t2,fiy);
-               t3               = _mm_unpacklo_pd(t3,fiz);
-               
-               fix              = _mm_add_pd(fix,t1);
-               fiy              = _mm_add_pd(fiy,t2);
-               fiz              = _mm_add_pd(fiz,t3);
-               
-               fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
-               fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
-               fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-               
-               /* Load i forces from memory */
-               xmm1     = _mm_load_sd(faction+ii3);
-               xmm2     = _mm_load_sd(faction+ii3+1);
-               xmm3     = _mm_load_sd(faction+ii3+2);
-               
-               /* Add to i force */
-               fix      = _mm_add_sd(fix,xmm1);
-               fiy      = _mm_add_sd(fiy,xmm2);
-               fiz      = _mm_add_sd(fiz,xmm3);
-               
-               /* store i forces to memory */
-               _mm_store_sd(faction+ii3,fix);
-               _mm_store_sd(faction+ii3+1,fiy);
-               _mm_store_sd(faction+ii3+2,fiz);
-               
-               /* now do dvda */
-               dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
-               dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
-               _mm_storeh_pd(&dva,dvdasum);
-               dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-               
-               ggid     = gid[n];
-               
-               /* Coulomb potential */
-               vcoul    = _mm_unpacklo_pd(vcoul,vctot);
-               vctot    = _mm_add_pd(vctot,vcoul);
-               _mm_storeh_pd(&vct,vctot);
-               Vc[ggid] = Vc[ggid] + vct;
-               
-               /* VdW potential */
-               Vvdwtmp  = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
-               Vvdwtot  = _mm_add_pd(Vvdwtot,Vvdwtmp);
-               _mm_storeh_pd(&vdwt,Vvdwtot);
-               Vvdw[ggid] = Vvdw[ggid] + vdwt;
-               
-               /* GB potential */
-               vgb      = _mm_unpacklo_pd(vgb,vgbtot);
-               vgbtot   = _mm_add_pd(vgbtot,vgb);
-               _mm_storeh_pd(&vgbt,vgbtot);
-               gpol[ggid] = gpol[ggid] + vgbt;
-       }
-       
-       *outeriter   = nri;            
-    *inneriter   = nj1;        
-}
-
-
-/*
- * Gromacs nonbonded kernel nb_kernel430nf
- * Coulomb interaction:     Generalized-Born
- * VdW interaction:         Tabulated
- * water optimization:      No
- * Calculate forces:        no
- */
-void nb_kernel430nf_sse2_double(
-                    int *           p_nri,
-                    int *           iinr,
-                    int *           jindex,
-                    int *           jjnr,
-                    int *           shift,
-                    double *         shiftvec,
-                    double *         fshift,
-                    int *           gid,
-                    double *         pos,
-                    double *         faction,
-                    double *         charge,
-                    double *         p_facel,
-                    double *         p_krf,
-                    double *         p_crf,
-                    double *         Vc,
-                    int *           type,
-                    int *           p_ntype,
-                    double *         vdwparam,
-                    double *         Vvdw,
-                    double *         p_tabscale,
-                    double *         VFtab,
-                    double *         invsqrta,
-                    double *         dvda,
-                    double *         p_gbtabscale,
-                    double *         GBtab,
-                    int *           p_nthreads,
-                    int *           count,
-                    void *          mtx,
-                    int *           outeriter,
-                    int *           inneriter,
-                    double *         work)
-{
-    int           nri,ntype,nthreads;
-    double         facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
-    int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
-    double         shX,shY,shZ;
-    double         iq;
-    double         qq,vcoul,vctot;
-    int           nti;
-    int           tj;
-    double         Vvdw6,Vvdwtot;
-    double         Vvdw12;
-    double         r,rt,eps,eps2;
-    int           n0,nnn;
-    double         Y,F,Geps,Heps2,Fp,VV;
-    double         isai,isaj,isaprod,gbscale;
-    double         ix1,iy1,iz1;
-    double         jx1,jy1,jz1;
-    double         dx11,dy11,dz11,rsq11,rinv11;
-    double         c6,c12;
-
-    nri              = *p_nri;         
-    ntype            = *p_ntype;       
-    nthreads         = *p_nthreads;    
-    facel            = *p_facel;       
-    krf              = *p_krf;         
-    crf              = *p_crf;         
-    tabscale         = *p_tabscale;    
-    gbtabscale       = *p_gbtabscale;  
-    nj1              = 0;              
+    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
     
-    for(n=0; (n<nri); n++)
-    {
-        is3              = 3*shift[n];     
-        shX              = shiftvec[is3];  
-        shY              = shiftvec[is3+1];
-        shZ              = shiftvec[is3+2];
-        nj0              = jindex[n];      
-        nj1              = jindex[n+1];    
-        ii               = iinr[n];        
-        ii3              = 3*ii;           
-        ix1              = shX + pos[ii3+0];
-        iy1              = shY + pos[ii3+1];
-        iz1              = shZ + pos[ii3+2];
-        iq               = facel*charge[ii];
-        isai             = invsqrta[ii];   
-        nti              = 2*ntype*type[ii];
-        vctot            = 0;              
-        Vvdwtot          = 0;              
-        
-        for(k=nj0; (k<nj1); k++)
-        {
-            jnr              = jjnr[k];        
-            j3               = 3*jnr;          
-            jx1              = pos[j3+0];      
-            jy1              = pos[j3+1];      
-            jz1              = pos[j3+2];      
-            dx11             = ix1 - jx1;      
-            dy11             = iy1 - jy1;      
-            dz11             = iz1 - jz1;      
-            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
-            rinv11           = gmx_invsqrt(rsq11);
-            isaj             = invsqrta[jnr];  
-            isaprod          = isai*isaj;      
-            qq               = iq*charge[jnr]; 
-            vcoul            = qq*rinv11;      
-            qq               = isaprod*(-qq);  
-            gbscale          = isaprod*gbtabscale;
-            tj               = nti+2*type[jnr];
-            c6               = vdwparam[tj];   
-            c12              = vdwparam[tj+1]; 
-            r                = rsq11*rinv11;   
-            rt               = r*gbscale;      
-            n0               = rt;             
-            eps              = rt-n0;          
-            eps2             = eps*eps;        
-            nnn              = 4*n0;           
-            Y                = GBtab[nnn];     
-            F                = GBtab[nnn+1];   
-            Geps             = eps*GBtab[nnn+2];
-            Heps2            = eps2*GBtab[nnn+3];
-            Fp               = F+Geps+Heps2;   
-            VV               = Y+eps*Fp;       
-            vgb              = qq*VV;          
-            vctot            = vctot + vcoul;  
-            r                = rsq11*rinv11;   
-            rt               = r*tabscale;     
-            n0               = rt;             
-            eps              = rt-n0;          
-            eps2             = eps*eps;        
-            nnn              = 8*n0;           
-            Y                = VFtab[nnn];     
-            F                = VFtab[nnn+1];   
-            Geps             = eps*VFtab[nnn+2];
-            Heps2            = eps2*VFtab[nnn+3];
-            Fp               = F+Geps+Heps2;   
-            VV               = Y+eps*Fp;       
-            Vvdw6            = c6*VV;          
-            nnn              = nnn+4;          
-            Y                = VFtab[nnn];     
-            F                = VFtab[nnn+1];   
-            Geps             = eps*VFtab[nnn+2];
-            Heps2            = eps2*VFtab[nnn+3];
-            Fp               = F+Geps+Heps2;   
-            VV               = Y+eps*Fp;       
-            Vvdw12           = c12*VV;         
-            Vvdwtot          = Vvdwtot+ Vvdw6 + Vvdw12;
-        }
-        
-        ggid             = gid[n];         
-        Vc[ggid]         = Vc[ggid] + vctot;
-        Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
-    }
+    ggid     = gid[n];         
+    
+    gmx_mm_update_1pot_pd(vctot,vc+ggid);
+    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+    gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
     
-    *outeriter       = nri;            
-    *inneriter       = nj1;            
+       }
+  
+       *outeriter   = nri;            
+  *inneriter   = nj1;  
 }
 
-
index 72b318e5da7623f657aaa3e456690538fabcf86c..26aa82f8d8354bbde19552ec3b4a86374d504aa6 100644 (file)
@@ -26,9 +26,6 @@
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
 
-#include "nb_kernel400_sse2_single.h"
-
-
 
 void nb_kernel400_sse2_single(int *           p_nri,
                     int *           iinr,
@@ -144,7 +141,7 @@ void nb_kernel400_sse2_single(int *           p_nri,
                fix              = _mm_setzero_ps();
                fiy              = _mm_setzero_ps();
                fiz              = _mm_setzero_ps();
-       
+
         for(k=nj0; k<nj1-7; k+=8)
                {
                        jnrA        = jjnr[k];   
@@ -213,7 +210,7 @@ void nb_kernel400_sse2_single(int *           p_nri,
                        rB           = _mm_mul_ps(rsqB,rinvB);
                        rtab         = _mm_mul_ps(r,gbscale);
                        rtabB        = _mm_mul_ps(rB,gbscaleB);
-                       
+
                        n0           = _mm_cvttps_epi32(rtab);
                        n0B          = _mm_cvttps_epi32(rtabB);
             eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
@@ -248,8 +245,7 @@ void nb_kernel400_sse2_single(int *           p_nri,
             FB      = _mm_add_ps(FB, _mm_add_ps(GB , _mm_mul_ps(HB,two)));
             vgbB    = _mm_mul_ps(YB, qqB);           
             fijGBB  = _mm_mul_ps(FB, _mm_mul_ps(qqB,gbscaleB));
-            
-            
+           
             dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
             dvdatmpB = _mm_mul_ps(_mm_add_ps(vgbB, _mm_mul_ps(fijGBB,rB)) , minushalf);
 
@@ -333,7 +329,7 @@ void nb_kernel400_sse2_single(int *           p_nri,
                        /* Calculate GB table index */
                        r            = _mm_mul_ps(rsq,rinv);
                        rtab         = _mm_mul_ps(r,gbscale);
-                       
+
                        n0           = _mm_cvttps_epi32(rtab);
             eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
                        nnn          = _mm_slli_epi32(n0,2);
@@ -352,7 +348,7 @@ void nb_kernel400_sse2_single(int *           p_nri,
             F       = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
             vgb     = _mm_mul_ps(Y, qq);           
             fijGB   = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
-                        
+   
             dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
             
             vgbtot  = _mm_add_ps(vgbtot, vgb);
@@ -448,7 +444,7 @@ void nb_kernel400_sse2_single(int *           p_nri,
                        /* Calculate GB table index */
                        r            = _mm_mul_ps(rsq,rinv);
                        rtab         = _mm_mul_ps(r,gbscale);
-                       
+
                        n0           = _mm_cvttps_epi32(rtab);
             eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
                        nnn          = _mm_slli_epi32(n0,2);
@@ -467,7 +463,7 @@ void nb_kernel400_sse2_single(int *           p_nri,
             F       = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
             vgb     = _mm_mul_ps(Y, qq);           
             fijGB   = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
-            
+
             dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);            
             vgbtot  = _mm_add_ps(vgbtot, vgb);
             
@@ -531,7 +527,7 @@ void nb_kernel400_sse2_single(int *           p_nri,
  * water optimization:      No
  * Calculate forces:        no
  */
-void nb_kernel400nf_sse2_single(
+void nb_kernel400nf_x86_64_sse(
                                int *           p_nri,
                                int *           iinr,
                                int *           jindex,
index b55fc2e0a6a75cad60f110dacf4cc50d0bd97f11..edeb677eb61538d69ca595390df212d6d1ba10bd 100644 (file)
@@ -26,9 +26,6 @@
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
 
-#include "nb_kernel410_sse2_single.h"
-
-
 
 void nb_kernel410_sse2_single(int *           p_nri,
                     int *           iinr,
@@ -101,12 +98,12 @@ void nb_kernel410_sse2_single(int *           p_nri,
        __m128i  n0, nnn;
        __m128i  n0B, nnnB;
        
-       const __m128 neg        = {-1.0f,-1.0f,-1.0f,-1.0f};
-       const __m128 zero       = {0.0f,0.0f,0.0f,0.0f};
-       const __m128 minushalf  = {-0.5f,-0.5f,-0.5f,-0.5f};
-       const __m128 two        = {2.0f,2.0f,2.0f,2.0f};
-       const __m128 six        = {6.0f,6.0f,6.0f,6.0f};
-       const __m128 twelve     = {12.0f,12.0f,12.0f,12.0f};  
+       const __m128 neg        = _mm_set1_ps(-1.0f);
+       const __m128 zero       = _mm_set1_ps(0.0f);
+       const __m128 minushalf  = _mm_set1_ps(-0.5f);
+       const __m128 two        = _mm_set1_ps(2.0f);
+       const __m128 six        = _mm_set1_ps(6.0f);
+       const __m128 twelve     = _mm_set1_ps(12.0f);
 
        gbdata          = (gmx_gbdata_t *)work;
        gpol            = gbdata->gpol;
@@ -620,7 +617,7 @@ void nb_kernel410_sse2_single(int *           p_nri,
  * water optimization:      No
  * Calculate forces:        no
  */
-void nb_kernel410nf_sse2_single(
+void nb_kernel410nf_x86_64_sse(
                     int *           p_nri,
                     int *           iinr,
                     int *           jindex,
index 307f9726a2c91a24db18e85396a0b5c66f9dcdff..10646126956901876c4cf96fadde6720798272ee 100644 (file)
 
 #include <xmmintrin.h>
 #include <emmintrin.h>
+
 #include <gmx_sse2_single.h>
 
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
 
-#include "nb_kernel430_sse2_single.h"
-
-/* to extract single integers from a __m128i datatype */
-#define _mm_extract_epi32(x, imm) \
-_mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
 
 void nb_kernel430_sse2_single(int *           p_nri,
                                                   int *           iinr,
@@ -91,15 +87,15 @@ void nb_kernel430_sse2_single(int *           p_nri,
        __m128   fac_sse,tabscale_sse,gbtabscale_sse;
        
        __m128i  n0, nnn;
-       const __m128 neg    = {-1.0f,-1.0f,-1.0f,-1.0f};
-       const __m128 zero   = {0.0f,0.0f,0.0f,0.0f};
-       const __m128 half   = {0.5f,0.5f,0.5f,0.5f};
-       const __m128 two    = {2.0f,2.0f,2.0f,2.0f};
-       const __m128 three  = {3.0f,3.0f,3.0f,3.0f};
-       const __m128 six    = {6.0f,6.0f,6.0f,6.0f};
-       const __m128 twelwe = {12.0f,12.0f,12.0f,12.0f};
+       const __m128 neg    = _mm_set1_ps(-1.0f);
+       const __m128 zero   = _mm_set1_ps(0.0f);
+    const __m128 half   = _mm_set1_ps(0.5f);
+       const __m128 two    = _mm_set1_ps(2.0f);
+       const __m128 three  = _mm_set1_ps(3.0f);
+       const __m128 six    = _mm_set1_ps(6.0f);
+    const __m128 twelwe = _mm_set1_ps(12.0f);
        
-       __m128i four        = _mm_set_epi32(4,4,4,4); 
+       __m128i four        = _mm_set1_epi32(4);
        __m128i maski       = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);     
        __m128i mask        = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);   
        
@@ -109,14 +105,14 @@ void nb_kernel430_sse2_single(int *           p_nri,
        gpol            = gbdata->gpol;
                
        nri              = *p_nri;         
-  ntype            = *p_ntype;       
-  nthreads         = *p_nthreads;    
-  facel            = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));       
-  krf              = *p_krf;         
-  crf              = *p_crf;         
-  tabscale         = *p_tabscale;    
-  gbtabscale       = *p_gbtabscale;  
-  nj1              = 0;
+    ntype            = *p_ntype;       
+    nthreads         = *p_nthreads;    
+    facel            = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));       
+    krf              = *p_krf;         
+    crf              = *p_crf;         
+    tabscale         = *p_tabscale;    
+    gbtabscale       = *p_gbtabscale;  
+    nj1              = 0;
 
        /* Splat variables */
        fac_sse        = _mm_load1_ps(&facel);
@@ -282,10 +278,10 @@ void nb_kernel430_sse2_single(int *           p_nri,
                        nnn     = _mm_slli_epi32(n0,2);
                
                        /* the tables are 16-byte aligned, so we can use _mm_load_ps */                 
-                       xmm1    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-                       xmm2    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-                       xmm3    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-                       xmm4    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+                       xmm1    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
                        
                        /* transpose 4*4 */
                        xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -345,10 +341,10 @@ void nb_kernel430_sse2_single(int *           p_nri,
                        nnn     = _mm_slli_epi32(n0,3);
 
                        /* Tabulated VdW interaction - disperion */                     
-                       xmm1    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-                       xmm2    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-                       xmm3    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-                       xmm4    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+                       xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
                        
                        /* transpose 4*4 */
                        xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -377,10 +373,10 @@ void nb_kernel430_sse2_single(int *           p_nri,
                        /* Tabulated VdW interaction - repulsion */
                        nnn     = _mm_add_epi32(nnn,four);
                        
-                       xmm1    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-                       xmm2    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-                       xmm3    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-                       xmm4    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+                       xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
                        
                        /* transpose 4*4 */
                        xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -654,10 +650,10 @@ void nb_kernel430_sse2_single(int *           p_nri,
                        nnn     = _mm_slli_epi32(n0,2);
                        
                        /* the tables are 16-byte aligned, so we can use _mm_load_ps */                 
-                       xmm1    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-                       xmm2    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-                       xmm3    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-                       xmm4    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+                       xmm1    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
                        
                        /* transpose 4*4 */
                        xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -710,10 +706,10 @@ void nb_kernel430_sse2_single(int *           p_nri,
                        nnn     = _mm_slli_epi32(n0,3);
                        
                        /* Tabulated VdW interaction - disperion */     
-                       xmm1    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-                       xmm2    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-                       xmm3    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-                       xmm4    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+                       xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
                
                        /* transpose 4*4 */
                        xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -742,10 +738,10 @@ void nb_kernel430_sse2_single(int *           p_nri,
                        /* Tabulated VdW interaction - repulsion */
                        nnn     = _mm_add_epi32(nnn,four);
                                        
-                       xmm1    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-                       xmm2    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-                       xmm3    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-                       xmm4    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+                       xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
                        
                        /* transpose 4*4 */
                        xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -985,7 +981,7 @@ void nb_kernel430_sse2_single(int *           p_nri,
  * water optimization:      No
  * Calculate forces:        no
  */
-void nb_kernel430nf_sse2_single(
+void nb_kernel430nf_x86_64_sse(
                     int *           p_nri,
                     int *           iinr,
                     int *           jindex,
@@ -1077,7 +1073,7 @@ void nb_kernel430nf_sse2_single(
             dy11             = iy1 - jy1;      
             dz11             = iz1 - jz1;      
             rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
-            rinv11           = gmx_mm_invsqrt(rsq11);
+            rinv11           = gmx_invsqrt(rsq11);
             isaj             = invsqrta[jnr];  
             isaprod          = isai*isaj;      
             qq               = iq*charge[jnr]; 
index badc0c9b26b456d2625a496e9aff9b01aa34d957..a420fffe4c27cc657a56da942ca49e476dae7e68 100644 (file)
@@ -61,11 +61,11 @@ void nb_kernel400_x86_64_sse2(int *           p_nri,
                               int *           inneriter,
                               double *         work)
 {
-    int           nri,nthreads;
-    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
-    double        shX,shY,shZ;
-    int           jnrA,jnrB;
-    int           j3A,j3B;
+  int           nri,nthreads;
+  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+  double        shX,shY,shZ;
+  int           jnrA,jnrB;
+  int           j3A,j3B;
        gmx_gbdata_t *gbdata;
        double *      gpol;
     
@@ -93,35 +93,35 @@ void nb_kernel400_x86_64_sse2(int *           p_nri,
     
        nri        = *p_nri;
     
-    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
-    gbtabscale = _mm_load1_pd(p_gbtabscale);  
-    facel      = _mm_load1_pd(p_facel);
-    
-    nj1         = 0;
-    jnrA = jnrB = 0;
-    j3A = j3B   = 0;
-    jx          = _mm_setzero_pd();
-    jy          = _mm_setzero_pd();
-    jz          = _mm_setzero_pd();
+  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+  gbtabscale = _mm_load1_pd(p_gbtabscale);  
+  facel      = _mm_load1_pd(p_facel);
+  
+  nj1         = 0;
+  jnrA = jnrB = 0;
+  j3A = j3B   = 0;
+  jx          = _mm_setzero_pd();
+  jy          = _mm_setzero_pd();
+  jz          = _mm_setzero_pd();
        
        for(n=0;n<nri;n++)
        {
-        is3              = 3*shift[n];     
-        shX              = shiftvec[is3];  
-        shY              = shiftvec[is3+1];
-        shZ              = shiftvec[is3+2];
-        nj0              = jindex[n];      
-        nj1              = jindex[n+1];    
-        ii               = iinr[n];        
-        ii3              = 3*ii;           
+    is3              = 3*shift[n];     
+    shX              = shiftvec[is3];  
+    shY              = shiftvec[is3+1];
+    shZ              = shiftvec[is3+2];
+    nj0              = jindex[n];      
+    nj1              = jindex[n+1];    
+    ii               = iinr[n];        
+    ii3              = 3*ii;           
                
                ix               = _mm_set1_pd(shX+pos[ii3+0]);
                iy               = _mm_set1_pd(shY+pos[ii3+1]);
                iz               = _mm_set1_pd(shZ+pos[ii3+2]);
-        
+    
                iq               = _mm_load1_pd(charge+ii);
                iq               = _mm_mul_pd(iq,facel);
-        
+    
                isai             = _mm_load1_pd(invsqrta+ii);
                        
                vctot            = _mm_setzero_pd();
@@ -138,18 +138,18 @@ void nb_kernel400_x86_64_sse2(int *           p_nri,
                        
                        j3A     = jnrA * 3;
                        j3B     = jnrB * 3;
-            
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+      
+      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
             
                        dx           = _mm_sub_pd(ix,jx);
                        dy           = _mm_sub_pd(iy,jy);
                        dz           = _mm_sub_pd(iz,jz);
             
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_pd(rinv,rinv);
-            
+      
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
@@ -160,7 +160,7 @@ void nb_kernel400_x86_64_sse2(int *           p_nri,
                        qq           = _mm_mul_pd(iq,jq);            
                        vcoul        = _mm_mul_pd(qq,rinv);
                        fscal        = _mm_mul_pd(vcoul,rinv);                                 
-            vctot        = _mm_add_pd(vctot,vcoul);
+      vctot        = _mm_add_pd(vctot,vcoul);
             
             /* Polarization interaction */
                        qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
@@ -174,48 +174,48 @@ void nb_kernel400_x86_64_sse2(int *           p_nri,
                        eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
-            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
-            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
-            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
-            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
-            vgb     = _mm_mul_pd(Y, qq);           
-            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-
-            vgbtot  = _mm_add_pd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
-                                   
-            fscal        = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
-            
-            /***********************************/
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
+      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+      vgb     = _mm_mul_pd(Y, qq);           
+      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb);
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+      
+      fscal        = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
+      
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_pd(fscal,dx);
-            ty           = _mm_mul_pd(fscal,dy);
-            tz           = _mm_mul_pd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_pd(fix,tx);
-            fiy          = _mm_add_pd(fiy,ty);
-            fiz          = _mm_add_pd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_pd(fscal,dx);
+      ty           = _mm_mul_pd(fscal,dy);
+      tz           = _mm_mul_pd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_pd(fix,tx);
+      fiy          = _mm_add_pd(fiy,ty);
+      fiz          = _mm_add_pd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
                
@@ -224,96 +224,109 @@ void nb_kernel400_x86_64_sse2(int *           p_nri,
                {
                        jnrA    = jjnr[k];
                        j3A     = jnrA * 3;
-            
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-            
+      
+      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+      
                        dx           = _mm_sub_sd(ix,jx);
                        dy           = _mm_sub_sd(iy,jy);
                        dz           = _mm_sub_sd(iz,jz);
-            
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_sd(rinv,rinv);
-            
+      
+      /* The reason for zeroing these variables here is to fix bug 585.
+       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+       * and r1=0, but it should be r1=a[1]. 
+       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+       * To work around it, we zero these variables and use _mm_add_pd (**) instead
+       * Note that the only variables that get affected are the energies since
+       * the total sum needs to be correct 
+       */
+      vcoul        = _mm_setzero_pd();
+      dvdatmp      = _mm_setzero_pd();
+      vgb          = _mm_setzero_pd();
+      
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
                        GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
                        GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-                               
+      
                        isaprod      = _mm_mul_sd(isai,isaj);
-                       qq           = _mm_mul_sd(iq,jq);            
-                       vcoul        = _mm_mul_sd(qq,rinv);
-                       fscal        = _mm_mul_sd(vcoul,rinv);                                 
-            vctot        = _mm_add_sd(vctot,vcoul);
-            
-            /* Polarization interaction */
+      /* Since we need _mm_add_pd below, the order of jq,iq here becomes important */
+                       qq           = _mm_mul_sd(jq,iq);  
+      vcoul        = _mm_mul_sd(qq,rinv);
+      fscal        = _mm_mul_sd(vcoul,rinv);                                 
+      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
+      
+      /* Polarization interaction */
                        qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
                        gbscale      = _mm_mul_sd(isaprod,gbtabscale);
-            
+      
                        /* Calculate GB table index */
                        r            = _mm_mul_sd(rsq,rinv);
                        rtab         = _mm_mul_sd(r,gbscale);
-
+      
                        n0                   = _mm_cvttpd_epi32(rtab);
                        eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
-            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
-            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
-            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
-            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
-            vgb     = _mm_mul_sd(Y, qq);           
-            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
 
-            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_sd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+      G       = _mm_mul_sd(G,eps);
+      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+      vgb     = _mm_mul_sd(Y, qq);           
+      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
                        
-            fscal        = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
-            
-            /***********************************/
+      fscal        = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
+      
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_sd(fscal,dx);
-            ty           = _mm_mul_sd(fscal,dy);
-            tz           = _mm_mul_sd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_sd(fix,tx);
-            fiy          = _mm_add_sd(fiy,ty);
-            fiz          = _mm_add_sd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_sd(fscal,dx);
+      ty           = _mm_mul_sd(fscal,dy);
+      tz           = _mm_mul_sd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_sd(fix,tx);
+      fiy          = _mm_add_sd(fiy,ty);
+      fiz          = _mm_add_sd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
-        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-        
-        ggid     = gid[n];         
-        
-        gmx_mm_update_1pot_pd(vctot,vc+ggid);
-        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
-       }
+    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
     
+    ggid     = gid[n];         
+   
+    gmx_mm_update_1pot_pd(vctot,vc+ggid);
+    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+  }
+  
        *outeriter   = nri;            
-    *inneriter   = nj1;        
+  *inneriter   = nj1;  
 }
index c1d89282396b600839704cc511fbfb711b283dbb..60706bcea5c7f5e8da85af66f0673f996c5a7677 100644 (file)
@@ -62,16 +62,16 @@ void nb_kernel410_x86_64_sse2(int *           p_nri,
                                                        int *           inneriter,
                                                        double *         work)
 {
-    int           nri,ntype,nthreads;
-    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
-    double        shX,shY,shZ;
+  int           nri,ntype,nthreads;
+  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+  double        shX,shY,shZ;
        int                       offset,nti;
-    int           jnrA,jnrB;
-    int           j3A,j3B;
+  int           jnrA,jnrB;
+  int           j3A,j3B;
        int           tjA,tjB;
        gmx_gbdata_t *gbdata;
        double *      gpol;
-    
        __m128d  iq,qq,jq,isai;
        __m128d  ix,iy,iz;
        __m128d  jx,jy,jz;
@@ -100,37 +100,37 @@ void nb_kernel410_x86_64_sse2(int *           p_nri,
        nri        = *p_nri;
        ntype      = *p_ntype;
     
-    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
-    gbtabscale = _mm_load1_pd(p_gbtabscale);  
-    facel      = _mm_load1_pd(p_facel);
-
-    nj1         = 0;
-    jnrA = jnrB = 0;
-    j3A = j3B   = 0;
-    jx          = _mm_setzero_pd();
-    jy          = _mm_setzero_pd();
-    jz          = _mm_setzero_pd();
-    c6          = _mm_setzero_pd();
-    c12         = _mm_setzero_pd();
+  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+  gbtabscale = _mm_load1_pd(p_gbtabscale);  
+  facel      = _mm_load1_pd(p_facel);
+  
+  nj1         = 0;
+  jnrA = jnrB = 0;
+  j3A = j3B   = 0;
+  jx          = _mm_setzero_pd();
+  jy          = _mm_setzero_pd();
+  jz          = _mm_setzero_pd();
+  c6          = _mm_setzero_pd();
+  c12         = _mm_setzero_pd();
        
        for(n=0;n<nri;n++)
        {
-        is3              = 3*shift[n];     
-        shX              = shiftvec[is3];  
-        shY              = shiftvec[is3+1];
-        shZ              = shiftvec[is3+2];
-        nj0              = jindex[n];      
-        nj1              = jindex[n+1];    
-        ii               = iinr[n];        
-        ii3              = 3*ii;           
+    is3              = 3*shift[n];     
+    shX              = shiftvec[is3];  
+    shY              = shiftvec[is3+1];
+    shZ              = shiftvec[is3+2];
+    nj0              = jindex[n];      
+    nj1              = jindex[n+1];    
+    ii               = iinr[n];        
+    ii3              = 3*ii;           
                
                ix               = _mm_set1_pd(shX+pos[ii3+0]);
                iy               = _mm_set1_pd(shY+pos[ii3+1]);
                iz               = _mm_set1_pd(shZ+pos[ii3+2]);
-        
+    
                iq               = _mm_load1_pd(charge+ii);
                iq               = _mm_mul_pd(iq,facel);
-        
+    
                isai             = _mm_load1_pd(invsqrta+ii);
         
                nti              = 2*ntype*type[ii];
@@ -151,39 +151,39 @@ void nb_kernel410_x86_64_sse2(int *           p_nri,
                        j3A     = jnrA * 3;
                        j3B     = jnrB * 3;
 
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
 
                        dx           = _mm_sub_pd(ix,jx);
                        dy           = _mm_sub_pd(iy,jy);
                        dz           = _mm_sub_pd(iz,jz);
 
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_pd(rinv,rinv);
-            
+      
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
                        GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
                        GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
             
-            /* Lennard-Jones */
-            tjA          = nti+2*type[jnrA];
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
                        tjB          = nti+2*type[jnrB];
-            
-            GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+      
+      GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
                        
                        isaprod      = _mm_mul_pd(isai,isaj);
                        qq           = _mm_mul_pd(iq,jq);            
                        vcoul        = _mm_mul_pd(qq,rinv);
                        fscal        = _mm_mul_pd(vcoul,rinv);                                 
-            vctot        = _mm_add_pd(vctot,vcoul);
+      vctot        = _mm_add_pd(vctot,vcoul);
             
-            /* Polarization interaction */
+      /* Polarization interaction */
                        qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
                        gbscale      = _mm_mul_pd(isaprod,gbtabscale);
-            
+      
                        /* Calculate GB table index */
                        r            = _mm_mul_pd(rsq,rinv);
                        rtab         = _mm_mul_pd(r,gbscale);
@@ -192,30 +192,30 @@ void nb_kernel410_x86_64_sse2(int *           p_nri,
                        eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
-            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
-            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
-            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
-            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
-            vgb     = _mm_mul_pd(Y, qq);           
-            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_pd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
+      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+      vgb     = _mm_mul_pd(Y, qq);           
+      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb);
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
                        
                        rinvsix      = _mm_mul_pd(rinvsq,rinvsq);
                        rinvsix      = _mm_mul_pd(rinvsix,rinvsq);
@@ -224,26 +224,26 @@ void nb_kernel410_x86_64_sse2(int *           p_nri,
                        vvdw12       = _mm_mul_pd(c12, _mm_mul_pd(rinvsix,rinvsix));
                        vvdwtot      = _mm_add_pd(vvdwtot,_mm_sub_pd(vvdw12,vvdw6));
             
-            fscal        = _mm_sub_pd(_mm_mul_pd(rinvsq, 
-                                                 _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
-                                                            _mm_mul_pd(six,vvdw6))),
-                                      _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
-                        
-            /***********************************/
+      fscal        = _mm_sub_pd(_mm_mul_pd(rinvsq, 
+                                           _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
+                                                      _mm_mul_pd(six,vvdw6))),
+                                _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
+      
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_pd(fscal,dx);
-            ty           = _mm_mul_pd(fscal,dy);
-            tz           = _mm_mul_pd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_pd(fix,tx);
-            fiy          = _mm_add_pd(fiy,ty);
-            fiz          = _mm_add_pd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_pd(fscal,dx);
+      ty           = _mm_mul_pd(fscal,dy);
+      tz           = _mm_mul_pd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_pd(fix,tx);
+      fiy          = _mm_add_pd(fiy,ty);
+      fiz          = _mm_add_pd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
                
@@ -253,39 +253,53 @@ void nb_kernel410_x86_64_sse2(int *           p_nri,
                        jnrA    = jjnr[k];
                        
                        j3A     = jnrA * 3;
-            
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+      
+      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
             
                        dx           = _mm_sub_sd(ix,jx);
                        dy           = _mm_sub_sd(iy,jy);
                        dz           = _mm_sub_sd(iz,jz);
             
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_sd(rinv,rinv);
-            
+      
+      /* The reason for zeroing these variables here is to fix bug 585.
+       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+       * and r1=0, but it should be r1=a[1]. 
+       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+       * To work around it, we zero these variables and use _mm_add_pd (**) instead
+       * Note that the only variables that get affected are the energies since
+       * the total sum needs to be correct 
+       */
+      vgb          = _mm_setzero_pd();
+      vcoul        = _mm_setzero_pd();
+      dvdatmp      = _mm_setzero_pd();
+      vvdw6        = _mm_setzero_pd();
+      vvdw12       = _mm_setzero_pd();
+      
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
                        GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
                        GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-            
-            /* Lennard-Jones */
-            tjA          = nti+2*type[jnrA];
-            
-            GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+      
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
+      
+      GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
                        
                        isaprod      = _mm_mul_sd(isai,isaj);
-                       qq           = _mm_mul_sd(iq,jq);            
+                       qq           = _mm_mul_sd(jq,iq);            
                        vcoul        = _mm_mul_sd(qq,rinv);
                        fscal        = _mm_mul_sd(vcoul,rinv);                                 
-            vctot        = _mm_add_sd(vctot,vcoul);
-            
-            /* Polarization interaction */
+      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
+      
+      /* Polarization interaction */
                        qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
                        gbscale      = _mm_mul_sd(isaprod,gbtabscale);
-            
+      
                        /* Calculate GB table index */
                        r            = _mm_mul_sd(rsq,rinv);
                        rtab         = _mm_mul_sd(r,gbscale);
@@ -294,70 +308,73 @@ void nb_kernel410_x86_64_sse2(int *           p_nri,
                        eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
-            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
-            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
-            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
-            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
-            vgb     = _mm_mul_sd(Y, qq);           
-            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_sd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
+      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+      vgb     = _mm_mul_sd(Y, qq);           
+      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
                        
                        rinvsix      = _mm_mul_sd(rinvsq,rinvsq);
                        rinvsix      = _mm_mul_sd(rinvsix,rinvsq);
                        
                        vvdw6        = _mm_mul_sd(c6,rinvsix);
                        vvdw12       = _mm_mul_sd(c12, _mm_mul_sd(rinvsix,rinvsix));
-                       vvdwtot      = _mm_add_sd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6));
-        
-            fscal        = _mm_sub_sd(_mm_mul_sd(rinvsq, 
-                                                 _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
-                                                            _mm_mul_sd(six,vvdw6))),
-                                      _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
-            
-            /***********************************/
+                       vvdwtot      = _mm_add_pd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6)); /* (**) */
+      
+      fscal        = _mm_sub_sd(_mm_mul_sd(rinvsq, 
+                                           _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
+                                                      _mm_mul_sd(six,vvdw6))),
+                                _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
+      
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_sd(fscal,dx);
-            ty           = _mm_mul_sd(fscal,dy);
-            tz           = _mm_mul_sd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_sd(fix,tx);
-            fiy          = _mm_add_sd(fiy,ty);
-            fiz          = _mm_add_sd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_sd(fscal,dx);
+      ty           = _mm_mul_sd(fscal,dy);
+      tz           = _mm_mul_sd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_sd(fix,tx);
+      fiy          = _mm_add_sd(fiy,ty);
+      fiz          = _mm_add_sd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
-        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-
-        ggid     = gid[n];         
-        
-        gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
-        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
+    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+    
+    ggid     = gid[n];         
+    
+    gmx_mm_update_1pot_pd(vctot,vc+ggid);
+    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+    gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
+    
        }
-
+  
        *outeriter   = nri;            
-    *inneriter   = nj1;        
+  *inneriter   = nj1;  
 }
index f2335aa1a4308e511ffccbfc296118c836efbe9c..895c383b90bdf106c00f5af0cf15685f8680f094 100644 (file)
@@ -60,12 +60,12 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
                               int *           inneriter,
                               double *         work)
 {
-    int           nri,ntype,nthreads;
-    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
-    double        shX,shY,shZ;
+  int           nri,ntype,nthreads;
+  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+  double        shX,shY,shZ;
        int                       offset,nti;
-    int           jnrA,jnrB;
-    int           j3A,j3B;
+  int           jnrA,jnrB;
+  int           j3A,j3B;
        int           tjA,tjB;
        gmx_gbdata_t *gbdata;
        double *      gpol;
@@ -80,12 +80,12 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
        __m128d  vcoul,fscal,gbscale,c6,c12;
        __m128d  rinvsq,r,rtab;
        __m128d  eps,Y,F,G,H;
-    __m128d  VV,FF,Fp;
+  __m128d  VV,FF,Fp;
        __m128d  vgb,fijGB,dvdatmp;
        __m128d  rinvsix,vvdw6,vvdw12,vvdwtmp;
        __m128d  facel,gbtabscale,dvdaj;
-    __m128d  fijD,fijR;
-    __m128d  xmm1,tabscale,eps2;
+  __m128d  fijD,fijR;
+  __m128d  xmm1,tabscale,eps2;
        __m128i  n0, nnn;
     
        
@@ -100,40 +100,40 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
        nri        = *p_nri;
        ntype      = *p_ntype;
     
-    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
-    gbtabscale = _mm_load1_pd(p_gbtabscale);  
-    facel      = _mm_load1_pd(p_facel);
-    tabscale   = _mm_load1_pd(p_tabscale);
-    
-    nj1         = 0;
-    jnrA = jnrB = 0;
-    j3A = j3B   = 0;
-    jx          = _mm_setzero_pd();
-    jy          = _mm_setzero_pd();
-    jz          = _mm_setzero_pd();
-    c6          = _mm_setzero_pd();
-    c12         = _mm_setzero_pd();
+  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+  gbtabscale = _mm_load1_pd(p_gbtabscale);  
+  facel      = _mm_load1_pd(p_facel);
+  tabscale   = _mm_load1_pd(p_tabscale);
+  
+  nj1         = 0;
+  jnrA = jnrB = 0;
+  j3A = j3B   = 0;
+  jx          = _mm_setzero_pd();
+  jy          = _mm_setzero_pd();
+  jz          = _mm_setzero_pd();
+  c6          = _mm_setzero_pd();
+  c12         = _mm_setzero_pd();
        
        for(n=0;n<nri;n++)
        {
-        is3              = 3*shift[n];     
-        shX              = shiftvec[is3];  
-        shY              = shiftvec[is3+1];
-        shZ              = shiftvec[is3+2];
-        nj0              = jindex[n];      
-        nj1              = jindex[n+1];    
-        ii               = iinr[n];        
-        ii3              = 3*ii;           
+    is3              = 3*shift[n];     
+    shX              = shiftvec[is3];  
+    shY              = shiftvec[is3+1];
+    shZ              = shiftvec[is3+2];
+    nj0              = jindex[n];      
+    nj1              = jindex[n+1];    
+    ii               = iinr[n];        
+    ii3              = 3*ii;           
                
                ix               = _mm_set1_pd(shX+pos[ii3+0]);
                iy               = _mm_set1_pd(shY+pos[ii3+1]);
                iz               = _mm_set1_pd(shZ+pos[ii3+2]);
-        
+    
                iq               = _mm_load1_pd(charge+ii);
                iq               = _mm_mul_pd(iq,facel);
-        
+    
                isai             = _mm_load1_pd(invsqrta+ii);
-        
+    
                nti              = 2*ntype*type[ii];
                
                vctot            = _mm_setzero_pd();
@@ -152,39 +152,39 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
                        j3A     = jnrA * 3;
                        j3B     = jnrB * 3;
             
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
             
                        dx           = _mm_sub_pd(ix,jx);
                        dy           = _mm_sub_pd(iy,jy);
                        dz           = _mm_sub_pd(iz,jz);
             
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_pd(rinv,rinv);
-            
+      
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
                        GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
                        GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
             
-            /* Lennard-Jones */
-            tjA          = nti+2*type[jnrA];
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
                        tjB          = nti+2*type[jnrB];
-            
-            GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+      
+      GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
                        
                        isaprod      = _mm_mul_pd(isai,isaj);
                        qq           = _mm_mul_pd(iq,jq);            
                        vcoul        = _mm_mul_pd(qq,rinv);
                        fscal        = _mm_mul_pd(vcoul,rinv);                                 
-            vctot        = _mm_add_pd(vctot,vcoul);
-            
-            /* Polarization interaction */
+      vctot        = _mm_add_pd(vctot,vcoul);
+      
+      /* Polarization interaction */
                        qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
                        gbscale      = _mm_mul_pd(isaprod,gbtabscale);
-            
+      
                        /* Calculate GB table index */
                        r            = _mm_mul_pd(rsq,rinv);
                        rtab         = _mm_mul_pd(r,gbscale);
@@ -193,47 +193,47 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
                        eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
-            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
-            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
-            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
-            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
-            vgb     = _mm_mul_pd(Y, qq);           
-            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_pd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
+      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+      vgb     = _mm_mul_pd(Y, qq);           
+      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb);
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
                        
-            /* Calculate VDW table index */
+      /* Calculate VDW table index */
                        rtab    = _mm_mul_pd(r,tabscale);
                        n0      = _mm_cvttpd_epi32(rtab);
                        eps     = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
                        eps2    = _mm_mul_pd(eps,eps);
                        nnn     = _mm_slli_epi32(n0,3);
                        
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
+      /* Dispersion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
                        H       = _mm_mul_pd(H,eps2);
                        Fp      = _mm_add_pd(F,G);
                        Fp      = _mm_add_pd(Fp,H);
@@ -245,16 +245,16 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
                        
                        vvdw6   = _mm_mul_pd(c6,VV);
                        fijD    = _mm_mul_pd(c6,FF);
-            
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
-            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
-            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
+      
+      /* Repulsion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
+      F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
+      H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_pd(G,eps);
                        H       = _mm_mul_pd(H,eps2);
                        Fp      = _mm_add_pd(F,G);
                        Fp      = _mm_add_pd(Fp,H);
@@ -269,29 +269,29 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
                        
                        vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
                        vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
-            
+      
                        xmm1    = _mm_add_pd(fijD,fijR);
                        xmm1    = _mm_mul_pd(xmm1,tabscale);
                        xmm1    = _mm_add_pd(xmm1,fijGB);
                        xmm1    = _mm_sub_pd(xmm1,fscal);
                        fscal   = _mm_mul_pd(xmm1,neg);
                        fscal   = _mm_mul_pd(fscal,rinv);
-            
-            /***********************************/
+      
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_pd(fscal,dx);
-            ty           = _mm_mul_pd(fscal,dy);
-            tz           = _mm_mul_pd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_pd(fix,tx);
-            fiy          = _mm_add_pd(fiy,ty);
-            fiz          = _mm_add_pd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_pd(fscal,dx);
+      ty           = _mm_mul_pd(fscal,dy);
+      tz           = _mm_mul_pd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_pd(fix,tx);
+      fiy          = _mm_add_pd(fiy,ty);
+      fiz          = _mm_add_pd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
                
@@ -300,39 +300,53 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
                {
                        jnrA    = jjnr[k];
                        j3A     = jnrA * 3;
-            
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-            
+      
+      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+      
                        dx           = _mm_sub_sd(ix,jx);
                        dy           = _mm_sub_sd(iy,jy);
                        dz           = _mm_sub_sd(iz,jz);
             
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
+      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+      
+      rinv         = gmx_mm_invsqrt_pd(rsq);
                        rinvsq       = _mm_mul_sd(rinv,rinv);
-            
-                       /***********************************/
+      
+      /* The reason for zeroing these variables here is to fix bug 585.
+       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
+       * and r1=0, but it should be r1=a[1]. 
+       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
+       * To work around it, we zero these variables and use _mm_add_pd (**) instead
+       * Note that the only variables that get affected are the energies since
+       * the total sum needs to be correct 
+       */
+      vgb          = _mm_setzero_pd();
+      vcoul        = _mm_setzero_pd();
+      dvdatmp      = _mm_setzero_pd();
+      vvdw6        = _mm_setzero_pd();
+      vvdw12       = _mm_setzero_pd();
+
+      /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
                        GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
                        GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
             
-            /* Lennard-Jones */
-            tjA          = nti+2*type[jnrA];
-            
-            GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+      /* Lennard-Jones */
+      tjA          = nti+2*type[jnrA];
+      
+      GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
                        
                        isaprod      = _mm_mul_sd(isai,isaj);
-                       qq           = _mm_mul_sd(iq,jq);            
+                       qq           = _mm_mul_sd(jq,iq);            
                        vcoul        = _mm_mul_sd(qq,rinv);
                        fscal        = _mm_mul_sd(vcoul,rinv);                                 
-            vctot        = _mm_add_sd(vctot,vcoul);
-            
-            /* Polarization interaction */
+      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
+      
+      /* Polarization interaction */
                        qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
                        gbscale      = _mm_mul_sd(isaprod,gbtabscale);
-            
+      
                        /* Calculate GB table index */
                        r            = _mm_mul_sd(rsq,rinv);
                        rtab         = _mm_mul_sd(r,gbscale);
@@ -341,47 +355,47 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
                        eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
                        nnn                  = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
-            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
-            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
-            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
-            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
-            vgb     = _mm_mul_sd(Y, qq);           
-            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_sd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+      /* the tables are 16-byte aligned, so we can use _mm_load_pd */                  
+      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
+      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+      vgb     = _mm_mul_sd(Y, qq);           
+      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+      
+      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+      
+      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
+      
+      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
+      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+      
+      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
                        
-            /* Calculate VDW table index */
+      /* Calculate VDW table index */
                        rtab    = _mm_mul_sd(r,tabscale);
                        n0      = _mm_cvttpd_epi32(rtab);
                        eps     = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
                        eps2    = _mm_mul_sd(eps,eps);
                        nnn     = _mm_slli_epi32(n0,3);
                        
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
+      /* Dispersion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
                        H       = _mm_mul_sd(H,eps2);
                        Fp      = _mm_add_sd(F,G);
                        Fp      = _mm_add_sd(Fp,H);
@@ -393,16 +407,16 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
                        
                        vvdw6   = _mm_mul_sd(c6,VV);
                        fijD    = _mm_mul_sd(c6,FF);
-            
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
+      
+      /* Repulsion */
+      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
+      F            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(Y,F);
+      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
+      H            = _mm_setzero_pd();
+      GMX_MM_TRANSPOSE2_PD(G,H);
+      
+      G       = _mm_mul_sd(G,eps);
                        H       = _mm_mul_sd(H,eps2);
                        Fp      = _mm_add_sd(F,G);
                        Fp      = _mm_add_sd(Fp,H);
@@ -416,7 +430,7 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
                        fijR    = _mm_mul_sd(c12,FF);
                        
                        vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
-                       vvdwtot = _mm_add_sd(vvdwtot,vvdwtmp);
+                       vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); /* (**) */
             
                        xmm1    = _mm_add_sd(fijD,fijR);
                        xmm1    = _mm_mul_sd(xmm1,tabscale);
@@ -425,34 +439,37 @@ void nb_kernel430_x86_64_sse2(int *           p_nri,
                        fscal   = _mm_mul_sd(xmm1,neg);
                        fscal   = _mm_mul_sd(fscal,rinv);
 
-            /***********************************/
+      /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_sd(fscal,dx);
-            ty           = _mm_mul_sd(fscal,dy);
-            tz           = _mm_mul_sd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_sd(fix,tx);
-            fiy          = _mm_add_sd(fiy,ty);
-            fiz          = _mm_add_sd(fiz,tz);
-            
-            /* Store j forces back */
+      
+      /* Calculate temporary vectorial force */
+      tx           = _mm_mul_sd(fscal,dx);
+      ty           = _mm_mul_sd(fscal,dy);
+      tz           = _mm_mul_sd(fscal,dz);
+      
+      /* Increment i atom force */
+      fix          = _mm_add_sd(fix,tx);
+      fiy          = _mm_add_sd(fiy,ty);
+      fiz          = _mm_add_sd(fiz,tz);
+      
+      /* Store j forces back */
                        GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
-        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
-        
-        ggid     = gid[n];         
-        
-        gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
-        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
-       }
+    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
     
+    ggid     = gid[n];         
+    
+    gmx_mm_update_1pot_pd(vctot,vc+ggid);
+    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
+    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
+    gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
+    
+       }
+  
        *outeriter   = nri;            
-    *inneriter   = nj1;        
+  *inneriter   = nj1;  
 }
 
index 2c3d8d43687cd340b77262dbc05d136f170baf0f..e43d28db6731968d656802416ff590fd995a2098 100644 (file)
@@ -715,7 +715,7 @@ nb_kernel_allvsallgb_sse2_double(t_forcerec *           fr,
         pmask1           = prologue_mask[i+1];
         emask0           = epilogue_mask[i];
         emask1           = epilogue_mask[i+1];
-        imask_SSE0       = _mm_load1_pd((double *)(imask+i));
+        imask_SSE0       = _mm_load1_pd((double *)(imask+2*i));
         imask_SSE1       = _mm_load1_pd((double *)(imask+2*i+2));
         
         for(j=nj0; j<nj1; j+=UNROLLJ)
index c26c887ee57fbfb4416f117ec6cc35f0d0ab883b..3236d83b66f5e80817946ddbbb78d2b4263a2234 100644 (file)
@@ -42,21 +42,16 @@ set(MDRUN_SOURCES
 add_library(gmxpreprocess ${GMXPREPROCESS_SOURCES})
 target_link_libraries(gmxpreprocess md)
 set_target_properties(gmxpreprocess PROPERTIES OUTPUT_NAME "gmxpreprocess${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}")
-set_target_properties(md PROPERTIES OUTPUT_NAME "md${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}")
 
 
 if(GMX_OPENMM) 
     add_subdirectory(gmx_gpu_utils)
     include_directories(./gmx_gpu_utils ${OpenMM_INCLUDE_DIR})
     link_directories(${OpenMM_LIBRARY_DIR}) 
-    # only define if this is a local build not a release 
-    # we assume that the auto-generated version is not used && 
-    # version string does not contain "-dev" => it's a release build
-    if(NOT USE_VERSION_H AND NOT PROJECT_VERSION MATCHES ".*-dev.*")  
-        add_definitions( -DOPENMM_PLUGIN_DIR="${OpenMM_PLUGIN_DIR}" ) 
-    else()
-        add_definitions( -DOPENMM_PLUGIN_DIR="" )
-    endif()
+    # with this define no env. var. is needed with OPENMM_PLUGIN_DIR
+    # if the same OpenMM installation is used for running and building 
+    add_definitions( -DOPENMM_PLUGIN_DIR="${OpenMM_PLUGIN_DIR}" ) 
+    file(TO_CMAKE_PATH ${OpenMM_PLUGIN_DIR} _path)
     add_library(openmm_api_wrapper STATIC openmm_wrapper.cpp)
     target_link_libraries(openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES})
     set(GMX_OPENMM_LIBRARIES openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES})   
@@ -112,6 +107,8 @@ if(GMX_OPENMM AND MSVC)
 endif()
 
 
+install(TARGETS gmxpreprocess DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
+install(TARGETS mdrun DESTINATION ${BIN_INSTALL_DIR} COMPONENT mdrun)
 install(TARGETS 
         grompp
         tpbconv
@@ -121,57 +118,31 @@ install(TARGETS
         gmxdump
         g_x2top
         gmxcheck
-        mdrun
-       gmxpreprocess DESTINATION ${LIB_INSTALL_DIR}
+        COMPONENT runtime
         RUNTIME DESTINATION ${BIN_INSTALL_DIR})
 
-
-# if we build shared gromacs libs, when installing throught the install-mdrun target 
-# these libs need to be installed as well
-if(BUILD_SHARED_LIBS)
-    # in MDRUN_LIBS we store the libraries MDRUN links against (NOTE: hardcoded!!!)
-    set(MDRUN_LIBS gmxpreprocess md gmx)
-    
-    # generate install-libXXX custom target for each shared lib that mdrun links against
-    foreach(_lib ${MDRUN_LIBS})
-        # double-check that the type is SHARED
-        get_target_property(_type ${_lib} TYPE)
-        if(NOT ${_type} STREQUAL "SHARED_LIBRARY")
-            message(FATAL_ERROR " Internal error: library ${_lib} is not shared so it's not supposed to be processed for installing")
-        endif()
-
-        # figure out the path and filename under which the lib will be installed
-        # (libname with pre- and suffix)
-        get_target_property(_lib_path ${_lib} LOCATION)
-        string(REGEX REPLACE "/" ";" _lib_fname ${_lib_path})
-        list(REVERSE _lib_fname)
-        list(GET _lib_fname 0 _lib_fname)
-
-        # create custom target for copying each library to the install location 
-        # TODO: need to fix this to have the .so.6 form
-        add_custom_target(install-${_lib}
-            COMMAND ${CMAKE_COMMAND} -E copy 
-                "${_lib_path}" "${LIB_INSTALL_DIR}/${_lib_fname}.${SOVERSION}"
-            COMMAND ${CMAKE_COMMAND} -E create_symlink 
-                "${_lib_fname}.${SOVERSION}" "${LIB_INSTALL_DIR}/${_lib_fname}"
-            COMMENT "Installing library ${_lib}")
-        add_dependencies(install-${_lib} ${_lib})
-
-        # gather the custom target names in a string
-        # set(_lib_install_targets "${_lib_install_targets} install-lib${_lib}")
-        list(APPEND _lib_install_targets "install-${_lib}")
-    endforeach(_lib)
-endif(BUILD_SHARED_LIBS)
-
-get_target_property(_mdrun_path mdrun LOCATION)
-add_custom_target(install-mdrun
-    COMMAND ${CMAKE_COMMAND} -E copy "${_mdrun_path}" 
-        "${BIN_INSTALL_DIR}/${_mdrun_exec_name}"
-    COMMENT "Installing mdrun")
-add_dependencies(install-mdrun mdrun ${_lib_install_targets})
+# Create the custom install-mdrun target
+if (BUILD_SHARED_LIBS)
+    # If shared libraries are used, we need to install the libraries in
+    # addition to the mdrun binary.
+    add_custom_target(install-mdrun
+        COMMAND ${CMAKE_COMMAND} -DCOMPONENT=libraries
+                -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+        COMMAND ${CMAKE_COMMAND} -DCOMPONENT=mdrun
+                -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+        COMMENT "Installing mdrun")
+else (BUILD_SHARED_LIBS)
+    add_custom_target(install-mdrun
+        COMMAND ${CMAKE_COMMAND} -DCOMPONENT=mdrun
+                -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+        COMMENT "Installing mdrun")
+endif (BUILD_SHARED_LIBS)
+add_dependencies(install-mdrun mdrun)
 
 endif(GMX_FAHCORE)
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgmxpreprocess.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc @ONLY)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig 
-  RENAME "libgmxpreprocess${GMX_LIBS_SUFFIX}.pc")
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc
+        DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+        RENAME "libgmxpreprocess${GMX_LIBS_SUFFIX}.pc"
+        COMPONENT development)
index 72efea555fdc5b0941eddad56fe7f907048b7078..ee14674d5c8dcd66988efadd7bf35c851cd5a1c4 100644 (file)
@@ -169,8 +169,12 @@ static void check_cg_sizes(const char *topfn,t_block *cgs,warninp_t wi)
     {
         maxsize = max(maxsize,cgs->index[cg+1]-cgs->index[cg]);
     }
-    if (maxsize > 10)
+    
+    if (maxsize > MAX_CHARGEGROUP_SIZE)
+    {
+        gmx_fatal(FARGS,"The largest charge group contains %d atoms. The maximum is %d.",maxsize,MAX_CHARGEGROUP_SIZE);
+    }
+    else if (maxsize > 10)
     {
         set_warning_line(wi,topfn,-1);
         sprintf(warn_buf,
@@ -1054,6 +1058,42 @@ static int count_constraints(gmx_mtop_t *mtop,t_molinfo *mi,warninp_t wi)
   return count;
 }
 
+/* Stop with a fatal error when a charged atom has zero-valued GB parameters.
+ *
+ * sys   - topology; every atom of every entry in sys->moltype is inspected
+ * atype - atom type data queried through the get_atomtype_* accessors
+ *
+ * An atom is reported on stderr when its charge is nonzero and at least one
+ * of radius, volume, surface tension, GB radius or S_hct of its atom type
+ * equals zero. If any atom was reported, gmx_fatal is called with the count.
+ */
+static void check_gbsa_params_charged(gmx_mtop_t *sys, gpp_atomtype_t atype)
+{
+    int i,nmiss,natoms,mt;
+    real q;
+    const t_atoms *atoms;
+  
+    nmiss = 0;
+    for(mt=0;mt<sys->nmoltype;mt++)
+    {
+        atoms  = &sys->moltype[mt].atoms;
+        natoms = atoms->nr;
+
+        for(i=0;i<natoms;i++)
+        {
+            q = atoms->atom[i].q;
+            if ((get_atomtype_radius(atoms->atom[i].type,atype)    == 0  ||
+                 get_atomtype_vol(atoms->atom[i].type,atype)       == 0  ||
+                 get_atomtype_surftens(atoms->atom[i].type,atype)  == 0  ||
+                 get_atomtype_gb_radius(atoms->atom[i].type,atype) == 0  ||
+                 get_atomtype_S_hct(atoms->atom[i].type,atype)     == 0) &&
+                q != 0)
+            {
+                fprintf(stderr,"\nGB parameter(s) zero for atom type '%s' while charge is %g\n",
+                        get_atomtype_name(atoms->atom[i].type,atype),q);
+                /* One increment per offending atom, so nmiss is an atom
+                 * count and not a count of distinct atom types.
+                 */
+                nmiss++;
+            }
+        }
+    }
+
+    if (nmiss > 0)
+    {
+        gmx_fatal(FARGS,"Can't do GB electrostatics; the implicit_genborn_params section of the forcefield has parameters with value zero for %d charged atoms.",nmiss);
+    }
+}
+
+
 static void check_gbsa_params(t_inputrec *ir,gpp_atomtype_t atype)
 {
     int  nmiss,i;
@@ -1072,7 +1112,7 @@ static void check_gbsa_params(t_inputrec *ir,gpp_atomtype_t atype)
             get_atomtype_gb_radius(i,atype) < 0 ||
             get_atomtype_S_hct(i,atype)     < 0)
         {
-            fprintf(stderr,"GB parameter(s) missing or negative for atom type '%s'\n",
+            fprintf(stderr,"\nGB parameter(s) missing or negative for atom type '%s'\n",
                     get_atomtype_name(i,atype));
             nmiss++;
         }
@@ -1080,8 +1120,7 @@ static void check_gbsa_params(t_inputrec *ir,gpp_atomtype_t atype)
     
     if (nmiss > 0)
     {
-        gmx_fatal(FARGS,"Can't do GB electrostatics; the forcefield is missing %d values for\n"
-                  "atomtype radii, or they might be negative\n.",nmiss);
+        gmx_fatal(FARGS,"Can't do GB electrostatics; the implicit_genborn_params section of the forcefield is missing parameters for %d atomtypes or they might be negative.",nmiss);
     }
   
 }
@@ -1381,6 +1420,11 @@ int main (int argc, char *argv[])
     {
         /* Now we have renumbered the atom types, we can check the GBSA params */
         check_gbsa_params(ir,atype);
+      
+      /* Check that all atoms that have a nonzero charge also have
+       * nonzero GB-parameters
+       */
+      check_gbsa_params_charged(sys,atype);
     }
 
        /* PELA: Copy the atomtype data to the topology atomtype list */
index acc829cb22936f6489a4b202d76c7187b00f6b5a..239f4c7eb78b03845dbc6b56186e46cec7f5d736 100644 (file)
@@ -796,7 +796,7 @@ void* openmm_init(FILE *fplog, const char *platformOptStr,
             }
 
             /* macro set at build time  */
-#ifdef OpenMM_PLUGIN_DIR
+#ifdef OPENMM_PLUGIN_DIR
             if (!hasLoadedPlugins)
             {
                 loadedPlugins = Platform::loadPluginsFromDirectory(OPENMM_PLUGIN_DIR);
index ccf7b68009a96ce69f3bd74cb5a703f54ea480ca..38229c61569d3463187bb972a92f56a906681d18 100644 (file)
@@ -1106,6 +1106,7 @@ int main(int argc, char *argv[])
   int           nid_used;
   int           this_chainstart;
   int           prev_chainstart;
+  gmx_bool      bMerged;
     
   gmx_atomprop_t aps;
   
@@ -1332,6 +1333,7 @@ int main(int argc, char *argv[])
   this_chainstart     = 0;
     
   pdb_ch=NULL;
+  bMerged = FALSE;
   for (i=0; (i<natom); i++) 
   {
       ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
@@ -1342,7 +1344,10 @@ int main(int argc, char *argv[])
       prev_resnum        = this_resnum;
       prev_chainid       = this_chainid;
       prev_chainnumber   = this_chainnumber;
-      prev_chainstart    = this_chainstart;
+      if (!bMerged)
+      {
+          prev_chainstart    = this_chainstart;
+      }
       
       this_atomname      = *pdba_all.atomname[i];
       this_atomnum       = (pdba_all.pdbinfo != NULL) ? pdba_all.pdbinfo[i].atomnr : i+1;
@@ -1372,10 +1377,11 @@ int main(int argc, char *argv[])
               select[0] = 'n';
           }
           
-          if (select[0] == 'y') 
+          bMerged = (select[0] == 'y');
+          if (bMerged) 
           {
               pdb_ch[nch-1].chainstart[pdb_ch[nch-1].nterpairs] = 
-              pdba_all.atom[i].resind - prev_chainstart;
+                  pdba_all.atom[i].resind - prev_chainstart;
               pdb_ch[nch-1].nterpairs++;
               srenew(pdb_ch[nch-1].chainstart,pdb_ch[nch-1].nterpairs+1);
           }
@@ -1560,23 +1566,25 @@ int main(int argc, char *argv[])
                  bHisMan,bArgMan,bGlnMan,angle,distance,&symtab,
                  nrtprename,rtprename);
       
-    for(i=0; i<cc->nterpairs; i++) {
-        
-      cc->chainstart[cc->nterpairs] = pdba->nres;
-                
-      find_nc_ter(pdba,cc->chainstart[i],cc->chainstart[i+1],
-                 &(cc->r_start[i]),&(cc->r_end[i]),rt);    
+        cc->chainstart[cc->nterpairs] = pdba->nres;
+        j = 0;
+        for(i=0; i<cc->nterpairs; i++)
+        {
+            find_nc_ter(pdba,cc->chainstart[i],cc->chainstart[i+1],
+                        &(cc->r_start[j]),&(cc->r_end[j]),rt);    
       
-        
-      if ( (cc->r_start[i]<0) || (cc->r_end[i]<0) ) {
-       printf("Problem with chain definition, or missing terminal residues.\n"
-              "This chain does not appear to contain a recognized chain molecule.\n"
-           "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
-           
-       cc->nterpairs = i;
-       break;
-      }
-    }
+            if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
+            {
+                j++;
+            }
+        }
+        cc->nterpairs = j;
+        if (cc->nterpairs == 0)
+        {
+            printf("Problem with chain definition, or missing terminal residues.\n"
+                   "This chain does not appear to contain a recognized chain molecule.\n"
+                   "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
+        }
 
     /* Check for disulfides and other special bonds */
     nssbonds = mk_specbonds(pdba,x,bCysMan,&ssbonds,bVerbose);
@@ -1621,14 +1629,20 @@ int main(int argc, char *argv[])
             {
                 if(bTerMan && ntdblist>1)
                 {
-                    cc->ntdb[i] = choose_ter(ntdblist,tdblist,"Select start terminus type");
+                    sprintf(select,"Select start terminus type for %s-%d",
+                            *pdba->resinfo[cc->r_start[i]].name,
+                            pdba->resinfo[cc->r_start[i]].nr);
+                    cc->ntdb[i] = choose_ter(ntdblist,tdblist,select);
                 }
                 else
                 {
                     cc->ntdb[i] = tdblist[0];
                 }
                 
-                printf("Start terminus: %s\n",(cc->ntdb[i])->name);
+                printf("Start terminus %s-%d: %s\n",
+                       *pdba->resinfo[cc->r_start[i]].name,
+                       pdba->resinfo[cc->r_start[i]].nr,
+                       (cc->ntdb[i])->name);
                 sfree(tdblist);
             }
         }
@@ -1654,13 +1668,19 @@ int main(int argc, char *argv[])
             {
                 if(bTerMan && ntdblist>1)
                 {
-                    cc->ctdb[i] = choose_ter(ntdblist,tdblist,"Select end terminus type");
+                    sprintf(select,"Select end terminus type for %s-%d",
+                            *pdba->resinfo[cc->r_end[i]].name,
+                            pdba->resinfo[cc->r_end[i]].nr);
+                    cc->ctdb[i] = choose_ter(ntdblist,tdblist,select);
                 }
                 else
                 {
                     cc->ctdb[i] = tdblist[0];
                 }
-                printf("End terminus: %s\n",(cc->ctdb[i])->name);
+                printf("End terminus %s-%d: %s\n",
+                       *pdba->resinfo[cc->r_end[i]].name,
+                       pdba->resinfo[cc->r_end[i]].nr,
+                       (cc->ctdb[i])->name);
                 sfree(tdblist);
             }
         }
@@ -1819,7 +1839,7 @@ int main(int argc, char *argv[])
     pdb2top(top_file2,posre_fn,molname,pdba,&x,atype,&symtab,
            nrtp,restp,
            restp_chain,hb_chain,
-           cc->nterpairs,cc->ntdb,cc->ctdb,cc->r_start,cc->r_end,bAllowMissing,
+           cc->nterpairs,cc->ntdb,cc->ctdb,bAllowMissing,
            bVsites,bVsiteAromatics,forcefield,ffdir,
            mHmult,nssbonds,ssbonds,
            long_bond_dist,short_bond_dist,bDeuterate,bChargeGroups,bCmap,
index 95e601c9663679f7952e8ad8d65c6bdbfd0ca3fc..bc03decca477ca7a0f35808d68dbb096ce756a08 100644 (file)
@@ -1398,7 +1398,7 @@ void pdb2top(FILE *top_file, char *posre_fn, char *molname,
              int nrtp, t_restp rtp[],
              t_restp *restp, t_hackblock *hb,
              int nterpairs,t_hackblock **ntdb, t_hackblock **ctdb,
-             int *rn, int *rc, gmx_bool bAllowMissing,
+             gmx_bool bAllowMissing,
              gmx_bool bVsites, gmx_bool bVsiteAromatics,
              const char *ff, const char *ffdir,
              real mHmult,
index 967632833e437f0d539cd49ed20f0f85343ebf53..18f3a6e3370eb2f86c5ffd797bd2497990b59765 100644 (file)
@@ -8,9 +8,12 @@ list(REMOVE_ITEM MDLIB_SOURCES ${NOT_MDLIB_SOURCES})
 
 add_library(md ${MDLIB_SOURCES})
 target_link_libraries(md gmx ${GMX_EXTRA_LIBRARIES} ${FFT_LIBRARIES} ${XML_LIBRARIES})
-set_target_properties(md PROPERTIES OUTPUT_NAME "md${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION})
+set_target_properties(md PROPERTIES OUTPUT_NAME "md${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}")
 
-install(TARGETS md DESTINATION ${LIB_INSTALL_DIR})
+install(TARGETS md DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libmd.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libmd.pc @ONLY)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libmd.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig RENAME "libmd${GMX_LIBS_SUFFIX}.pc")
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libmd.pc
+        DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+        RENAME "libmd${GMX_LIBS_SUFFIX}.pc"
+        COMPONENT development)
index 67afee52d556f138f9e1652964ebf730d2a20e1e..2bb95ba20add64878949e01c99605405883e0016 100644 (file)
@@ -1291,7 +1291,6 @@ gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner,
         {
             cconerr(cr->dd,lincsd->nc,lincsd->bla,lincsd->bllen,xprime,pbc_null,
                     &ncons_loc,&p_ssd,&p_max,&p_imax);
-            lincsd->rmsd_data[0] = ncons_loc;
             /* Check if we are doing the second part of SD */
             if (ir->eI == eiSD2 && v == NULL)
             {
@@ -1304,6 +1303,12 @@ gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner,
             lincsd->rmsd_data[0] = ncons_loc;
             lincsd->rmsd_data[i] = p_ssd;
         }
+        else
+        {
+            lincsd->rmsd_data[0] = 0;
+            lincsd->rmsd_data[1] = 0;
+            lincsd->rmsd_data[2] = 0;
+        }
         if (bLog && fplog && lincsd->nc > 0)
         {
             fprintf(fplog,
index 37ca7fc4d05fd5d3151de3e9784df0fe0841978d..fb384823a24dedb5be4aaf0c1aecc364f2adfc2e 100644 (file)
@@ -1110,32 +1110,50 @@ real NPT_energy(t_inputrec *ir, t_state *state, t_extmass *MassQ)
 static real vrescale_gamdev(int ia, gmx_rng_t rng)
 /* Gamma distribution, adapted from numerical recipes */
 {
-  int j;
-  real am,e,s,v1,v2,x,y;
-  
-  if (ia < 6) {
-    x = 1.0;
-    for(j=1; j<=ia; j++) {
-      x *= gmx_rng_uniform_real(rng);
+    int j;
+    real am,e,s,v1,v2,x,y;
+
+    if (ia < 6) /* small ia: x = -log(product of ia uniform random numbers) */
+    {
+        do
+        {
+            x = 1.0;
+            for(j=1; j<=ia; j++)
+            {
+                x *= gmx_rng_uniform_real(rng);
+            }
+        }
+        while (x == 0); /* redraw the whole product: log(0) below is not finite */
+        x = -log(x);
+    }
+    else /* ia >= 6: redraw until a candidate x is accepted */
+    {
+        do
+        {
+            do
+            {
+                do
+                {
+                    v1 = gmx_rng_uniform_real(rng);
+                    v2 = 2.0*gmx_rng_uniform_real(rng)-1.0;
+                }
+                while (v1*v1 + v2*v2 > 1.0 || /* point lies outside the unit circle */
+                       v1*v1*GMX_REAL_MAX < 3.0*ia); /* v1 too close to zero for y = v2/v1 */
+                /* The last check above ensures that both x (3.0 > 2.0 in s)
+                 * and the pre-factor for e do not go out of range.
+                 */
+                y = v2/v1;
+                am = ia - 1;
+                s = sqrt(2.0*am + 1.0);
+                x = s*y + am;
+            }
+            while (x <= 0.0); /* log(x/am) below needs x > 0 */
+            e = (1.0 + y*y)*exp(am*log(x/am) - s*y);
+        }
+        while (gmx_rng_uniform_real(rng) > e); /* keep x only if a fresh uniform number is <= e */
     }
-    x = -log(x);
-  } else {
-    do {
-      do {
-        do {
-          v1 = gmx_rng_uniform_real(rng);
-          v2 = 2.0*gmx_rng_uniform_real(rng)-1.0;
-        } while (v1*v1 + v2*v2 > 1.0);
-        y = v2/v1;
-        am = ia - 1;
-        s = sqrt(2.0*am + 1.0);
-        x = s*y + am;
-      } while (x <= 0.0);
-      e = (1.0 + y*y)*exp(am*log(x/am) - s*y);
-    } while (gmx_rng_uniform_real(rng) > e);
-  }
 
-  return x;
+    return x;
 }
 
 static real vrescale_sumnoises(int nn,gmx_rng_t rng)
index 30ccfd260f0b3fcfa37b02e1024643ede8495329..abc08743e2af1b6e6b8dea8bec54e9db7a179b65 100644 (file)
@@ -8596,7 +8596,7 @@ void dd_partition_system(FILE            *fplog,
     if (ir->bRot)
     {
         /* Update the local rotation groups */
-        dd_make_local_rotation_groups(dd,ir->rot,mdatoms);
+        dd_make_local_rotation_groups(dd,ir->rot);
     }
 
 
index 1f225c3411d048c0368b9d8b9277f0aab78d7b4d..89740bcb51634ddc4e0966b6e09eaaf0f4044384 100644 (file)
@@ -202,7 +202,7 @@ void pr_ebin(FILE *fp,t_ebin *eb,int index,int nener,int nperline,
     char buf[30];
 
     rc = 0;
-       
+
     if (index < 0)
     {
         gmx_fatal(FARGS,"Invalid index in pr_ebin: %d",index);
@@ -215,11 +215,12 @@ void pr_ebin(FILE *fp,t_ebin *eb,int index,int nener,int nperline,
     {
         nener = index + nener;
     }
-       for(i=index; (i<nener) && rc>=0; ) {
-               if (bPrHead)
+    for(i=index; (i<nener) && rc>=0; ) 
+    {
+        if (bPrHead)
         {
-                       i0=i;
-                       for(j=0; (j<nperline) && (i<nener) && rc>=0; j++,i++)
+            i0=i;
+            for(j=0; (j<nperline) && (i<nener) && rc>=0; j++,i++)
             {
                 if (strncmp(eb->enm[i].name,"Pres",4) == 0)
                 {
@@ -232,33 +233,34 @@ void pr_ebin(FILE *fp,t_ebin *eb,int index,int nener,int nperline,
                     rc = fprintf(fp,"%15s",eb->enm[i].name);
                 }
             }
-                       
-                       if (rc >= 0)
+
+            if (rc >= 0)
             {
-                               rc = fprintf(fp,"\n");
+                rc = fprintf(fp,"\n");
             }
-            
-                       i=i0;
-               }
-               for(j=0; (j<nperline) && (i<nener) && rc>=0; j++,i++)
+
+            i=i0;
+        }
+        for(j=0; (j<nperline) && (i<nener) && rc>=0; j++,i++)
         {
             switch (prmode) {
-            case eprNORMAL: ee = eb->e[i].e; break;
-            case eprAVER:   ee = eb->e_sim[i].esum/eb->nsum_sim; break;
-            default: gmx_fatal(FARGS,"Invalid print mode %d in pr_ebin",prmode);
+                case eprNORMAL: ee = eb->e[i].e; break;
+                case eprAVER:   ee = eb->e_sim[i].esum/eb->nsum_sim; break;
+                default: gmx_fatal(FARGS,"Invalid print mode %d in pr_ebin",
+                                   prmode);
             }
-                       
-                       rc = fprintf(fp,"   %12.5e",ee);
-               }
-               if (rc >= 0)
+
+            rc = fprintf(fp,"   %12.5e",ee);
+        }
+        if (rc >= 0)
         {
-                       rc = fprintf(fp,"\n");
+            rc = fprintf(fp,"\n");
         }
-       }
-       if (rc < 0)
-       
-               gmx_fatal(FARGS,"Cannot write to logfile; maybe you are out of quota?");
-       }
+    }
+    if (rc < 0)
+    { 
+        gmx_fatal(FARGS,"Cannot write to logfile; maybe you are out of quota?");
+    }
 }
 
 #ifdef DEBUGEBIN
index 325ab67156555c26a32fea5673271169e9bd526d..6fa2d7b6b048d625d086c9fc3a216e630e9047d8 100644 (file)
@@ -154,43 +154,6 @@ int print_nblist(int natoms, t_nblist *nl)
     return 0;    
 }
 
-typedef union {
-    real numlog;
-    int exp;
-} u_table;
-
-void fill_log_table(const int n, real *table)
-{
-    u_table log_table;
-    real logfactor;
-    int i;
-    
-    int incr = 1 << (23-n);
-    int p=pow(2,n);
-
-    logfactor = 1.0/log(2.0);
-    
-    log_table.exp = 0x3F800000;
-    
-    for(i=0;i<p;++i)
-    {
-        /* log2(numlog)=log(numlog)/log(2.0) */
-        table[i]=log(log_table.numlog)*logfactor; 
-        log_table.exp+=incr;
-    }
-}
-
-
-real table_log(real val, const real *table, const int n)
-{
-    int *const exp_ptr = ((int*)&val);
-    int x              = *exp_ptr;
-    const int log_2    = ((x>>23) & 255) - 127;
-    x &= 0x7FFFFF;
-    x = x >> (23-n);
-    val = table[x];
-    return ((val+log_2)*0.69314718);  
-}
 
 void gb_pd_send(t_commrec *cr, real *send_data, int nr)
 {
@@ -418,11 +381,6 @@ int init_gb_still(const t_commrec *cr, t_forcerec  *fr,
     return 0;
 }
 
-
-
-#define LOG_TABLE_ACCURACY 15 /* Accuracy of the table logarithm */
-
-
 /* Initialize all GB datastructs and compute polarization energies */
 int init_gb(gmx_genborn_t **p_born,
             const t_commrec *cr, t_forcerec *fr, const t_inputrec *ir,
@@ -443,7 +401,6 @@ int init_gb(gmx_genborn_t **p_born,
     snew(born,1);
     *p_born = born;
 
-       born->nr = fr->natoms_force;
     born->nr  = natoms;
     
     snew(born->drobc, natoms);
@@ -537,12 +494,6 @@ int init_gb(gmx_genborn_t **p_born,
         }
     }
         
-    /* Init the logarithm table */
-    p=pow(2,LOG_TABLE_ACCURACY);
-    snew(born->log_table, p);
-    
-    fill_log_table(LOG_TABLE_ACCURACY, born->log_table);
-    
     /* Allocate memory for work arrays for temporary use */
     snew(born->work,natoms+4);
     snew(born->count,natoms);
@@ -722,8 +673,8 @@ calc_gb_rad_hct(t_commrec *cr,t_forcerec *fr,int natoms, gmx_localtop_t *top,
     {
         ai     = nl->iinr[i];
             
-        nj0    = nl->jindex[ai];            
-        nj1    = nl->jindex[ai+1];
+        nj0    = nl->jindex[i];            
+        nj1    = nl->jindex[i+1];
         
         /* Load shifts for this list */
         shift   = nl->shift[i];
@@ -789,8 +740,6 @@ calc_gb_rad_hct(t_commrec *cr,t_forcerec *fr,int natoms, gmx_localtop_t *top,
                 sk2_rinv = sk2*rinv;
                 prod     = 0.25*sk2_rinv;
                 
-                /* log_term = table_log(uij*lij_inv,born->log_table,
-                   LOG_TABLE_ACCURACY); */
                 log_term = log(uij*lij_inv);
                 
                 tmp      = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term +
@@ -1013,7 +962,6 @@ calc_gb_rad_obc(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                 
                 log_term = log(uij*lij_inv);
                 
-                /* log_term = table_log(uij*lij_inv,born->log_table,LOG_TABLE_ACCURACY); */
                 tmp      = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term + prod*(-diff2);
                 
                 if(rai < sk-dr)
@@ -1604,35 +1552,31 @@ real calc_gb_chainrule(int natoms, t_nblist *nl, real *dadx, real *dvda, rvec x[
     n  = 0;    
     rb = born->work;
         
-    
-    n0 = md->start;
-    n1 = md->start+md->homenr+1+natoms/2;
-    
+  n0 = 0;
+  n1 = natoms;
+  
     if(gb_algorithm==egbSTILL) 
     {
         for(i=n0;i<n1;i++)
         {
-            k = i % natoms;
-            rbi   = born->bRad[k];
-            rb[k] = (2 * rbi * rbi * dvda[k])/ONE_4PI_EPS0;
+          rbi   = born->bRad[i];
+          rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
         }
     }
     else if(gb_algorithm==egbHCT) 
     {
         for(i=n0;i<n1;i++)
         {
-            k = i % natoms;
-            rbi   = born->bRad[k];
-            rb[k] = rbi * rbi * dvda[k];
+          rbi   = born->bRad[i];
+          rb[i] = rbi * rbi * dvda[i];
         }
     }
     else if(gb_algorithm==egbOBC) 
     {
         for(i=n0;i<n1;i++)
         {
-            k = i % natoms;
-            rbi   = born->bRad[k];
-            rb[k] = rbi * rbi * born->drobc[k] * dvda[k];
+          rbi   = born->bRad[i];
+          rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
         }
     }
     
@@ -1784,17 +1728,17 @@ calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_localtop_t
 #if ( defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || (defined(GMX_DOUBLE) && defined(GMX_SSE2)) )
     if(fr->UseOptimizedKernels)
     {
-        calc_gb_chainrule_sse2_double(born->nr, &(fr->gblist), fr->dadx, fr->dvda, 
+        calc_gb_chainrule_sse2_double(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, 
                                       x[0], f[0], fr->fshift[0],  fr->shift_vec[0],
                                       gb_algorithm, born, md); 
     }
     else
     {
-        calc_gb_chainrule(born->nr, &(fr->gblist), fr->dadx, fr->dvda, 
+        calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, 
                           x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md); 
     }
 #else
-    calc_gb_chainrule(born->nr, &(fr->gblist), fr->dadx, fr->dvda, 
+    calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, 
                       x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md);
 #endif
     
@@ -1804,19 +1748,19 @@ calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_localtop_t
     /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
     if(fr->UseOptimizedKernels)
     {
-        calc_gb_chainrule_sse2_single(born->nr, &(fr->gblist), fr->dadx, fr->dvda, 
+        calc_gb_chainrule_sse2_single(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, 
                                       x[0], f[0], fr->fshift[0], fr->shift_vec[0], 
                                       gb_algorithm, born, md);
     }
     else
     {
-        calc_gb_chainrule(born->nr, &(fr->gblist), fr->dadx, fr->dvda, 
+        calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, 
                           x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md);    
     }
     
 #else
     /* Calculate the forces due to chain rule terms with non sse code */
-    calc_gb_chainrule(born->nr, &(fr->gblist), fr->dadx, fr->dvda, 
+    calc_gb_chainrule(fr->natoms_force, &(fr->gblist), fr->dadx, fr->dvda, 
                       x, f, fr->fshift, fr->shift_vec, gb_algorithm, born, md);    
 #endif    
 #endif
index 43ce789971492c29ae3c81ec828315c3a7d0d8de..36111bef22132bf74ac91a79085b45a803eecc1c 100644 (file)
@@ -366,7 +366,7 @@ genborn_allvsall_calc_still_radii(t_forcerec *           fr,
     
     natoms              = mdatoms->nr;
        ni0                 = mdatoms->start;
-       ni1                 = mdatoms->homenr;
+       ni1                 = mdatoms->start+mdatoms->homenr;
     factor  = 0.5*ONE_4PI_EPS0;
     n = 0;
     
@@ -583,7 +583,7 @@ genborn_allvsall_calc_hct_obc_radii(t_forcerec *           fr,
     
     natoms              = mdatoms->nr;
        ni0                 = mdatoms->start;
-       ni1                 = mdatoms->homenr;
+       ni1                 = mdatoms->start+mdatoms->homenr;
 
     n = 0;
     prod = 0;
@@ -972,7 +972,7 @@ genborn_allvsall_calc_chainrule(t_forcerec *           fr,
     
     natoms              = mdatoms->nr;
        ni0                 = mdatoms->start;
-       ni1                 = mdatoms->homenr;
+       ni1                 = mdatoms->start+mdatoms->homenr;
     dadx                = fr->dadx;
     
     aadata = (gmx_allvsallgb2_data_t *)work;
index c2673c972ca797048016a7aedf499738201e3c9f..fb207b4f5f5e07a951489407425188afec23d96e 100644 (file)
@@ -792,145 +792,142 @@ calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dv
     
        rb     = born->work; 
     
-    jjnr   = nl->jjnr;
-    
+  jjnr   = nl->jjnr;
+  
        /* Loop to get the proper form for the Born radius term, sse style */   
-    n0 = md->start;
-    n1 = md->start+md->homenr+1+natoms/2;
+  n0 = 0;
+  n1 = natoms;
     
        if(gb_algorithm==egbSTILL) 
        {
                for(i=n0;i<n1;i++)
                {
-            k = i % natoms;
-                       rbi   = born->bRad[k];
-                       rb[k] = (2 * rbi * rbi * dvda[k])/ONE_4PI_EPS0;
+      rbi   = born->bRad[i];
+                       rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
                }
        }
        else if(gb_algorithm==egbHCT) 
        {
                for(i=n0;i<n1;i++)
                {
-            k = i % natoms;
-                       rbi   = born->bRad[k];
-                       rb[k] = rbi * rbi * dvda[k];
+      rbi   = born->bRad[i];
+                       rb[i] = rbi * rbi * dvda[i];
                }
        }
        else if(gb_algorithm==egbOBC) 
        {
                for(i=n0;i<n1;i++)
                {
-            k = i % natoms;
-                       rbi   = born->bRad[k];
-                       rb[k] = rbi * rbi * born->drobc[k] * dvda[k];
+      rbi   = born->bRad[i];
+                       rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
                }
        }
     
-    jz = _mm_setzero_pd();
-    
-    n = j3A = j3B = 0;
-    
+  jz = _mm_setzero_pd();
+  
+  n = j3A = j3B = 0;
+  
        for(i=0;i<nl->nri;i++)
        {
-        ii     = nl->iinr[i];
+    ii     = nl->iinr[i];
                ii3        = ii*3;
-        is3    = 3*nl->shift[i];     
-        shX    = shiftvec[is3];  
-        shY    = shiftvec[is3+1];
-        shZ    = shiftvec[is3+2];
-        nj0    = nl->jindex[i];      
-        nj1    = nl->jindex[i+1];    
-        
-        ix     = _mm_set1_pd(shX+x[ii3+0]);
+    is3    = 3*nl->shift[i];     
+    shX    = shiftvec[is3];  
+    shY    = shiftvec[is3+1];
+    shZ    = shiftvec[is3+2];
+    nj0    = nl->jindex[i];      
+    nj1    = nl->jindex[i+1];    
+    
+    ix     = _mm_set1_pd(shX+x[ii3+0]);
                iy     = _mm_set1_pd(shY+x[ii3+1]);
                iz     = _mm_set1_pd(shZ+x[ii3+2]);
-                               
+    
                rbai   = _mm_load1_pd(rb+ii);                   
                fix    = _mm_setzero_pd();
                fiy    = _mm_setzero_pd();
                fiz    = _mm_setzero_pd();      
+    
         
-        
-        for(k=nj0;k<nj1-1;k+=2)
+    for(k=nj0;k<nj1-1;k+=2)
                {
                        jnrA        = jjnr[k];   
                        jnrB        = jjnr[k+1];
-            
-            j3A         = 3*jnrA;  
+      
+      j3A         = 3*jnrA;  
                        j3B         = 3*jnrB;
             
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
-            
+      GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
+      
                        dx          = _mm_sub_pd(ix,jx);
                        dy          = _mm_sub_pd(iy,jy);
                        dz          = _mm_sub_pd(iz,jz);
-            
-            GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);
-            
+      
+      GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);
+      
                        /* load chain rule terms for j1-4 */
                        f_gb        = _mm_load_pd(dadx);
                        dadx += 2;
                        f_gb_ai     = _mm_load_pd(dadx);
                        dadx += 2;
                        
-            /* calculate scalar force */
-            f_gb    = _mm_mul_pd(f_gb,rbai); 
-            f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
-            f_gb    = _mm_add_pd(f_gb,f_gb_ai);
-            
-            tx     = _mm_mul_pd(f_gb,dx);
-            ty     = _mm_mul_pd(f_gb,dy);
-            tz     = _mm_mul_pd(f_gb,dz);
-            
-            fix    = _mm_add_pd(fix,tx);
-            fiy    = _mm_add_pd(fiy,ty);
-            fiz    = _mm_add_pd(fiz,tz);
-            
-            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
+      /* calculate scalar force */
+      f_gb    = _mm_mul_pd(f_gb,rbai); 
+      f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
+      f_gb    = _mm_add_pd(f_gb,f_gb_ai);
+      
+      tx     = _mm_mul_pd(f_gb,dx);
+      ty     = _mm_mul_pd(f_gb,dy);
+      tz     = _mm_mul_pd(f_gb,dz);
+      
+      fix    = _mm_add_pd(fix,tx);
+      fiy    = _mm_add_pd(fiy,ty);
+      fiz    = _mm_add_pd(fiz,tz);
+      
+      GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
                }
-        
+    
                /*deal with odd elements */
                if(k<nj1) 
         {
-                       jnrA        = jjnr[k];   
-            j3A         = 3*jnrA;  
-            
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
-            
-                       dx          = _mm_sub_sd(ix,jx);
-                       dy          = _mm_sub_sd(iy,jy);
-                       dz          = _mm_sub_sd(iz,jz);
-            
-            GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);
-            
-                       /* load chain rule terms */
-                       f_gb        = _mm_load_pd(dadx);
-                       dadx += 2;
-                       f_gb_ai     = _mm_load_pd(dadx);
-                       dadx += 2;
-                       
-            /* calculate scalar force */
-            f_gb    = _mm_mul_sd(f_gb,rbai); 
-            f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
-            f_gb    = _mm_add_sd(f_gb,f_gb_ai);
-            
-            tx     = _mm_mul_sd(f_gb,dx);
-            ty     = _mm_mul_sd(f_gb,dy);
-            tz     = _mm_mul_sd(f_gb,dz);
-            
-            fix    = _mm_add_sd(fix,tx);
-            fiy    = _mm_add_sd(fiy,ty);
-            fiz    = _mm_add_sd(fiz,tz);
-            
-            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
+          jnrA        = jjnr[k];   
+          j3A         = 3*jnrA;  
+          
+          GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
+          
+          dx          = _mm_sub_sd(ix,jx);
+          dy          = _mm_sub_sd(iy,jy);
+          dz          = _mm_sub_sd(iz,jz);
+          
+          GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);
+          
+          /* load chain rule terms */
+          f_gb        = _mm_load_pd(dadx);
+          dadx += 2;
+          f_gb_ai     = _mm_load_pd(dadx);
+          dadx += 2;
+          
+          /* calculate scalar force */
+          f_gb    = _mm_mul_sd(f_gb,rbai); 
+          f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
+          f_gb    = _mm_add_sd(f_gb,f_gb_ai);
+          
+          tx     = _mm_mul_sd(f_gb,dx);
+          ty     = _mm_mul_sd(f_gb,dy);
+          tz     = _mm_mul_sd(f_gb,dz);
+          
+          fix    = _mm_add_sd(fix,tx);
+          fiy    = _mm_add_sd(fiy,ty);
+          fiz    = _mm_add_sd(fiz,tz);
+          
+          GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
         } 
-        
+    
                /* fix/fiy/fiz now contain four partial force terms, that all should be
-         * added to the i particle forces and shift forces. 
-         */
+     * added to the i particle forces and shift forces. 
+     */
                gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,f+ii3,fshift+is3);
        }       
-    
+  
        return 0;       
 }
 
index 99400b955ddd1b4c2c69b0d5996cc62bb08e23f7..9ff8bfc362f42f58d6110ad9134281fbb059754d 100644 (file)
@@ -1344,34 +1344,31 @@ float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float
        /* Loop to get the proper form for the Born radius term, sse style */
        offset=natoms%4;
        
-    n0 = md->start;
-    n1 = md->start+md->homenr+1+natoms/2;
-    
+  n0 = 0;
+  n1 = natoms;
+  
        if(gb_algorithm==egbSTILL) 
        {
                for(i=n0;i<n1;i++)
                {
-            k = i % natoms;
-                       rbi   = born->bRad[k];
-                       rb[k] = (2 * rbi * rbi * dvda[k])/ONE_4PI_EPS0;
+      rbi   = born->bRad[i];
+                       rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
                }
        }
        else if(gb_algorithm==egbHCT) 
        {
                for(i=n0;i<n1;i++)
                {
-            k = i % natoms;
-                       rbi   = born->bRad[k];
-                       rb[k] = rbi * rbi * dvda[k];
+      rbi   = born->bRad[i];
+                       rb[i] = rbi * rbi * dvda[i];
                }
        }
        else if(gb_algorithm==egbOBC) 
        {
                for(i=n0;i<n1;i++)
                {
-            k = i % natoms;
-                       rbi   = born->bRad[k];
-                       rb[k] = rbi * rbi * born->drobc[k] * dvda[k];
+      rbi   = born->bRad[i];
+                       rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
                }
        }
     
index 23999584166b17d8a492bdd6e21379bc8e581dd5..342bba6ceac386795fc58187dfcd12c57b137bc6 100644 (file)
@@ -981,5 +981,5 @@ gmx_many_fft_destroy(gmx_fft_t    fft)
 
 #else
 int
-gmx_fft_fftw2_empty;
-#endif /* GMX_FFT_FFTW2 */
+gmx_fft_fftw3_empty;
+#endif /* GMX_FFT_FFTW3 */
index 851153811bbd80b533e87ca86612c179eab3af43..47fbcdc4802993e1a1c880f016622e62c0c0aac5 100644 (file)
@@ -330,18 +330,23 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc,double cycles[])
 }
 
 static void print_cycles(FILE *fplog, double c2t, const char *name, int nnodes,
-                        int n, gmx_cycles_t c, gmx_cycles_t tot)
+                         int n, double c, double tot)
 {
-  char num[11];
+    char num[11];
   
-  if (c > 0) {
-    if (n > 0)
-      sprintf(num,"%10d",n);
-    else
-      sprintf(num,"          ");
-    fprintf(fplog," %-19s %4d %10s %12.3f %10.1f   %5.1f\n",
-           name,nnodes,num,c*1e-9,c*c2t,100*(double)c/(double)tot);
-  }
+    if (c > 0)
+    {
+        if (n > 0)
+        {
+            sprintf(num,"%10d",n);
+        }
+        else
+        {
+            sprintf(num,"          ");
+        }
+        fprintf(fplog," %-19s %4d %10s %12.3f %10.1f   %5.1f\n",
+                name,nnodes,num,c*1e-9,c*c2t,100*c/tot);
+    }
 }
 
 static gmx_bool subdivision(int ewc)
@@ -350,7 +355,7 @@ static gmx_bool subdivision(int ewc)
 }
 
 void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
-                    gmx_wallcycle_t wc, double cycles[])
+                     gmx_wallcycle_t wc, double cycles[])
 {
     double c2t,tot,sum;
     int    i,j,npp;
index d0fad47e1a09ebed74c47e40d38e8a7fab5c46ee..394ca899232c84da7056ea7b194111faabeed95c 100644 (file)
@@ -63,8 +63,10 @@ static const char *conrmsd_nm[] = { "Constr. rmsd", "Constr.2 rmsd" };
 
 static const char *boxs_nm[] = { "Box-X", "Box-Y", "Box-Z" };
 
-static const char *tricl_boxs_nm[] = { "Box-XX", "Box-YX", "Box-YY",
-    "Box-ZX", "Box-ZY", "Box-ZZ" };
+static const char *tricl_boxs_nm[] = { 
+    "Box-XX", "Box-YY", "Box-ZZ",
+    "Box-YX", "Box-ZX", "Box-ZY" 
+};
 
 static const char *vol_nm[] = { "Volume" };
 
@@ -288,8 +290,9 @@ t_mdebin *init_mdebin(ener_file_t fp_ene,
     }
     if (md->bDynBox)
     {
-        md->ib    = get_ebin_space(md->ebin, md->bTricl ? NTRICLBOXS :
-                                   NBOXS, md->bTricl ? tricl_boxs_nm : boxs_nm,
+        md->ib    = get_ebin_space(md->ebin, 
+                                   md->bTricl ? NTRICLBOXS : NBOXS, 
+                                   md->bTricl ? tricl_boxs_nm : boxs_nm,
                                    unit_length);
         md->ivol  = get_ebin_space(md->ebin, 1, vol_nm,  unit_volume);
         md->idens = get_ebin_space(md->ebin, 1, dens_nm, unit_density_SI);
@@ -677,20 +680,23 @@ void upd_mdebin(t_mdebin *md, gmx_bool write_dhdl,
     }
     if (md->bDynBox)
     {
+        int nboxs;
         if(md->bTricl)
         {
             bs[0] = box[XX][XX];
-            bs[1] = box[YY][XX];
-            bs[2] = box[YY][YY];
-            bs[3] = box[ZZ][XX];
-            bs[4] = box[ZZ][YY];
-            bs[5] = box[ZZ][ZZ];
+            bs[1] = box[YY][YY];
+            bs[2] = box[ZZ][ZZ];
+            bs[3] = box[YY][XX];
+            bs[4] = box[ZZ][XX];
+            bs[5] = box[ZZ][YY];
+            nboxs=NTRICLBOXS;
         }
         else
         {
             bs[0] = box[XX][XX];
             bs[1] = box[YY][YY];
             bs[2] = box[ZZ][ZZ];
+            nboxs=NBOXS;
         }
         vol  = box[XX][XX]*box[YY][YY]*box[ZZ][ZZ];
         dens = (tmass*AMU)/(vol*NANO*NANO*NANO);
@@ -712,7 +718,8 @@ void upd_mdebin(t_mdebin *md, gmx_bool write_dhdl,
                 }
             }
         }
-        add_ebin(md->ebin,md->ib   ,NBOXS,bs   ,bSum);
+
+        add_ebin(md->ebin,md->ib   ,nboxs,bs   ,bSum);
         add_ebin(md->ebin,md->ivol ,1    ,&vol ,bSum);
         add_ebin(md->ebin,md->idens,1    ,&dens,bSum);
         add_ebin(md->ebin,md->ipv  ,1    ,&pv  ,bSum);
index aec0dc6a5ae16cad21cbbb4aee7fb425fd7bc810..c30a47540a1bccb43c958d8888df921372abe09e 100644 (file)
@@ -1549,9 +1549,6 @@ static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
     
     
     order = pme->pme_order;
-    thx   = atc->theta[XX];
-    thy   = atc->theta[YY];
-    thz   = atc->theta[ZZ];
     
     energy = 0;
     for(n=0; (n<atc->n); n++) {
@@ -2352,7 +2349,7 @@ void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
     /* We only use the A-charges grid */
     grid = pme->pmegridA;
 
-    spread_on_grid(pme,atc,grid,TRUE,FALSE);
+    spread_on_grid(pme,atc,NULL,TRUE,FALSE);
 
     *V = gather_energy_bsplines(pme,grid,atc);
 }
@@ -2483,6 +2480,11 @@ int gmx_pme_do(gmx_pme_t pme,
         }
         atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
     }
+    else
+    {
+        /* This could be necessary for TPI */
+        pme->atc[0].n = homenr;
+    }
     
     for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
         if (q == 0) {
@@ -2657,12 +2659,12 @@ int gmx_pme_do(gmx_pme_t pme,
             }
 #endif
             where();
+
+            unwrap_periodic_pmegrid(pme,grid);
         }
 
         if (flags & GMX_PME_CALC_F)
         {
-            unwrap_periodic_pmegrid(pme,grid);
-            
             /* interpolate forces for our local atoms */
             GMX_BARRIER(cr->mpi_comm_mygroup);
             GMX_MPE_LOG(ev_gather_f_bsplines_start);
index d2a134f2a4a025c7ad0df6ca33d4d7c9acc17a3d..00dda8707ecdec44174d7a1e8a5ea7cad1828713 100644 (file)
@@ -57,6 +57,7 @@
 #include "gmxfio.h"
 #include "mpelogging.h"
 #include "groupcoord.h"
+#include "pull_rotation.h"
 #include "gmx_sort.h"
 
 
@@ -3014,7 +3015,7 @@ static void init_rot_group(FILE *fplog,t_commrec *cr,int g,t_rotgrp *rotg,
 }
 
 
-extern void dd_make_local_rotation_groups(gmx_domdec_t *dd,t_rot *rot,t_mdatoms *md)
+extern void dd_make_local_rotation_groups(gmx_domdec_t *dd,t_rot *rot)
 {
     gmx_ga2la_t ga2la;
     int g;
index 753f4436cfd436164bf42e69ee76390130ebc4e5..1721941b6afddc751243f0f8dd2044b2fb21a6ce 100644 (file)
@@ -24,6 +24,7 @@ foreach(PROG ${NGMX_PROGRAMS})
 endforeach(PROG) 
 
 install(TARGETS ${NGMX_PROGRAMS}
+        COMPONENT ngmx
         RUNTIME DESTINATION ${BIN_INSTALL_DIR})
 
 endif(X11_FOUND)
index 18f8b7f6223a1bc5483e1d0f4afa843d8341f6d2..9274b1469070bcd22561311206f1d08281d44902 100644 (file)
@@ -47,7 +47,7 @@ set(GMX_TOOLS_PROGRAMS
     g_helixorient g_principal g_dipoles g_disre g_dist
     g_dyndom g_enemat g_energy g_lie g_filter g_gyrate
     g_h2order g_hbond g_helix g_mindist g_msd g_morph g_nmeig
-    g_nmens g_order g_polystat g_potential g_rama g_rdf g_rms
+    g_nmens g_order g_kinetics g_polystat g_potential g_rama g_rdf g_rms
     g_rmsf g_rotacf g_saltbr g_sas g_select g_sgangle g_sham g_sorient
     g_spol g_spatial g_tcaf g_traj g_tune_pme g_vanhove
     g_velacc g_clustsize g_mdmat g_wham g_sigeps g_bar
@@ -62,9 +62,13 @@ foreach(TOOL ${GMX_TOOLS_PROGRAMS})
 endforeach(TOOL ${GMX_TOOLS_PROGRAMS}) 
 
 
+install(TARGETS gmxana DESTINATION ${LIB_INSTALL_DIR} COMPONENT runtime)
 install(TARGETS ${GMX_TOOLS_PROGRAMS}
-       gmxana DESTINATION ${LIB_INSTALL_DIR}   
-       RUNTIME DESTINATION ${BIN_INSTALL_DIR})
+        DESTINATION ${BIN_INSTALL_DIR}
+        COMPONENT runtime)
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgmxana.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libgmxana.pc @ONLY)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmxana.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig RENAME "libgmxana${GMX_LIBS_SUFFIX}.pc")
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmxana.pc
+        DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
+        RENAME "libgmxana${GMX_LIBS_SUFFIX}.pc"
+        COMPONENT development)
index 6e8c00109802c5cbfcc8718bd544211c1175680a..5c899daafc73b2f0caf0781eb347dc0139d32574 100644 (file)
@@ -2096,7 +2096,9 @@ static void read_edr_rawdh_block(samples_t **smp, int *ndu, t_enxblock *blk,
          (blk->sub[0].nr < 1) ||
          (blk->sub[1].nr < 1) )
     {
-        gmx_fatal(FARGS, "Unexpected block data in file %s", filename);
+        gmx_fatal(FARGS, 
+                  "Unexpected/corrupted block data in file %s around time %g.", 
+                  filename, start_time);
     }
    
     derivative = blk->sub[0].ival[0]; 
@@ -2120,11 +2122,11 @@ static void read_edr_rawdh_block(samples_t **smp, int *ndu, t_enxblock *blk,
          (  (derivative!=0) != (s->derivative!=0) ) )
     {
         fprintf(stderr, "Got foreign lambda=%g, expected: %g\n", 
-                s->foreign_lambda, foreign_lambda);
-        fprintf(stderr, "Got derivative=%d, derivative: %d\n", 
+                foreign_lambda, s->foreign_lambda);
+        fprintf(stderr, "Got derivative=%d, expected: %d\n", 
                 derivative, s->derivative);
-        gmx_fatal(FARGS, "Inconsistent data in file %s around t=%g", filename,
-                  start_time);
+        gmx_fatal(FARGS, "Corrupted data in file %s around t=%g.", 
+                  filename, start_time);
     }
 
     /* make room for the data */
@@ -2177,7 +2179,9 @@ static samples_t *read_edr_hist_block(int *nsamples, t_enxblock *blk,
          (blk->sub[0].nr < 2)  ||
          (blk->sub[1].nr < 2) )
     {
-        gmx_fatal(FARGS, "Unexpected block data in file %s", filename);
+        gmx_fatal(FARGS, 
+                  "Unexpected/corrupted block data in file %s around time %g", 
+                  filename, start_time);
     }
 
     nhist=blk->nsub-2;
@@ -2187,7 +2191,9 @@ static samples_t *read_edr_hist_block(int *nsamples, t_enxblock *blk,
     }
     if (nhist > 2)
     {
-        gmx_fatal(FARGS, "Unexpected block data in file %s", filename);
+        gmx_fatal(FARGS, 
+                  "Unexpected/corrupted block data in file %s around time %g", 
+                  filename, start_time);
     }
 
     snew(s, 1);
index d4769e5d73f9ef1a1c0ed944fe0edbd145445329..12e16ed21e141234d10f06363ed9bbdea27a95cb 100644 (file)
@@ -1649,7 +1649,10 @@ static void do_merge(t_hbdata *hb,int ntmp,
         srenew(hb0->h[0],4+nnframes/hb->wordlen);
         srenew(hb0->g[0],4+nnframes/hb->wordlen);  
     }
-    clearPshift(&(hb->per->pHist[a1][a2]));
+    if (NULL != hb->per->pHist)
+    {
+        clearPshift(&(hb->per->pHist[a1][a2]));
+    }
 
     /* Copy temp array to target array */
     for(m=0; (m<=nnframes); m++) {
@@ -3847,7 +3850,7 @@ int gmx_hbond(int argc,char *argv[])
                 trrStatus = (read_next_x(oenv,status,&t,natoms,x,box));
                 nframes++;      /*    +   */
             }      /*                 +   */
-#ifdef HAVE_OPENMP /* ++++++++++++++++   */
+#ifdef HAVE_OPENMP /* +++++++++++++++++   */
 #pragma omp barrier
 #endif
         } while (trrStatus);
index 9abef117ac84e00568a7034104b80bd0ce9b791c..9d1b8094ce6e10150412ac2e4a872a5372fff79d 100644 (file)
@@ -169,6 +169,13 @@ void calc_potential(const char *fn, atom_id **index, int gnx[],
          
     for (n = 0; n < nr_grps; n++)
     {      
+        /* Check whether we actually have all positions of the requested index
+         * group in the trajectory file */
+        if (gnx[n] > natoms)
+        {
+            gmx_fatal(FARGS, "You selected a group with %d atoms, but only %d atoms\n"
+                             "were found in the trajectory.\n", gnx[n], natoms);
+        }
       for (i = 0; i < gnx[n]; i++)   /* loop over all atoms in index file */
       {
        if (bSpherical)
@@ -364,7 +371,7 @@ void plot_potential(double *potential[], double *charge[], double *field[],
     for (n = 0; n < nr_grps; n++)
     {
       fprintf(pot,"   %20.16g", potential[n][slice]);
-      fprintf(fie,"   %20.16g", field[n][slice]);
+      fprintf(fie,"   %20.16g", field[n][slice]/1e9);  /* convert to V/nm */
       fprintf(cha,"   %20.16g", charge[n][slice]);
     }
     fprintf(pot,"\n");
index 15b1bd82a75931f0598eca8bcabcd34f98b4316a..ee4311d6c98b2a9549dc049b6818a254ec787f75 100644 (file)
@@ -730,7 +730,7 @@ static int split_chain(t_atoms *atoms,rvec *x,
 static gmx_bool check_have_atoms(t_atoms *atoms, char *string)
 {
   if ( atoms==NULL ) {
-    printf("Can not process '%s' without atoms info\n", string);
+    printf("Can not process '%s' without atom info, use option -f\n", string);
     return FALSE;
   } else
     return TRUE;