Fixes SSE/AVX compilation under Windows

author Erik Lindahl <erik@kth.se>

Fri, 28 Dec 2012 18:40:53 +0000 (19:40 +0100)

committer Erik Lindahl <erik@kth.se>

Thu, 10 Jan 2013 07:50:03 +0000 (08:50 +0100)
author Erik Lindahl <erik@kth.se>
Fri, 28 Dec 2012 18:40:53 +0000 (19:40 +0100)
committer Erik Lindahl <erik@kth.se>
Thu, 10 Jan 2013 07:50:03 +0000 (08:50 +0100)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index c778d56c8cf0fe689358b37f204a769cba63c862..c9459e62aa6bfad12ba66e201491ce7fb2f37db5 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -736,11 +736,16 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1")
          GMX_TEST_CFLAG(MSVC_SSE4_CFLAG "/arch:SSE4.1" ACCELERATION_C_FLAGS)
      endif(NOT GNU_SSE4_CFLAG AND GMX_NATIVE_WINDOWS)
      if (NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
-        message(WARNING "No C SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
          # Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
-        # intrinsics when SSE2 support is enabled, so we try that instead.
+        # intrinsics when SSE2 support is enabled, so we try that instead first.
         if (GMX_NATIVE_WINDOWS)
              GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" ACCELERATION_C_FLAGS)
+            # Only complain when the SSE2 fallback flag was rejected as well.
+            if (NOT MSVC_SSE2_CFLAG)
+                message(WARNING "Neither SSE4.1 or SSE2 seems to be supported by your Windows compiler. Something is likely broken.")
+            endif()
+        else()
+            message(WARNING "No C SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance")
          endif()
      endif(NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
  
diff --git a/include/gmx_math_x86_avx_128_fma_double.h b/include/gmx_math_x86_avx_128_fma_double.h

index b751a7e1c9d38fd0971391a7d1546c20a0901ee6..e098a4adc7358e4b773bcd65d83932488a3e4eb8 100644 (file)
--- a/include/gmx_math_x86_avx_128_fma_double.h
+++ b/include/gmx_math_x86_avx_128_fma_double.h
@@ -35,6 +35,14 @@
  #ifndef _gmx_math_x86_avx_128_fma_double_h_
  #define _gmx_math_x86_avx_128_fma_double_h_
  
+#include <immintrin.h> /* AVX */
+#ifdef HAVE_X86INTRIN_H
+#include <x86intrin.h> /* FMA */
+#endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
+
  #include <math.h>
  
  #include "gmx_x86_avx_128_fma.h"
diff --git a/include/gmx_x86_avx_128_fma.h b/include/gmx_x86_avx_128_fma.h

index c704073677622ef4e5310b93f6a24744fb1bd15b..9b0e0bb1e3b79ebe75f5c7aee4986225e07676a4 100644 (file)
--- a/include/gmx_x86_avx_128_fma.h
+++ b/include/gmx_x86_avx_128_fma.h
@@ -40,6 +40,10 @@
  #ifdef HAVE_X86INTRIN_H
  #include <x86intrin.h> /* FMA */
  #endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
+
  
  #include <stdio.h>
  
diff --git a/src/gmxlib/copyrite.c b/src/gmxlib/copyrite.c

index cad3aca3bfe8db9a99c4afde84514ffc0fdb1eee..a1187f7978ae98aca74ca5dfc9257211216b917a 100644 (file)
--- a/src/gmxlib/copyrite.c
+++ b/src/gmxlib/copyrite.c
@@ -656,6 +656,7 @@ void gmx_print_version_info(FILE *fp)
  #else
      fprintf(fp, "Precision:          single\n");
  #endif
+    fprintf(fp, "Memory model:       %u bit\n",(unsigned int)(8*sizeof(void *))); /* sizeof yields size_t, which is not unsigned long everywhere (e.g. Win64); cast to match %u */
  
  #ifdef GMX_THREAD_MPI
      fprintf(fp, "MPI library:        thread_mpi\n");
diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h

index b86c3eee5715d688527f130c666757938c52d5e2..0f076850006660da3ff8fb52aacdaac864c8bbc0 100644 (file)
--- a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h
+++ b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h
@@ -9,16 +9,16 @@
   * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
   * a full list of developers and information, check out http://www.gromacs.org
   *
- * This program is free software; you can redistribute it and/or modify it under 
- * the terms of the GNU Lesser General Public License as published by the Free 
- * Software Foundation; either version 2 of the License, or (at your option) any 
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
   * later version.
   * As a special exception, you may use this file as part of a free software
   * library without restriction.  Specifically, if other files instantiate
   * templates or use macros or inline functions from this file, or you compile
   * this file and link it with other files to produce an executable, this
   * file does not by itself cause the resulting executable to be covered by
- * the GNU Lesser General Public License.  
+ * the GNU Lesser General Public License.
   *
   * In plain-speak: do not worry about classes/macros/templates either - only
   * changes to the library have to be LGPL, not an application linking with it.
@@ -73,16 +73,16 @@ gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                                __m128d xmm1)
  {
      __m128d t2;
-    
+
      t2       = _mm_unpackhi_pd(xmm1,xmm1);
-    _mm_store_sd(ptrA,xmm1);                                           
-    _mm_store_sd(ptrB,t2);                                         
+    _mm_store_sd(ptrA,xmm1);
+    _mm_store_sd(ptrB,t2);
  }
  
  static void
  gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  {
-    _mm_store_sd(ptrA,xmm1);                                        
+    _mm_store_sd(ptrA,xmm1);
  }
  
  
@@ -92,7 +92,7 @@ gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                    double * gmx_restrict ptrB, __m128d xmm1)
  {
      __m128d t1;
-    
+
      t1   = _mm_unpackhi_pd(xmm1,xmm1);
      xmm1 = _mm_add_sd(xmm1,_mm_load_sd(ptrA));
      t1   = _mm_add_sd(t1,_mm_load_sd(ptrB));
@@ -104,7 +104,7 @@ static void
  gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  {
      __m128d tmp;
-    
+
      tmp = gmx_mm_load_1real_pd(ptrA);
      tmp = _mm_add_sd(tmp,xmm1);
      gmx_mm_store_1real_pd(ptrA,tmp);
@@ -119,12 +119,12 @@ gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                               __m128d * gmx_restrict c12)
  {
      __m128d t1,t2,t3;
-    
+
      /* The c6/c12 array should be aligned */
      t1   = _mm_loadu_pd(p1);
      t2   = _mm_loadu_pd(p2);
-    *c6  = _mm_unpacklo_pd(t1,t2);  
-    *c12 = _mm_unpackhi_pd(t1,t2);                    
+    *c6  = _mm_unpacklo_pd(t1,t2);
+    *c12 = _mm_unpackhi_pd(t1,t2);
  }
  
  static gmx_inline void
@@ -139,21 +139,21 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
  
  static gmx_inline void
  gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1,
-                                         __m128d * gmx_restrict y1,
-                                         __m128d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1,
+        __m128d * gmx_restrict y1,
+        __m128d * gmx_restrict z1)
  {
      __m128d mem_xy,mem_z,mem_sxy,mem_sz;
-    
+
      mem_xy  = _mm_loadu_pd(xyz);
      mem_z   = _mm_load_sd(xyz+2);
      mem_sxy = _mm_loadu_pd(xyz_shift);
      mem_sz  = _mm_load_sd(xyz_shift+2);
-    
+
      mem_xy  = _mm_add_pd(mem_xy,mem_sxy);
      mem_z   = _mm_add_pd(mem_z,mem_sz);
-    
+
      *x1  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(0,0));
      *y1  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
      *z1  = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
@@ -162,30 +162,30 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
  {
      __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
-    
+
      t1  = _mm_loadu_pd(xyz);
      t2  = _mm_loadu_pd(xyz+2);
      t3  = _mm_loadu_pd(xyz+4);
      t4  = _mm_loadu_pd(xyz+6);
      t5  = _mm_load_sd(xyz+8);
-    
+
      sxy = _mm_loadu_pd(xyz_shift);
      sz  = _mm_load_sd(xyz_shift+2);
      szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
      syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
-    
+
      t1  = _mm_add_pd(t1,sxy);
      t2  = _mm_add_pd(t2,szx);
      t3  = _mm_add_pd(t3,syz);
      t4  = _mm_add_pd(t4,sxy);
      t5  = _mm_add_sd(t5,sz);
-    
+
      *x1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
      *y1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
      *z1  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
@@ -200,33 +200,33 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
-                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+        __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
  {
      __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
-    
+
      t1  = _mm_loadu_pd(xyz);
      t2  = _mm_loadu_pd(xyz+2);
      t3  = _mm_loadu_pd(xyz+4);
      t4  = _mm_loadu_pd(xyz+6);
      t5  = _mm_loadu_pd(xyz+8);
      t6  = _mm_loadu_pd(xyz+10);
-    
+
      sxy = _mm_loadu_pd(xyz_shift);
      sz  = _mm_load_sd(xyz_shift+2);
      szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
      syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));
-    
+
      t1  = _mm_add_pd(t1,sxy);
      t2  = _mm_add_pd(t2,szx);
      t3  = _mm_add_pd(t3,syz);
      t4  = _mm_add_pd(t4,sxy);
      t5  = _mm_add_pd(t5,szx);
      t6  = _mm_add_pd(t6,syz);
-    
+
      *x1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
      *y1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
      *z1  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
@@ -247,9 +247,9 @@ static gmx_inline void
  gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
  {
-        *x            = _mm_load_sd(p1);
-     *y            = _mm_load_sd(p1+1);
-     *z            = _mm_load_sd(p1+2);
+    *x            = _mm_load_sd(p1);
+    *y            = _mm_load_sd(p1+1);
+    *z            = _mm_load_sd(p1+2);
  }
  
  static gmx_inline void
@@ -258,15 +258,15 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                    __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
  {
-        *x1            = _mm_load_sd(p1);
-     *y1            = _mm_load_sd(p1+1);
-     *z1            = _mm_load_sd(p1+2);
-        *x2            = _mm_load_sd(p1+3);
-     *y2            = _mm_load_sd(p1+4);
-     *z2            = _mm_load_sd(p1+5);
-        *x3            = _mm_load_sd(p1+6);
-     *y3            = _mm_load_sd(p1+7);
-     *z3            = _mm_load_sd(p1+8);
+    *x1            = _mm_load_sd(p1);
+    *y1            = _mm_load_sd(p1+1);
+    *z1            = _mm_load_sd(p1+2);
+    *x2            = _mm_load_sd(p1+3);
+    *y2            = _mm_load_sd(p1+4);
+    *z2            = _mm_load_sd(p1+5);
+    *x3            = _mm_load_sd(p1+6);
+    *y3            = _mm_load_sd(p1+7);
+    *z3            = _mm_load_sd(p1+8);
  }
  
  static gmx_inline void
@@ -313,7 +313,7 @@ gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                    __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
  {
-__m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+    __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
      t1           = _mm_loadu_pd(ptrA);
      t2           = _mm_loadu_pd(ptrB);
      t3           = _mm_loadu_pd(ptrA+2);
@@ -382,106 +382,16 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
  
  
  /* Routines to decrement rvec in memory, typically use for j particle force updates */
-static void
-gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy, __m128d z)
-{
-    __m128d t1,t2;
-    
-    t1 = _mm_loadu_pd(ptrA);
-    t2 = _mm_load_sd(ptrA+2);
-    
-    t1 = _mm_sub_pd(t1,xy);
-    t2 = _mm_sub_sd(t2,z);
-    
-    _mm_storeu_pd(ptrA,t1);
-    _mm_store_sd(ptrA+2,t2);
-}
-
-
-static void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3)
-{
-    __m128d t1,t2;
-    __m128d tA,tB,tC,tD,tE;
-    
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_load_sd(ptrA+8);
-    
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_sd(tE,z3);
-    
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_store_sd(ptrA+8,tE);
-}
-
-static void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3,
-                                         __m128d xy4, __m128d z4)
-{
-    __m128d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF;
-    
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_loadu_pd(ptrA+8);
-    tF   = _mm_loadu_pd(ptrA+10);
-    
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    t3   = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
-    t4   = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-    
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_pd(tE,t3);
-    tF   = _mm_sub_pd(tF,t4);
-    
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_storeu_pd(ptrA+8,tE);
-    _mm_storeu_pd(ptrA+10,tF);
-}
-
-
  static void
  gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1)
  {
      __m128d t1,t2,t3;
-    
+
      t1           = _mm_load_sd(ptrA);
      t2           = _mm_load_sd(ptrA+1);
      t3           = _mm_load_sd(ptrA+2);
-    
+
      t1           = _mm_sub_sd(t1,x1);
      t2           = _mm_sub_sd(t2,y1);
      t3           = _mm_sub_sd(t3,z1);
@@ -491,26 +401,53 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
  }
  
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_t1          = _mm_sub_pd(_t1,_x1);\
+_t2          = _mm_sub_pd(_t2,_z1);\
+_t3          = _mm_sub_pd(_t3,_y2);\
+_t4          = _mm_sub_pd(_t4,_x3);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
  static void
  gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
-                                       __m128d x3, __m128d y3, __m128d z3) 
+                                       __m128d x3, __m128d y3, __m128d z3)
  {
      __m128d t1,t2,t3,t4,t5;
-    
+
      t1          = _mm_loadu_pd(ptrA);
      t2          = _mm_loadu_pd(ptrA+2);
      t3          = _mm_loadu_pd(ptrA+4);
      t4          = _mm_loadu_pd(ptrA+6);
      t5          = _mm_load_sd(ptrA+8);
-    
+
      x1          = _mm_unpacklo_pd(x1,y1);
      z1          = _mm_unpacklo_pd(z1,x2);
      y2          = _mm_unpacklo_pd(y2,z2);
      x3          = _mm_unpacklo_pd(x3,y3);
      /* nothing to be done for z3 */
-    
+
      t1          = _mm_sub_pd(t1,x1);
      t2          = _mm_sub_pd(t2,z1);
      t3          = _mm_sub_pd(t3,y2);
@@ -522,31 +459,58 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+6,t4);
      _mm_store_sd(ptrA+8,t5);
  }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_z3          = _mm_unpacklo_pd(_z3,_x4);\
+_y4          = _mm_unpacklo_pd(_y4,_z4);\
+_mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
+_mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
+_mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
+_mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
+_mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
+_mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
  static void
  gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
                                         __m128d x3, __m128d y3, __m128d z3,
-                                       __m128d x4, __m128d y4, __m128d z4) 
+                                       __m128d x4, __m128d y4, __m128d z4)
  {
      __m128d t1,t2,t3,t4,t5,t6;
-    
+
      t1          = _mm_loadu_pd(ptrA);
      t2          = _mm_loadu_pd(ptrA+2);
      t3          = _mm_loadu_pd(ptrA+4);
      t4          = _mm_loadu_pd(ptrA+6);
      t5          = _mm_loadu_pd(ptrA+8);
      t6          = _mm_loadu_pd(ptrA+10);
-    
+
      x1          = _mm_unpacklo_pd(x1,y1);
      z1          = _mm_unpacklo_pd(z1,x2);
      y2          = _mm_unpacklo_pd(y2,z2);
      x3          = _mm_unpacklo_pd(x3,y3);
      z3          = _mm_unpacklo_pd(z3,x4);
      y4          = _mm_unpacklo_pd(y4,z4);
-    
+
      _mm_storeu_pd(ptrA,    _mm_sub_pd( t1,x1 ));
      _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2,z1 ));
      _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3,y2 ));
@@ -554,28 +518,30 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5,z3 ));
      _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
  }
+#endif
+
  
  static void
  gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1)
  {
      __m128d t1,t2,t3,t4,t5,t6,t7;
-    
+
      t1          = _mm_loadu_pd(ptrA);
      t2          = _mm_load_sd(ptrA+2);
      t3          = _mm_loadu_pd(ptrB);
      t4          = _mm_load_sd(ptrB+2);
-    
+
      t5          = _mm_unpacklo_pd(x1,y1);
      t6          = _mm_unpackhi_pd(x1,y1);
      t7          = _mm_unpackhi_pd(z1,z1);
-    
+
      t1          = _mm_sub_pd(t1,t5);
      t2          = _mm_sub_sd(t2,z1);
-    
+
      t3          = _mm_sub_pd(t3,t6);
      t4          = _mm_sub_sd(t4,t7);
-    
+
      _mm_storeu_pd(ptrA,t1);
      _mm_store_sd(ptrA+2,t2);
      _mm_storeu_pd(ptrB,t3);
@@ -583,15 +549,63 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
  }
  
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrB);\
+_t7          = _mm_loadu_pd(ptrB+2);\
+_t8          = _mm_loadu_pd(ptrB+4);\
+_t9          = _mm_loadu_pd(ptrB+6);\
+_t10         = _mm_load_sd(ptrB+8);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpackhi_pd(_z3,_z3);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_t6          = _mm_sub_pd(_t6,_tB);\
+_t7          = _mm_sub_pd(_t7,_tD);\
+_t8          = _mm_sub_pd(_t8,_tF);\
+_t9          = _mm_sub_pd(_t9,_tH);\
+_t10         = _mm_sub_sd(_t10,_tI);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrB,_t6);\
+_mm_storeu_pd(ptrB+2,_t7);\
+_mm_storeu_pd(ptrB+4,_t8);\
+_mm_storeu_pd(ptrB+6,_t9);\
+_mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
  static void
  gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
-                                       __m128d x3, __m128d y3, __m128d z3) 
+                                       __m128d x3, __m128d y3, __m128d z3)
  {
      __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
      __m128d tA,tB,tC,tD,tE,tF,tG,tH,tI;
-    
+
      t1          = _mm_loadu_pd(ptrA);
      t2          = _mm_loadu_pd(ptrA+2);
      t3          = _mm_loadu_pd(ptrA+4);
@@ -602,7 +616,7 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      t8          = _mm_loadu_pd(ptrB+4);
      t9          = _mm_loadu_pd(ptrB+6);
      t10         = _mm_load_sd(ptrB+8);
-    
+
      tA          = _mm_unpacklo_pd(x1,y1);
      tB          = _mm_unpackhi_pd(x1,y1);
      tC          = _mm_unpacklo_pd(z1,x2);
@@ -612,19 +626,19 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      tG          = _mm_unpacklo_pd(x3,y3);
      tH          = _mm_unpackhi_pd(x3,y3);
      tI          = _mm_unpackhi_pd(z3,z3);
-    
+
      t1          = _mm_sub_pd(t1,tA);
      t2          = _mm_sub_pd(t2,tC);
      t3          = _mm_sub_pd(t3,tE);
      t4          = _mm_sub_pd(t4,tG);
      t5          = _mm_sub_sd(t5,z3);
-    
+
      t6          = _mm_sub_pd(t6,tB);
      t7          = _mm_sub_pd(t7,tD);
      t8          = _mm_sub_pd(t8,tF);
      t9          = _mm_sub_pd(t9,tH);
      t10         = _mm_sub_sd(t10,tI);
-    
+
      _mm_storeu_pd(ptrA,t1);
      _mm_storeu_pd(ptrA+2,t2);
      _mm_storeu_pd(ptrA+4,t3);
@@ -636,18 +650,76 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_storeu_pd(ptrB+6,t9);
      _mm_store_sd(ptrB+8,t10);
  }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_t7          = _mm_loadu_pd(ptrB);\
+_t8          = _mm_loadu_pd(ptrB+2);\
+_t9          = _mm_loadu_pd(ptrB+4);\
+_t10         = _mm_loadu_pd(ptrB+6);\
+_t11         = _mm_loadu_pd(ptrB+8);\
+_t12         = _mm_loadu_pd(ptrB+10);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpacklo_pd(_z3,_x4);\
+_tJ          = _mm_unpackhi_pd(_z3,_x4);\
+_tK          = _mm_unpacklo_pd(_y4,_z4);\
+_tL          = _mm_unpackhi_pd(_y4,_z4);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_pd(_t5,_tI);\
+_t6          = _mm_sub_pd(_t6,_tK);\
+_t7          = _mm_sub_pd(_t7,_tB);\
+_t8          = _mm_sub_pd(_t8,_tD);\
+_t9          = _mm_sub_pd(_t9,_tF);\
+_t10         = _mm_sub_pd(_t10,_tH);\
+_t11         = _mm_sub_pd(_t11,_tJ);\
+_t12         = _mm_sub_pd(_t12,_tL);\
+_mm_storeu_pd(ptrA,  _t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_storeu_pd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrA+10,_t6);\
+_mm_storeu_pd(ptrB,  _t7);\
+_mm_storeu_pd(ptrB+2,_t8);\
+_mm_storeu_pd(ptrB+4,_t9);\
+_mm_storeu_pd(ptrB+6,_t10);\
+_mm_storeu_pd(ptrB+8,_t11);\
+_mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
  static void
  gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
                                         __m128d x3, __m128d y3, __m128d z3,
-                                       __m128d x4, __m128d y4, __m128d z4) 
+                                       __m128d x4, __m128d y4, __m128d z4)
  {
      __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
      __m128d tA,tB,tC,tD,tE,tF,tG,tH,tI,tJ,tK,tL;
-    
+
      t1          = _mm_loadu_pd(ptrA);
      t2          = _mm_loadu_pd(ptrA+2);
      t3          = _mm_loadu_pd(ptrA+4);
@@ -660,7 +732,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      t10         = _mm_loadu_pd(ptrB+6);
      t11         = _mm_loadu_pd(ptrB+8);
      t12         = _mm_loadu_pd(ptrB+10);
-    
+
      tA          = _mm_unpacklo_pd(x1,y1);
      tB          = _mm_unpackhi_pd(x1,y1);
      tC          = _mm_unpacklo_pd(z1,x2);
@@ -673,21 +745,21 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      tJ          = _mm_unpackhi_pd(z3,x4);
      tK          = _mm_unpacklo_pd(y4,z4);
      tL          = _mm_unpackhi_pd(y4,z4);
-    
+
      t1          = _mm_sub_pd(t1,tA);
      t2          = _mm_sub_pd(t2,tC);
      t3          = _mm_sub_pd(t3,tE);
      t4          = _mm_sub_pd(t4,tG);
      t5          = _mm_sub_pd(t5,tI);
      t6          = _mm_sub_pd(t6,tK);
-    
+
      t7          = _mm_sub_pd(t7,tB);
      t8          = _mm_sub_pd(t8,tD);
      t9          = _mm_sub_pd(t9,tF);
      t10         = _mm_sub_pd(t10,tH);
      t11         = _mm_sub_pd(t11,tJ);
      t12         = _mm_sub_pd(t12,tL);
-    
+
      _mm_storeu_pd(ptrA,  t1);
      _mm_storeu_pd(ptrA+2,t2);
      _mm_storeu_pd(ptrA+4,t3);
@@ -701,7 +773,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_storeu_pd(ptrB+8,t11);
      _mm_storeu_pd(ptrB+10,t12);
  }
-
+#endif
  
  
  static gmx_inline void
@@ -711,14 +783,41 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
  {
      fix1 = _mm_hadd_pd(fix1,fiy1);
      fiz1 = _mm_hadd_pd(fiz1,fiz1);
-    
+
      _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
      _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));
-    
+
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+fptr,fshiftptr) \
+{\
+__m128d _t1,_t2;\
+fix1 = _mm_hadd_pd(fix1,fiy1);\
+fiz1 = _mm_hadd_pd(fiz1,fix2);\
+fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+fix3 = _mm_hadd_pd(fix3,fiy3);\
+fiz3 = _mm_hadd_pd(fiz3,fiz3);\
+_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+_mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+fix1 = _mm_add_pd(fix1,fix3);\
+_t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+fix1 = _mm_add_pd(fix1,_t1);\
+_t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+fiz1 = _mm_add_sd(fiz1,fiz3);\
+fiz1 = _mm_add_sd(fiz1,_t2);\
+_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -727,32 +826,63 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        double * gmx_restrict fshiftptr)
  {
      __m128d t1,t2;
-    
+
      fix1 = _mm_hadd_pd(fix1,fiy1);
      fiz1 = _mm_hadd_pd(fiz1,fix2);
      fiy2 = _mm_hadd_pd(fiy2,fiz2);
      fix3 = _mm_hadd_pd(fix3,fiy3);
      fiz3 = _mm_hadd_pd(fiz3,fiz3);
-    
+
      _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
      _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
      _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
      _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
      _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));
-    
+
      fix1 = _mm_add_pd(fix1,fix3);
      t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
      fix1 = _mm_add_pd(fix1,t1); /* x and y sums */
-    
+
      t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));
      fiz1 = _mm_add_sd(fiz1,fiz3);
      fiz1 = _mm_add_sd(fiz1,t2); /* z sum */
-    
+
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot take more than 3 xmm/ymm parameters by value (cf. error C2719). NOTE: unlike the function version, this overwrites the caller's fix1, fiz1, fiy2, fix3, fiz3 and fiy4, which must be modifiable lvalues. */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+fptr,fshiftptr) \
+{\
+__m128d _t1,_t2;\
+fix1 = _mm_hadd_pd(fix1,fiy1);\
+fiz1 = _mm_hadd_pd(fiz1,fix2);\
+fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+fix3 = _mm_hadd_pd(fix3,fiy3);\
+fiz3 = _mm_hadd_pd(fiz3,fix4);\
+fiy4 = _mm_hadd_pd(fiy4,fiz4);\
+_mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));\
+_mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));\
+_mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));\
+_mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));\
+_mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));\
+_mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+_t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+fix1 = _mm_add_pd(fix1,_t1);\
+_t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+fix3 = _mm_add_pd(fix3,_t2);\
+fix1 = _mm_add_pd(fix1,fix3);/* x and y sums */\
+fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+fiz1 = _mm_add_sd(fiz1,fiz3);/* z sum */\
+_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -762,35 +892,35 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        double * gmx_restrict fshiftptr)
  {
      __m128d t1,t2;
-    
+
      fix1 = _mm_hadd_pd(fix1,fiy1);
      fiz1 = _mm_hadd_pd(fiz1,fix2);
      fiy2 = _mm_hadd_pd(fiy2,fiz2);
      fix3 = _mm_hadd_pd(fix3,fiy3);
      fiz3 = _mm_hadd_pd(fiz3,fix4);
      fiy4 = _mm_hadd_pd(fiy4,fiz4);
-    
+
      _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));
      _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));
      _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));
      _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));
      _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));
      _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));
-    
+
      t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
      fix1 = _mm_add_pd(fix1,t1);
      t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));
      fix3 = _mm_add_pd(fix3,t2);
      fix1 = _mm_add_pd(fix1,fix3); /* x and y sums */
-    
+
      fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));
      fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));
      fiz1 = _mm_add_sd(fiz1,fiz3); /* z sum */
-    
+
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
-
+#endif
  
  
  static gmx_inline void
@@ -806,7 +936,7 @@ gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
  {
      pot1 = _mm_hadd_pd(pot1,pot2);
      pot2 = _mm_unpackhi_pd(pot1,pot1);
-    
+
      _mm_store_sd(ptrA,_mm_add_sd(pot1,_mm_load_sd(ptrA)));
      _mm_store_sd(ptrB,_mm_add_sd(pot2,_mm_load_sd(ptrB)));
  }
diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h

index b7c65ab38cd46a06c605eb475d458cf9730dee5e..7b663ed73392fb6e8463ed9e6dfb6829a975db6e 100644 (file)
--- a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
+++ b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
@@ -120,10 +120,10 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
  
  static gmx_inline void
  gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1,
-                                         __m128 * gmx_restrict y1,
-                                         __m128 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1,
+        __m128 * gmx_restrict y1,
+        __m128 * gmx_restrict z1)
  {
      __m128 t1,t2,t3,t4;
  
@@ -142,10 +142,10 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
  {
      __m128 tA,tB;
      __m128 t1,t2,t3,t4,t5,t6;
@@ -180,11 +180,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+        __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
  {
      __m128 tA,tB;
      __m128 t1,t2,t3,t4,t5,t6;
@@ -348,6 +348,72 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
  }
  
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot take more than 3 xmm/ymm parameters by value (cf. error C2719). NOTE: this overwrites the caller's _x1, _z1, _y2 and _x3 arguments, which must be modifiable lvalues. */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                               _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+    __m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+    __m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+    _t13         = _mm_unpackhi_ps(_x1,_y1);\
+    _x1          = _mm_unpacklo_ps(_x1,_y1);\
+    _t14         = _mm_unpackhi_ps(_z1,_x2);\
+    _z1          = _mm_unpacklo_ps(_z1,_x2);\
+    _t15         = _mm_unpackhi_ps(_y2,_z2);\
+    _y2          = _mm_unpacklo_ps(_y2,_z2);\
+    _t16         = _mm_unpackhi_ps(_x3,_y3);\
+    _x3          = _mm_unpacklo_ps(_x3,_y3);\
+    _t17         = _mm_permute_ps(_z3,_MM_SHUFFLE(0,0,0,1));/* element 1 of _z3 in position 0 (used for ptrB) */\
+    _t18         = _mm_movehl_ps(_z3,_z3);/* element 2 of _z3 in position 0 (used for ptrC) */\
+    _t19         = _mm_permute_ps(_t18,_MM_SHUFFLE(0,0,0,1));/* element 3 of _z3 in position 0 (used for ptrD) */\
+    _t20         = _mm_movelh_ps(_x1,_z1);\
+    _t21         = _mm_movehl_ps(_z1,_x1);\
+    _t22         = _mm_movelh_ps(_t13,_t14);\
+    _t14         = _mm_movehl_ps(_t14,_t13);\
+    _t23         = _mm_movelh_ps(_y2,_x3);\
+    _t24         = _mm_movehl_ps(_x3,_y2);\
+    _t25         = _mm_movelh_ps(_t15,_t16);\
+    _t16         = _mm_movehl_ps(_t16,_t15);\
+    _t1          = _mm_loadu_ps(ptrA);\
+    _t2          = _mm_loadu_ps(ptrA+4);\
+    _t3          = _mm_load_ss(ptrA+8);\
+    _t1          = _mm_sub_ps(_t1,_t20);\
+    _t2          = _mm_sub_ps(_t2,_t23);\
+    _t3          = _mm_sub_ss(_t3,_z3);\
+    _mm_storeu_ps(ptrA,_t1);\
+    _mm_storeu_ps(ptrA+4,_t2);\
+    _mm_store_ss(ptrA+8,_t3);\
+    _t4          = _mm_loadu_ps(ptrB);\
+    _t5          = _mm_loadu_ps(ptrB+4);\
+    _t6          = _mm_load_ss(ptrB+8);\
+    _t4          = _mm_sub_ps(_t4,_t21);\
+    _t5          = _mm_sub_ps(_t5,_t24);\
+    _t6          = _mm_sub_ss(_t6,_t17);\
+    _mm_storeu_ps(ptrB,_t4);\
+    _mm_storeu_ps(ptrB+4,_t5);\
+    _mm_store_ss(ptrB+8,_t6);\
+    _t7          = _mm_loadu_ps(ptrC);\
+    _t8          = _mm_loadu_ps(ptrC+4);\
+    _t9          = _mm_load_ss(ptrC+8);\
+    _t7          = _mm_sub_ps(_t7,_t22);\
+    _t8          = _mm_sub_ps(_t8,_t25);\
+    _t9          = _mm_sub_ss(_t9,_t18);\
+    _mm_storeu_ps(ptrC,_t7);\
+    _mm_storeu_ps(ptrC+4,_t8);\
+    _mm_store_ss(ptrC+8,_t9);\
+    _t10         = _mm_loadu_ps(ptrD);\
+    _t11         = _mm_loadu_ps(ptrD+4);\
+    _t12         = _mm_load_ss(ptrD+8);\
+    _t10         = _mm_sub_ps(_t10,_t14);\
+    _t11         = _mm_sub_ps(_t11,_t16);\
+    _t12         = _mm_sub_ss(_t12,_t19);\
+    _mm_storeu_ps(ptrD,_t10);\
+    _mm_storeu_ps(ptrD+4,_t11);\
+    _mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
@@ -414,8 +480,79 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4,t11);
      _mm_store_ss(ptrD+8,t12);
  }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot take more than 3 xmm/ymm parameters by value (cf. error C2719). NOTE: this overwrites the caller's _x1, _z1, _y2, _x3, _z3 and _y4 arguments, which must be modifiable lvalues. */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                               _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+    __m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+    __m128 _t23,_t24;\
+    _t13         = _mm_unpackhi_ps(_x1,_y1);\
+    _x1          = _mm_unpacklo_ps(_x1,_y1);\
+    _t14         = _mm_unpackhi_ps(_z1,_x2);\
+    _z1          = _mm_unpacklo_ps(_z1,_x2);\
+    _t15         = _mm_unpackhi_ps(_y2,_z2);\
+    _y2          = _mm_unpacklo_ps(_y2,_z2);\
+    _t16         = _mm_unpackhi_ps(_x3,_y3);\
+    _x3          = _mm_unpacklo_ps(_x3,_y3);\
+    _t17         = _mm_unpackhi_ps(_z3,_x4);\
+    _z3          = _mm_unpacklo_ps(_z3,_x4);\
+    _t18         = _mm_unpackhi_ps(_y4,_z4);\
+    _y4          = _mm_unpacklo_ps(_y4,_z4);\
+    _t19         = _mm_movelh_ps(_x1,_z1);\
+    _z1          = _mm_movehl_ps(_z1,_x1);\
+    _t20         = _mm_movelh_ps(_t13,_t14);\
+    _t14         = _mm_movehl_ps(_t14,_t13);\
+    _t21         = _mm_movelh_ps(_y2,_x3);\
+    _x3          = _mm_movehl_ps(_x3,_y2);\
+    _t22         = _mm_movelh_ps(_t15,_t16);\
+    _t16         = _mm_movehl_ps(_t16,_t15);\
+    _t23         = _mm_movelh_ps(_z3,_y4);\
+    _y4          = _mm_movehl_ps(_y4,_z3);\
+    _t24         = _mm_movelh_ps(_t17,_t18);\
+    _t18         = _mm_movehl_ps(_t18,_t17);\
+    _t1          = _mm_loadu_ps(ptrA);\
+    _t2          = _mm_loadu_ps(ptrA+4);\
+    _t3          = _mm_loadu_ps(ptrA+8);\
+    _t1          = _mm_sub_ps(_t1,_t19);\
+    _t2          = _mm_sub_ps(_t2,_t21);\
+    _t3          = _mm_sub_ps(_t3,_t23);\
+    _mm_storeu_ps(ptrA,_t1);\
+    _mm_storeu_ps(ptrA+4,_t2);\
+    _mm_storeu_ps(ptrA+8,_t3);\
+    _t4          = _mm_loadu_ps(ptrB);\
+    _t5          = _mm_loadu_ps(ptrB+4);\
+    _t6          = _mm_loadu_ps(ptrB+8);\
+    _t4          = _mm_sub_ps(_t4,_z1);\
+    _t5          = _mm_sub_ps(_t5,_x3);\
+    _t6          = _mm_sub_ps(_t6,_y4);\
+    _mm_storeu_ps(ptrB,_t4);\
+    _mm_storeu_ps(ptrB+4,_t5);\
+    _mm_storeu_ps(ptrB+8,_t6);\
+    _t7          = _mm_loadu_ps(ptrC);\
+    _t8          = _mm_loadu_ps(ptrC+4);\
+    _t9          = _mm_loadu_ps(ptrC+8);\
+    _t7          = _mm_sub_ps(_t7,_t20);\
+    _t8          = _mm_sub_ps(_t8,_t22);\
+    _t9          = _mm_sub_ps(_t9,_t24);\
+    _mm_storeu_ps(ptrC,_t7);\
+    _mm_storeu_ps(ptrC+4,_t8);\
+    _mm_storeu_ps(ptrC+8,_t9);\
+    _t10         = _mm_loadu_ps(ptrD);\
+    _t11         = _mm_loadu_ps(ptrD+4);\
+    _t12         = _mm_loadu_ps(ptrD+8);\
+    _t10         = _mm_sub_ps(_t10,_t14);\
+    _t11         = _mm_sub_ps(_t11,_t16);\
+    _t12         = _mm_sub_ps(_t12,_t18);\
+    _mm_storeu_ps(ptrD,_t10);\
+    _mm_storeu_ps(ptrD+4,_t11);\
+    _mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
@@ -488,7 +625,7 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4,t11);
      _mm_storeu_ps(ptrD+8,t12);
  }
-
+#endif
  
  static gmx_inline void
  gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
@@ -516,6 +653,38 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
  }
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot take more than 3 xmm/ymm parameters by value (cf. error C2719). NOTE: unlike the function version, this overwrites the caller's fix1, fiz1, fiy2, fix3 and fiz3, which must be modifiable lvalues. */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4;\
+\
+    fix1 = _mm_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+    fix1 = _mm_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+    _t4 = _mm_load_ss(fshiftptr+2);/* fshiftptr[2] goes to element 0 */\
+    _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));/* fshiftptr[0..1] go to elements 2,3 */\
+    _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+    _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+    _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+    _t3 = _mm_permute_ps(_t3  ,_MM_SHUFFLE(1,2,0,0));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _mm_store_ss(fshiftptr+2,_t1);/* element 0 back to fshiftptr[2] */\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t1);/* elements 2,3 back to fshiftptr[0..1] */\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -554,8 +723,43 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_store_ss(fshiftptr+2,t1);
      _mm_storeh_pi((__m64 *)(fshiftptr),t1);
  }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot take more than 3 xmm/ymm parameters by value (cf. error C2719). NOTE: unlike the function version, this overwrites the caller's fix1, fiz1, fiy2, fix3, fiz3 and fiy4, which must be modifiable lvalues. */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5;\
+\
+    fix1 = _mm_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm_hadd_ps(fiz3,fix4);\
+    fiy4 = _mm_hadd_ps(fiy4,fiz4);\
+    fix1 = _mm_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm_hadd_ps(fiz3,fiy4);\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+    _t5 = _mm_load_ss(fshiftptr+2);/* fshiftptr[2] goes to element 0 */\
+    _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));/* fshiftptr[0..1] go to elements 2,3 */\
+    _t1 = _mm_permute_ps(fix1,_MM_SHUFFLE(1,0,2,2));\
+    _t2 = _mm_permute_ps(fiy2,_MM_SHUFFLE(3,2,1,1));\
+    _t3 = _mm_permute_ps(fiz3,_MM_SHUFFLE(2,1,0,0));\
+    _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+    _t4 = _mm_shuffle_ps(fiz3,_t4  ,_MM_SHUFFLE(2,0,3,3));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _t5 = _mm_add_ps(_t5,_t1);\
+    _mm_store_ss(fshiftptr+2,_t5);/* element 0 back to fshiftptr[2] */\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t5);/* elements 2,3 back to fshiftptr[0..1] */\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -598,7 +802,7 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_store_ss(fshiftptr+2,t5);
      _mm_storeh_pi((__m64 *)(fshiftptr),t5);
  }
-
+#endif
  
  
  static gmx_inline void
@@ -621,22 +825,4 @@ gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
-                      __m128 pot2, float * gmx_restrict ptrB,
-                      __m128 pot3, float * gmx_restrict ptrC,
-                      __m128 pot4, float * gmx_restrict ptrD)
-{
-    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
-    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
-    pot2 = _mm_permute_ps(pot1,_MM_SHUFFLE(1,1,1,1));
-    pot3 = _mm_permute_ps(pot1,_MM_SHUFFLE(2,2,2,2));
-    pot4 = _mm_permute_ps(pot1,_MM_SHUFFLE(3,3,3,3));
-    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-    _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
-    _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
-}
-
-
  #endif /* _kernelutil_x86_avx_128_fma_single_h_ */
diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h b/src/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h

index 242260ac86374d045f508a8955cf061856c0049a..c00b6dad84608efb553e9b70546edf65055a9f7c 100644 (file)
--- a/src/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h
+++ b/src/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h
@@ -211,10 +211,10 @@ gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * g
  
  static gmx_inline void
  gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                            const double * gmx_restrict xyz,
-                                            __m256d * gmx_restrict x1,
-                                            __m256d * gmx_restrict y1,
-                                            __m256d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m256d * gmx_restrict x1,
+        __m256d * gmx_restrict y1,
+        __m256d * gmx_restrict z1)
  {
      __m128d mem_xy,mem_z,mem_sxy,mem_sz,tx,ty,tz;
  
@@ -238,10 +238,10 @@ gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shif
  
  static gmx_inline void
  gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                            const double * gmx_restrict xyz,
-                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
+        __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
+        __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
  {
      __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz,tx,ty,tz;
  
@@ -285,11 +285,11 @@ gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shif
  
  static gmx_inline void
  gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                            const double * gmx_restrict xyz,
-                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
-                                            __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
+        __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
+        __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
+        __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
  {
      __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz,tx,ty,tz;
  
@@ -352,27 +352,6 @@ gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
  }
  
  
-static void
-gmx_mm256_load_2rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
-    __m256d t1,t2,t3;
-
-    t1            = _mm256_loadu_pd(p1);                         /* x2 z1 | y1 x1 */
-    t2            = _mm256_castpd128_pd256(_mm_loadu_pd(p1+4));  /*  -  - | z2 y2 */
-
-    *x1           = t1;
-    *y2           = t2;
-
-    t3            = gmx_mm256_unpack128hi_pd(t1,t1);
-
-    *z1           = t3;
-    *y1           = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
-    *z2           = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
-    *x2           = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
-}
-
  static void
  gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                       __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
@@ -408,7 +387,7 @@ gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
      t1            = _mm256_loadu_pd(p1);
      t2            = _mm256_loadu_pd(p1+4);
      t3            = _mm256_loadu_pd(p1+8);
-    
+
      t4            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
      t5            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));
      t6            = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
@@ -419,7 +398,7 @@ gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
      *z1           = t4;
      *x3           = t5;
      *y4           = t6;
-    
+
      *y1           = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
      *z2           = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
      *x4           = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
@@ -429,128 +408,12 @@ gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
  }
  
  
-static void
-gmx_mm256_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
-{
-    __m256d tA,tB,tC;
-
-    tA           = _mm256_loadu_pd(ptrA); /*  - z1 | y1 x1 */
-    tB           = _mm256_loadu_pd(ptrB); /*  - z2 | y2 x2 */
-
-    tC           = _mm256_unpacklo_pd(tA,tB);  /* z2 z1 | x2 x1 */
-
-    *x1          = tC;
-    *y1          = _mm256_unpackhi_pd(tA,tB);
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(tC,0x1));
-}
-
-
-static void
-gmx_mm256_load_2rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
-    __m256d t1,t2,t3,t4,t5;
-
-    t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
-    t3           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4));        /*   -   -  | z2a y2a */
-    t4           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4));        /*   -   -  | z2b y2b */
-    
-    t5           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-    *y2          = _mm256_unpacklo_pd(t3,t4);      /*   -   -  | y2b y2a */
-    *z2          = _mm256_unpackhi_pd(t3,t4);      /*   -   -  | z2b z2a */
-    *x1          = t5;
-    *y1          = t1;
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));;
-    *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
-}
-
-
-static void
-gmx_mm256_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7;
-
-    t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
-    t3           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
-    t4           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
-    t5           = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8));        /*   -   -  |  -  z3a */
-    t6           = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8));        /*   -   -  |  -  z3b */
-
-    t7           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-
-    t2           = _mm256_unpacklo_pd(t3,t4);      /*  x3b x3a | y2b y2a */
-    t3           = _mm256_unpackhi_pd(t3,t4);      /*  y3b y3a | z2b z2a */
-
-    *z3          = _mm256_unpacklo_pd(t5,t6);      /*   -   -  | z3b z3a */
-
-    *x1          = t7;
-    *y1          = t1;
-    *y2          = t2;
-    *z2          = t3;
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
-    *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
-    *x3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
-    *y3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
-}
-
-
-static void
-gmx_mm256_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
-                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
-                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7;
-
-    t1           = _mm256_loadu_pd(ptrA);          /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);          /*  x2b z1b | y1b x1b */
-    t3           = _mm256_loadu_pd(ptrA+4);        /*  y3a x3a | z2a y2a */
-    t4           = _mm256_loadu_pd(ptrB+4);        /*  y3b x3b | z2b y2b */
-    t5           = _mm256_loadu_pd(ptrA+8);        /*  z4a y4a | x4a z3a */
-    t6           = _mm256_loadu_pd(ptrB+8);        /*  z4b y4b | x4b z3b */
-
-    t7           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t1           = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-
-    t2           = _mm256_unpacklo_pd(t3,t4);      /*  x3b x3a | y2b y2a */
-    t3           = _mm256_unpackhi_pd(t3,t4);      /*  y3b y3a | z2b z2a */
-
-    t4           = _mm256_unpacklo_pd(t5,t6);      /*  y4b y4a | z3b z3a */
-    t5           = _mm256_unpackhi_pd(t5,t6);      /*  z4b z4a | x4b x4a */
-
-    *x1          = t7;
-    *y1          = t1;
-    *y2          = t2;
-    *z2          = t3;
-    *z3          = t4;
-    *x4          = t5;
-
-    *z1          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));;
-    *x2          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
-    *x3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));;
-    *y3          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
-    *y4          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t4,0x1));;
-    *z4          = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));
-}
-
-
-
  static void
  gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                       const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                       __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
  {
-     __m256d t1,t2,t3,t4,t5,t6;
+    __m256d t1,t2,t3,t4,t5,t6;
  
      t1           = _mm256_loadu_pd(ptrA);        /*   -  z1a | y1a x1a */
      t2           = _mm256_loadu_pd(ptrB);        /*   -  z1b | y1b x1b */
@@ -567,40 +430,6 @@ gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const dou
      *z1          = gmx_mm256_unpack128hi_pd(t5,t1);
  }
  
-static void
-gmx_mm256_load_2rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
-                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
-                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
-
-    t1           = _mm256_loadu_pd(ptrA);        /*  x2a z1a | y1a x1a */
-    t2           = _mm256_loadu_pd(ptrB);        /*  x2b z1b | y1b x1b */
-    t3           = _mm256_loadu_pd(ptrC);        /*  x2c z1c | y1c x1c */
-    t4           = _mm256_loadu_pd(ptrD);        /*  x2d z1d | y1d x1d */
-    t5           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4));      /*   -   -  | z2a y2a */
-    t6           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4));      /*   -   -  | z2b y2b */
-    t7           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrC+4));      /*   -   -  | z2c y2c */
-    t8           = _mm256_castpd128_pd256(_mm_loadu_pd(ptrD+4));      /*   -   -  | z2d y2d */
-
-    t9           = _mm256_unpacklo_pd(t1,t2);      /*  z1b z1a | x1b x1a */
-    t10          = _mm256_unpackhi_pd(t1,t2);      /*  x2b x2a | y1b y1a */
-    t1           = _mm256_unpacklo_pd(t3,t4);      /*  z1d z1c | x1d x1c */
-    t2           = _mm256_unpackhi_pd(t3,t4);      /*  x2d x2c | y1d y1c */
-    t3           = _mm256_unpacklo_pd(t5,t6);      /*   -   -  | y2b y2a */
-    t4           = _mm256_unpackhi_pd(t5,t6);      /*   -   -  | z2b z2a */
-    t5           = _mm256_unpacklo_pd(t7,t8);      /*   -   -  | y2d y2c */
-    t6           = _mm256_unpackhi_pd(t7,t8);      /*   -   -  | z2d z2c */
-
-    *x1          = gmx_mm256_unpack128lo_pd(t9,t1);
-    *y1          = gmx_mm256_unpack128lo_pd(t10,t2);
-    *z1          = gmx_mm256_unpack128hi_pd(t9,t1);
-
-    *x2          = gmx_mm256_unpack128hi_pd(t10,t2);
-    *y2          = gmx_mm256_unpack128lo_pd(t3,t5);
-    *z2          = gmx_mm256_unpack128lo_pd(t4,t6);
-}
  
  
  static void
@@ -705,375 +534,10 @@ gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const dou
  
  
  
-/* Routines to decrement rvec in memory, typically use for j particle force updates */
-static void
-gmx_mm256_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA, __m256d xyz)
-{
-    __m256d t1,t2;
-
-    t1  = _mm256_loadu_pd(ptrA);
-    t2  = _mm256_blend_pd(_mm256_setzero_pd(),xyz,0x7);
-    t1  = _mm256_sub_pd(t1,t2);
-    /* OK to add zeros and store more values here, since we only do a single store that cannot overlap */
-    _mm256_storeu_pd(ptrA,t1);
-}
-
-
-
-static void
-gmx_mm256_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                            __m256d xyz1, __m256d xyz2, __m256d xyz3)
-{
-    __m256d t1,t2;
-    __m256d tA,tB;
-    __m128d tC;
-
-    tA   = _mm256_loadu_pd(ptrA);
-    tB   = _mm256_loadu_pd(ptrA+4);
-    tC   = _mm_load_sd(ptrA+8);
-
-    /* xyz1:  -  z1 | y1 x1 */
-    /* xyz2:  -  z2 | y2 x2 */
-    /* xyz3:  -  z3 | y3 x3 */
-
-    xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /*  z2 -  | x2 y2 */
-    t1   = _mm256_permute2f128_pd(xyz2,xyz2,0x21);   /* x2 y2 | z2 -  | */
-    xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
-    xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /*  -  - | z2 y2 */
-    t2   = _mm256_permute2f128_pd(xyz3,xyz3,0x21);   /* y3 x3 |  -  z3 | */
-    xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /*  y3 x3 | z2 y2 */
-
-    tA   = _mm256_sub_pd(tA,xyz1);
-    tB   = _mm256_sub_pd(tB,xyz2);
-    tC   = _mm_sub_sd(tC, _mm256_castpd256_pd128(t2));
-
-    _mm256_storeu_pd(ptrA,tA);
-    _mm256_storeu_pd(ptrA+4,tB);
-    _mm_store_sd(ptrA+8,tC);
-}
-
-static void
-gmx_mm256_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                            __m256d xyz1, __m256d xyz2, __m256d xyz3, __m256d xyz4)
-{
-    __m256d t1,t2,t3;
-    __m256d tA,tB,tC;
-
-    tA   = _mm256_loadu_pd(ptrA);
-    tB   = _mm256_loadu_pd(ptrA+4);
-    tC   = _mm256_loadu_pd(ptrA+8);
-
-    /* xyz1:  -  z1 | y1 x1 */
-    /* xyz2:  -  z2 | y2 x2 */
-    /* xyz3:  -  z3 | y3 x3 */
-    /* xyz4:  -  z4 | y4 x4 */
-
-    xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /*  z2 -  | x2 y2 */
-    t1   = _mm256_permute2f128_pd(xyz2,xyz2,0x21);   /* x2 y2 | z2 -  | */
-    xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
-    xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /*  -  - | z2 y2 */
-    t2   = _mm256_permute2f128_pd(xyz3,xyz3,0x21);   /* y3 x3 |  -  z3 | */
-    xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /*  y3 x3 | z2 y2 */
-    xyz4 = _mm256_permute_pd(xyz4,_GMX_MM_PERMUTE256D(0,1,0,1));  /*  z4 -  | x4 y4 */
-    t3   = _mm256_permute2f128_pd(xyz4,xyz4,0x21);    /*  x4 y4 | z4 - */
-    t3   = _mm256_blend_pd(t3,xyz4,_GMX_MM_BLEND256D(1,0,1,0)); /* z4 y4| x4 - */
-    xyz4 = _mm256_blend_pd(t3,t2,_GMX_MM_BLEND256D(0,0,0,1)); /*  xz y4 | x4 z3 */
-
-    tA   = _mm256_sub_pd(tA,xyz1);
-    tB   = _mm256_sub_pd(tB,xyz2);
-    tC   = _mm256_sub_pd(tC,xyz4);
-
-    _mm256_storeu_pd(ptrA,tA);
-    _mm256_storeu_pd(ptrA+4,tB);
-    _mm256_storeu_pd(ptrA+8,tC);
-}
-
-
-
-static void
-gmx_mm256_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1)
-{
-    __m128d t1,t2,t3;
-
-    t1           = _mm_sub_sd(_mm256_castpd256_pd128(x1),_mm_load_sd(ptrA));
-    t2           = _mm_sub_sd(_mm256_castpd256_pd128(y1),_mm_load_sd(ptrA+1));
-    t3           = _mm_sub_sd(_mm256_castpd256_pd128(z1),_mm_load_sd(ptrA+2));
-    _mm_store_sd(ptrA,t1);
-    _mm_store_sd(ptrA+1,t2);
-    _mm_store_sd(ptrA+2,t3);
-}
-
-
-static void
-gmx_mm256_decrement_2rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2)
-{
-    __m256d t1;
-    __m128d tA;
-    t1          = _mm256_loadu_pd(ptrA);
-    tA          = _mm_loadu_pd(ptrA+4);
-
-    x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-
-    x1          = gmx_mm256_unpack128lo_pd(x1,z1);  /* x2a z1a | y1a x1a */
-
-    t1          = _mm256_sub_pd(x1,t1);
-    tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(y2));
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm_storeu_pd(ptrA+4,tA);
-}
-
-
-static void
-gmx_mm256_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3)
-{
-    __m256d t1,t2;
-    __m128d tA;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrA+4);
-    tA          = _mm_load_sd(ptrA+8);
-
-    x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    x3          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-
-    x1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
-    y2          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
-    t1          = _mm256_sub_pd(t1,x1);
-    t2          = _mm256_sub_pd(t2,y2);
-    tA          = _mm_sub_sd(tA,_mm256_castpd256_pd128(z3));
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrA+4,t2);
-    _mm_store_sd(ptrA+8,tA);
-}
-
-
-static void
-gmx_mm256_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3,
-                                          __m256d x4, __m256d y4, __m256d z4)
-{
-    __m256d t1,t2,t3;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrA+4);
-    t3          = _mm256_loadu_pd(ptrA+8);
-
-    x1          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    z1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    y2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    x3          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-    z3          = _mm256_unpacklo_pd(z3,x4); /*  -   -  | x4a z3a */
-    y4          = _mm256_unpacklo_pd(y4,z4); /*  -   -  | z4a y4a */
-
-    x1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
-    y2          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
-    z3          = gmx_mm256_unpack128lo_pd(z3,y4); /* z4a y4a | x4a z3a */
-
-    t1          = _mm256_sub_pd(t1,x1);
-    t2          = _mm256_sub_pd(t2,y2);
-    t3          = _mm256_sub_pd(t3,z3);
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrA+4,t2);
-    _mm256_storeu_pd(ptrA+8,t3);
-}
-
-static void
-gmx_mm256_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA,
-                                          double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1)
-{
-    __m256d t1,t2,t3,t4;
-    __m256i mask;
-
-    t3          = _mm256_loadu_pd(ptrA);
-    t4          = _mm256_loadu_pd(ptrB);
-
-    t1          = _mm256_unpacklo_pd(x1,y1);   /*  -  - | y1a x1a */
-    t2          = _mm256_unpackhi_pd(x1,y1);   /*  -  - | y1b x1b */
-
-    t1          = gmx_mm256_unpack128lo_pd(t1,z1); /*  -  z1a | y1a x1a */
-    z1          = _mm256_permute_pd(z1,_GMX_MM_PERMUTE256D(1,1,1,1));
-    t2          = gmx_mm256_unpack128lo_pd(t2,z1); /* z1b z1a | y1b x1b */
-
-    /* Construct a mask without executing any data loads */
-    mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
-                                                      _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
-
-    t3          = _mm256_sub_pd(t3,t1);
-    t4          = _mm256_sub_pd(t4,t2);
-
-    /* Careful with potentially overlapping stores, need to be masked */
-    _mm256_maskstore_pd(ptrA,mask,t3);
-    _mm256_maskstore_pd(ptrB,mask,t4);
-}
-
-static void
-gmx_mm256_decrement_2rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2)
-{
-    __m256d t1,t2,t5;
-    __m128d t3,t4;
-
-    t1          = _mm256_loadu_pd(ptrA); 
-    t2          = _mm256_loadu_pd(ptrB); 
-    t3          = _mm_loadu_pd(ptrA+4);
-    t4          = _mm_loadu_pd(ptrB+4);
-
-    t5          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
-
-    y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
-
-    x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
-
-    z2          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
-    y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
-    t1          = _mm256_sub_pd(t1,z2);
-    t2          = _mm256_sub_pd(t2,y1);
-    t3          = _mm_sub_pd(t3,_mm256_castpd256_pd128(x2));
-    t4          = _mm_sub_pd(t4,_mm256_castpd256_pd128(y2));
-
-    /* Careful with potentially overlapping stores, need to be masked */
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm_storeu_pd(ptrA+4,t3);
-    _mm_storeu_pd(ptrB+4,t4);
-}
-
-static void
-gmx_mm256_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3)
-{
-    __m256d t1,t2,t3,t4,t5,t6;
-    __m128d tA,tB;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrB);
-    t3          = _mm256_loadu_pd(ptrA+4);
-    t4          = _mm256_loadu_pd(ptrB+4);
-    tA          = _mm_load_sd(ptrA+8);
-    tB          = _mm_load_sd(ptrB+8);
-
-    t5          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
-
-    y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
-
-    x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
-
-    z2          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-    x3          = _mm256_unpackhi_pd(x3,y3); /*  -   -  | y3b x3b */
-
-    t6          = _mm256_permute_pd(z3,_GMX_MM_PERMUTE256D(1,1,1,1)); /* - - | - z3b */
-
-    y3          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
-    y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
-    t5          = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */     
-    x1          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
-
-    t1          = _mm256_sub_pd(t1,y3);
-    t2          = _mm256_sub_pd(t2,y1);
-    t3          = _mm256_sub_pd(t3,t5);  
-    t4          = _mm256_sub_pd(t4,x1);
-    tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(z3));
-    tB          = _mm_sub_pd(tB,_mm256_castpd256_pd128(t6));
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm256_storeu_pd(ptrA+4,t3);
-    _mm256_storeu_pd(ptrB+4,t4);
-    _mm_store_sd(ptrA+8,tA);
-    _mm_store_sd(ptrB+8,tB);
-}
-
-
-static void
-gmx_mm256_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3,
-                                          __m256d x4, __m256d y4, __m256d z4)
-{
-    __m256d t1,t2,t3,t4,t5,t6,t7;
-
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrB); 
-    t3          = _mm256_loadu_pd(ptrA+4);
-    t4          = _mm256_loadu_pd(ptrB+4);
-    t5          = _mm256_loadu_pd(ptrA+8);
-    t6          = _mm256_loadu_pd(ptrB+8);
-
-    t7          = _mm256_unpacklo_pd(x1,y1); /*  -   -  | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /*  -   -  | y1b x1b */
-
-    y1          = _mm256_unpacklo_pd(z1,x2); /*  -   -  | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /*  -   -  | x2b z1b */
-
-    x2          = _mm256_unpacklo_pd(y2,z2); /*  -   -  | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /*  -   -  | z2b y2b */
-
-    z2          = _mm256_unpacklo_pd(x3,y3); /*  -   -  | y3a x3a */
-    x3          = _mm256_unpackhi_pd(x3,y3); /*  -   -  | y3b x3b */
-
-    y3          = _mm256_unpacklo_pd(z3,x4); /*  -   -  | x4a z3a */
-    z3          = _mm256_unpackhi_pd(z3,x4); /*  -   -  | x4b z3b */
-    x4          = _mm256_unpacklo_pd(y4,z4); /*  -   -  | z4a y4a */
-    y4          = _mm256_unpackhi_pd(y4,z4); /*  -   -  | z4b y4b */
-
-    z4          = gmx_mm256_unpack128lo_pd(t7,y1); /* x2a z1a | y1a x1a */
-    y1          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
-    t7          = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
-    x1          = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
-
-    x2          = gmx_mm256_unpack128lo_pd(y3,x4); /* z4a y4a | x4a z3a */
-    y2          = gmx_mm256_unpack128lo_pd(z3,y4); /* z4b y4b | x4b z3b */
-
-    t1          = _mm256_sub_pd(t1,z4);
-    t2          = _mm256_sub_pd(t2,y1);
-    t3          = _mm256_sub_pd(t3,t7);
-    t4          = _mm256_sub_pd(t4,x1);
-    t5          = _mm256_sub_pd(t5,x2);
-    t6          = _mm256_sub_pd(t6,y2);
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm256_storeu_pd(ptrA+4,t3);
-    _mm256_storeu_pd(ptrB+4,t4);
-    _mm256_storeu_pd(ptrA+8,t5);
-    _mm256_storeu_pd(ptrB+8,t6);
-}
-
-
-
  static void
  gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1)
+        double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+        __m256d x1, __m256d y1, __m256d z1)
  {
      __m256d t1,t2,tA,tB,tC,tD;
      __m256i mask;
@@ -1088,7 +552,7 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
  
      /* Construct a mask without executing any data loads */
      mask        = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
-                                                      _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
+                                      _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));
  
      tA          = _mm256_loadu_pd(ptrA);
      tB          = _mm256_loadu_pd(ptrB);
@@ -1106,65 +570,77 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
      _mm256_maskstore_pd(ptrD,mask,tD);
  }
  
-static void
-gmx_mm256_decrement_2rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2)
-{
-    __m256d t1,t2,t3,t4,t5,t6;
-    __m128d tA,tB,tC,tD,tE,tF;
  
-    t1          = _mm256_loadu_pd(ptrA);
-    t2          = _mm256_loadu_pd(ptrB);
-    t3          = _mm256_loadu_pd(ptrC);
-    t4          = _mm256_loadu_pd(ptrD);
-    tA          = _mm_loadu_pd(ptrA+4);
-    tB          = _mm_loadu_pd(ptrB+4);
-    tC          = _mm_loadu_pd(ptrC+4);
-    tD          = _mm_loadu_pd(ptrD+4);
-
-    t5          = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
-    x1          = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
-    y1          = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
-    z1          = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
-    x2          = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
-    y2          = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */
  
-    t6          = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
-    z2          = gmx_mm256_unpack128hi_pd(t5,y1); /* x2c z1c | y1c x1c */
-    t5          = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-    y1          = gmx_mm256_unpack128hi_pd(x1,z1); /* x2d z1d | y1d x1d */
-
-    tE          = _mm256_extractf128_pd(x2,0x1); /* z2c y2c */
-    tF          = _mm256_extractf128_pd(y2,0x1); /* z2d y2d */
-
-    t1          = _mm256_sub_pd(t1,t6);
-    t2          = _mm256_sub_pd(t2,t5);
-    t3          = _mm256_sub_pd(t3,z2);
-    t4          = _mm256_sub_pd(t4,y1);
-    tA          = _mm_sub_pd(tA,_mm256_castpd256_pd128(x2));
-    tB          = _mm_sub_pd(tB,_mm256_castpd256_pd128(y2));
-    tC          = _mm_sub_pd(tC,tE);
-    tD          = _mm_sub_pd(tD,tF);
-
-    _mm256_storeu_pd(ptrA,t1);
-    _mm256_storeu_pd(ptrB,t2);
-    _mm256_storeu_pd(ptrC,t3);
-    _mm256_storeu_pd(ptrD,t4);
-    _mm_storeu_pd(ptrA+4,tA);
-    _mm_storeu_pd(ptrB+4,tB);
-    _mm_storeu_pd(ptrC+4,tC);
-    _mm_storeu_pd(ptrD+4,tD);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. Decrements three rvecs (nine doubles) at each of ptrA..ptrD: element 0 of every x/y/z register is subtracted at ptrA, element 1 at ptrB, 2 at ptrC, 3 at ptrD. */
+#define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
+                                                  _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{ /* NOTE(review): bare braces, not do { } while (0) - confirm no caller uses this as the body of an unbraced if/else */ \
+    __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+    __m128d _tA,_tB,_tC,_tD,_tE;\
+    _t1          = _mm256_loadu_pd(ptrA); /* current doubles 0-3 at each pointer */\
+    _t2          = _mm256_loadu_pd(ptrB);\
+    _t3          = _mm256_loadu_pd(ptrC);\
+    _t4          = _mm256_loadu_pd(ptrD);\
+    _t5          = _mm256_loadu_pd(ptrA+4); /* current doubles 4-7 at each pointer */\
+    _t6          = _mm256_loadu_pd(ptrB+4);\
+    _t7          = _mm256_loadu_pd(ptrC+4);\
+    _t8          = _mm256_loadu_pd(ptrD+4);\
+    _tA          = _mm_load_sd(ptrA+8); /* double 8 is loaded and stored on its own, so nothing past the ninth double is touched */\
+    _tB          = _mm_load_sd(ptrB+8);\
+    _tC          = _mm_load_sd(ptrC+8);\
+    _tD          = _mm_load_sd(ptrD+8);\
+    _t9          = _mm256_unpacklo_pd(_x1,_y1); /* y1c x1c | y1a x1a */\
+    _x1          = _mm256_unpackhi_pd(_x1,_y1); /* y1d x1d | y1b x1b - from here on the macro arguments are reused as scratch, so the caller's variables are overwritten */\
+    _y1          = _mm256_unpacklo_pd(_z1,_x2); /* x2c z1c | x2a z1a */\
+    _z1          = _mm256_unpackhi_pd(_z1,_x2); /* x2d z1d | x2b z1b */\
+    _x2          = _mm256_unpacklo_pd(_y2,_z2); /* z2c y2c | z2a y2a */\
+    _y2          = _mm256_unpackhi_pd(_y2,_z2); /* z2d y2d | z2b y2b */\
+    _z2          = _mm256_unpacklo_pd(_x3,_y3); /* y3c x3c | y3a x3a */\
+    _x3          = _mm256_unpackhi_pd(_x3,_y3); /* y3d x3d | y3b x3b */\
+    _t10         = gmx_mm256_unpack128lo_pd(_t9,_y1); /* x2a z1a | y1a x1a */\
+    _y3          = gmx_mm256_unpack128hi_pd(_t9,_y1); /* x2c z1c | y1c x1c */\
+    _t9          = gmx_mm256_unpack128lo_pd(_x1,_z1); /* x2b z1b | y1b x1b */\
+    _y1          = gmx_mm256_unpack128hi_pd(_x1,_z1); /* x2d z1d | y1d x1d */\
+    _x1          = gmx_mm256_unpack128lo_pd(_x2,_z2); /* y3a x3a | z2a y2a */\
+    _z1          = gmx_mm256_unpack128hi_pd(_x2,_z2); /* y3c x3c | z2c y2c */\
+    _x2          = gmx_mm256_unpack128lo_pd(_y2,_x3); /* y3b x3b | z2b y2b */\
+    _z2          = gmx_mm256_unpack128hi_pd(_y2,_x3); /* y3d x3d | z2d y2d */\
+    _t1          = _mm256_sub_pd(_t1,_t10); /* memory minus transposed input, for doubles 0-3 ... */\
+    _t2          = _mm256_sub_pd(_t2,_t9);\
+    _t3          = _mm256_sub_pd(_t3,_y3);\
+    _t4          = _mm256_sub_pd(_t4,_y1);\
+    _t5          = _mm256_sub_pd(_t5,_x1); /* ... and for doubles 4-7 */\
+    _t6          = _mm256_sub_pd(_t6,_x2);\
+    _t7          = _mm256_sub_pd(_t7,_z1);\
+    _t8          = _mm256_sub_pd(_t8,_z2);\
+    _tA          = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3)); /* element 0 of z3 */\
+    _tB          = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3),_GMX_MM_PERMUTE128D(1,1))); /* element 1 of z3 moved to the low position */\
+    _tE          = _mm256_extractf128_pd(_z3,0x1); /* upper half of z3: elements 3 and 2 */\
+    _tC          = _mm_sub_sd(_tC, _tE); /* element 2 of z3 */\
+    _tD          = _mm_sub_sd(_tD, _mm_permute_pd(_tE,_GMX_MM_PERMUTE128D(1,1))); /* element 3 of z3 */\
+    _mm256_storeu_pd(ptrA,_t1);\
+    _mm256_storeu_pd(ptrB,_t2);\
+    _mm256_storeu_pd(ptrC,_t3);\
+    _mm256_storeu_pd(ptrD,_t4);\
+    _mm256_storeu_pd(ptrA+4,_t5);\
+    _mm256_storeu_pd(ptrB+4,_t6);\
+    _mm256_storeu_pd(ptrC+4,_t7);\
+    _mm256_storeu_pd(ptrD+4,_t8);\
+    _mm_store_sd(ptrA+8,_tA);\
+    _mm_store_sd(ptrB+8,_tB);\
+    _mm_store_sd(ptrC+8,_tC);\
+    _mm_store_sd(ptrD+8,_tD);\
  }
-
-
+#else
+/* Real function for sane compilers */
  static void
  gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3)
+        double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+        __m256d x1, __m256d y1, __m256d z1,
+        __m256d x2, __m256d y2, __m256d z2,
+        __m256d x3, __m256d y3, __m256d z3)
  {
      __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
      __m128d tA,tB,tC,tD,tE;
@@ -1235,15 +711,85 @@ gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
      _mm_store_sd(ptrC+8,tC);
      _mm_store_sd(ptrD+8,tD);
  }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. Decrements four rvecs (twelve doubles) at each of ptrA..ptrD: element 0 of every x/y/z register is subtracted at ptrA, element 1 at ptrB, 2 at ptrC, 3 at ptrD. */
+#define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA,ptrB,ptrC,ptrD, \
+                                                  _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{ /* NOTE(review): bare braces, not do { } while (0) - confirm no caller uses this as the body of an unbraced if/else */ \
+    __m256d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12,_t13,_t14; /* NOTE(review): _t14 is never used below */\
+    __m128d _tA,_tB,_tC,_tD,_tE; /* NOTE(review): none of these are used below */\
+    _t1          = _mm256_loadu_pd(ptrA); /* current doubles 0-3 at each pointer */\
+    _t2          = _mm256_loadu_pd(ptrB);\
+    _t3          = _mm256_loadu_pd(ptrC);\
+    _t4          = _mm256_loadu_pd(ptrD);\
+    _t5          = _mm256_loadu_pd(ptrA+4); /* current doubles 4-7 at each pointer */\
+    _t6          = _mm256_loadu_pd(ptrB+4);\
+    _t7          = _mm256_loadu_pd(ptrC+4);\
+    _t8          = _mm256_loadu_pd(ptrD+4);\
+    _t9          = _mm256_loadu_pd(ptrA+8); /* current doubles 8-11 at each pointer */\
+    _t10         = _mm256_loadu_pd(ptrB+8);\
+    _t11         = _mm256_loadu_pd(ptrC+8);\
+    _t12         = _mm256_loadu_pd(ptrD+8);\
+    _t13         = _mm256_unpacklo_pd(_x1,_y1); /* y1c x1c | y1a x1a */\
+    _x1          = _mm256_unpackhi_pd(_x1,_y1); /* y1d x1d | y1b x1b - from here on the macro arguments are reused as scratch, so the caller's variables are overwritten */\
+    _y1          = _mm256_unpacklo_pd(_z1,_x2); /* x2c z1c | x2a z1a */\
+    _z1          = _mm256_unpackhi_pd(_z1,_x2); /* x2d z1d | x2b z1b */\
+    _x2          = _mm256_unpacklo_pd(_y2,_z2); /* z2c y2c | z2a y2a */\
+    _y2          = _mm256_unpackhi_pd(_y2,_z2); /* z2d y2d | z2b y2b */\
+    _z2          = _mm256_unpacklo_pd(_x3,_y3); /* y3c x3c | y3a x3a */\
+    _x3          = _mm256_unpackhi_pd(_x3,_y3); /* y3d x3d | y3b x3b */\
+    _y3          = _mm256_unpacklo_pd(_z3,_x4); /* x4c z3c | x4a z3a */\
+    _z3          = _mm256_unpackhi_pd(_z3,_x4); /* x4d z3d | x4b z3b */\
+    _x4          = _mm256_unpacklo_pd(_y4,_z4); /* z4c y4c | z4a y4a */\
+    _y4          = _mm256_unpackhi_pd(_y4,_z4); /* z4d y4d | z4b y4b */\
+    _z4          = gmx_mm256_unpack128lo_pd(_t13,_y1); /* x2a z1a | y1a x1a */\
+    _t13         = gmx_mm256_unpack128hi_pd(_t13,_y1); /* x2c z1c | y1c x1c */\
+    _y1          = gmx_mm256_unpack128lo_pd(_x1,_z1); /* x2b z1b | y1b x1b */\
+    _x1          = gmx_mm256_unpack128hi_pd(_x1,_z1); /* x2d z1d | y1d x1d */\
+    _z1          = gmx_mm256_unpack128lo_pd(_x2,_z2); /* y3a x3a | z2a y2a */\
+    _x2          = gmx_mm256_unpack128hi_pd(_x2,_z2); /* y3c x3c | z2c y2c */\
+    _z2          = gmx_mm256_unpack128lo_pd(_y2,_x3); /* y3b x3b | z2b y2b */\
+    _y2          = gmx_mm256_unpack128hi_pd(_y2,_x3); /* y3d x3d | z2d y2d */\
+    _x3          = gmx_mm256_unpack128lo_pd(_y3,_x4); /* z4a y4a | x4a z3a */\
+    _y3          = gmx_mm256_unpack128hi_pd(_y3,_x4); /* z4c y4c | x4c z3c */\
+    _x4          = gmx_mm256_unpack128lo_pd(_z3,_y4); /* z4b y4b | x4b z3b */\
+    _z3          = gmx_mm256_unpack128hi_pd(_z3,_y4); /* z4d y4d | x4d z3d */\
+    _t1          = _mm256_sub_pd(_t1,_z4); /* memory minus transposed input, for doubles 0-3 ... */\
+    _t2          = _mm256_sub_pd(_t2,_y1);\
+    _t3          = _mm256_sub_pd(_t3,_t13);\
+    _t4          = _mm256_sub_pd(_t4,_x1);\
+    _t5          = _mm256_sub_pd(_t5,_z1); /* ... doubles 4-7 ... */\
+    _t6          = _mm256_sub_pd(_t6,_z2);\
+    _t7          = _mm256_sub_pd(_t7,_x2);\
+    _t8          = _mm256_sub_pd(_t8,_y2);\
+    _t9          = _mm256_sub_pd(_t9,_x3); /* ... and doubles 8-11 */\
+    _t10         = _mm256_sub_pd(_t10,_x4);\
+    _t11         = _mm256_sub_pd(_t11,_y3);\
+    _t12         = _mm256_sub_pd(_t12,_z3);\
+    _mm256_storeu_pd(ptrA,_t1);\
+    _mm256_storeu_pd(ptrB,_t2);\
+    _mm256_storeu_pd(ptrC,_t3);\
+    _mm256_storeu_pd(ptrD,_t4);\
+    _mm256_storeu_pd(ptrA+4,_t5);\
+    _mm256_storeu_pd(ptrB+4,_t6);\
+    _mm256_storeu_pd(ptrC+4,_t7);\
+    _mm256_storeu_pd(ptrD+4,_t8);\
+    _mm256_storeu_pd(ptrA+8,_t9);\
+    _mm256_storeu_pd(ptrB+8,_t10);\
+    _mm256_storeu_pd(ptrC+8,_t11);\
+    _mm256_storeu_pd(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
  static void
  gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
-                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
-                                          __m256d x1, __m256d y1, __m256d z1,
-                                          __m256d x2, __m256d y2, __m256d z2,
-                                          __m256d x3, __m256d y3, __m256d z3,
-                                          __m256d x4, __m256d y4, __m256d z4)
+        double * gmx_restrict ptrC, double * gmx_restrict ptrD,
+        __m256d x1, __m256d y1, __m256d z1,
+        __m256d x2, __m256d y2, __m256d z2,
+        __m256d x3, __m256d y3, __m256d z3,
+        __m256d x4, __m256d y4, __m256d z4)
  {
      __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;
      __m128d tA,tB,tC,tD,tE;
@@ -1314,6 +860,7 @@ gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
      _mm256_storeu_pd(ptrC+8,t11);
      _mm256_storeu_pd(ptrD+8,t12);
  }
+#endif
  
  
  
@@ -1321,8 +868,8 @@ gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
  
  static gmx_inline void
  gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
+        double * gmx_restrict fptr,
+        double * gmx_restrict fshiftptr)
  {
      __m256d t1,t2;
      __m128d tA,tB;
@@ -1345,63 +892,59 @@ gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
      _mm256_storeu_pd(fshiftptr,t2);
  }
  
-static gmx_inline void
-gmx_mm256_update_iforce_2atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
-{
-    __m256d t1,t2,t3;
-    __m128d tA,tB,tC,tD,tE;
  
-    fix1 = _mm256_hadd_pd(fix1,fiy1);
-    fiz1 = _mm256_hadd_pd(fiz1,fix2);
-    fiy2 = _mm256_hadd_pd(fiy2,fiz2);
  
-    /* Add across the two lanes by swapping and adding back */
-    tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1)); /* fiy1 fix1 */
-    tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1)); /* fix2 fiz1 */
-    tC   = _mm_add_pd(_mm256_castpd256_pd128(fiy2),_mm256_extractf128_pd(fiy2,0x1)); /* fiz2 fiy2 */
-    
-    t1   = gmx_mm256_set_m128d(tB,tA); /* fix2 fiz1 | fiy1 fix1 */
-
-    t2   = _mm256_loadu_pd(fptr);
-    tD   = _mm_loadu_pd(fptr+4);
-
-    t2   = _mm256_add_pd(t2,t1);
-    tD   = _mm_add_pd(tD,tC);
-    _mm256_storeu_pd(fptr,t2);
-    _mm_storeu_pd(fptr+4,tD);
-
-    /* Add up shift force */
-    /* t1:  fix2 fiz1 | fiy1 fix1 */
-    /* tC:              fiz2 fiy2 */
-
-    tA   = _mm256_extractf128_pd(t1,0x1); /* fix2 fiz1 */
-    tB   = _mm_shuffle_pd(tA,tC,_MM_SHUFFLE2(0,1));   /* fiy2 fix2 */
-    tC   = _mm_permute_pd(tC,_GMX_MM_PERMUTE128D(1,1));      /*  -   fiz2 */
-    
-    tB   = _mm_add_pd(tB,_mm256_castpd256_pd128(t1));
-    tC   = _mm_add_sd(tC,tA);
-
-    tD   = _mm_loadu_pd(fshiftptr);
-    tE   = _mm_load_sd(fshiftptr+2);
-
-    tD   = _mm_add_pd(tD,tB);
-    tE   = _mm_add_pd(tE,tC);
-
-    _mm_storeu_pd(fshiftptr,tD);
-    _mm_store_sd(fshiftptr+2,tE);
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. Sums each of the nine force registers over its four elements, adds the nine sums to fptr[0..8], and adds the per-component totals over the three atoms to fshiftptr[0..2]. */
+#define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{ /* NOTE(review): bare braces, not do { } while (0) - confirm no caller uses this as the body of an unbraced if/else */ \
+    __m256d _t1,_t2,_t3,_t4;\
+    __m128d _tz3,_tA,_tB,_tC,_tD;\
+    fix1 = _mm256_hadd_pd(fix1,fiy1); /* pairwise sums per 128-bit lane: fiy1 fix1 | fiy1 fix1; assigns to the macro argument, so the caller's fix1 is overwritten (likewise fiz1, fiy2, fix3, fiz3 below) */\
+    fiz1 = _mm256_hadd_pd(fiz1,fix2); /* fix2 fiz1 | fix2 fiz1 */\
+    fiy2 = _mm256_hadd_pd(fiy2,fiz2); /* fiz2 fiy2 | fiz2 fiy2 */\
+    fix3 = _mm256_hadd_pd(fix3,fiy3); /* fiy3 fix3 | fiy3 fix3 */\
+    fiz3 = _mm256_hadd_pd(fiz3,_mm256_setzero_pd()); /*  0   fiz3 |  0   fiz3 */\
+    _t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1); /* low-lane partial sums:  fix2 fiz1 | fiy1 fix1 */\
+    _t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1); /* high-lane partial sums: fix2 fiz1 | fiy1 fix1 */\
+    _t1   = _mm256_add_pd(_t1,_t2); /* fully reduced: fix2 fiz1 | fiy1 fix1 */\
+    _t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
+    _t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
+    _t3   = _mm256_add_pd(_t3,_t4); /* fully reduced: fiy3 fix3 | fiz2 fiy2 */\
+    _tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3),_mm256_extractf128_pd(fiz3,0x1)); /* reduced fiz3 in the low element */\
+    _t2   = _mm256_loadu_pd(fptr); /* accumulate the nine sums into fptr[0..8] */\
+    _t4   = _mm256_loadu_pd(fptr+4);\
+    _tA   = _mm_load_sd(fptr+8);\
+    _t2   = _mm256_add_pd(_t2,_t1);\
+    _t4   = _mm256_add_pd(_t4,_t3);\
+    _tA   = _mm_add_sd(_tA,_tz3);\
+    _mm256_storeu_pd(fptr,_t2);\
+    _mm256_storeu_pd(fptr+4,_t4);\
+    _mm_store_sd(fptr+8,_tA);\
+    _tB   = _mm256_extractf128_pd(_t1,0x1); /* shift force starts here: fix2 fiz1 */\
+    _tC   = _mm256_extractf128_pd(_t3,0x1); /* fiy3 fix3 */\
+    _tz3  = _mm_add_sd(_tz3,_tB); /* low element: fiz3+fiz1 */\
+    _tD   = _mm_permute_pd(_mm256_castpd256_pd128(_t3),_GMX_MM_PERMUTE128D(1,1)); /* fiz2 moved to the low element */\
+    _tz3  = _mm_add_sd(_tz3,_tD); /* low element: fiz1+fiz2+fiz3 */\
+    _tC   = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t1)); /* fiy1+fiy3  fix1+fix3 */\
+    _tD   = _mm_shuffle_pd(_tB,_mm256_castpd256_pd128(_t3),_MM_SHUFFLE2(0,1)); /* fiy2 fix2 */\
+    _tC   = _mm_add_pd(_tC,_tD); /* y total, x total over the three atoms */\
+    _tA   = _mm_loadu_pd(fshiftptr);\
+    _tB   = _mm_load_sd(fshiftptr+2);\
+    _tA   = _mm_add_pd(_tA,_tC);\
+    _tB   = _mm_add_sd(_tB,_tz3);\
+    _mm_storeu_pd(fshiftptr,_tA);\
+    _mm_store_sd(fshiftptr+2,_tB);\
  }
-
-
-
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
-                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
+        __m256d fix2, __m256d fiy2, __m256d fiz2,
+        __m256d fix3, __m256d fiy3, __m256d fiz3,
+        double * gmx_restrict fptr,
+        double * gmx_restrict fshiftptr)
  {
      __m256d t1,t2,t3,t4;
      __m128d tz3,tA,tB,tC,tD;
@@ -1459,15 +1002,66 @@ gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
      _mm_storeu_pd(fshiftptr,tA);
      _mm_store_sd(fshiftptr+2,tB);
  }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. Accumulates into 12 doubles at fptr and 3 doubles at fshiftptr; unlike the by-value function form it overwrites the fix1, fiz1, fiy2, fix3, fiz3 and fiy4 arguments. */
+#define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m256d _t1,_t2,_t3,_t4,_t5,_t6;\
+    __m128d _tA,_tB,_tC,_tD;\
+    fix1 = _mm256_hadd_pd(fix1,fiy1); /* pair sums in each 128-bit half: x1 y1 | x1 y1 */\
+    fiz1 = _mm256_hadd_pd(fiz1,fix2); /* z1 x2 | z1 x2 */\
+    fiy2 = _mm256_hadd_pd(fiy2,fiz2); /* y2 z2 | y2 z2 */\
+    fix3 = _mm256_hadd_pd(fix3,fiy3); /* x3 y3 | x3 y3 */\
+    fiz3 = _mm256_hadd_pd(fiz3,fix4); /* z3 x4 | z3 x4 */\
+    fiy4 = _mm256_hadd_pd(fiy4,fiz4); /* y4 z4 | y4 z4 */\
+    _t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1); /* helper defined elsewhere; presumably joins the two low halves - TODO confirm */\
+    _t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1); /* presumably joins the two high halves */\
+    _t1   = _mm256_add_pd(_t1,_t2); /* if so: complete sums x1 y1 | z1 x2 */\
+    _t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3);\
+    _t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3);\
+    _t3   = _mm256_add_pd(_t3,_t4); /* likewise y2 z2 | x3 y3 */\
+    _t5   = gmx_mm256_unpack128lo_pd(fiz3,fiy4);\
+    _t6   = gmx_mm256_unpack128hi_pd(fiz3,fiy4);\
+    _t5   = _mm256_add_pd(_t5,_t6); /* likewise z3 x4 | y4 z4 */\
+    _t2   = _mm256_loadu_pd(fptr); /* current fptr[0..3] (unaligned) */\
+    _t4   = _mm256_loadu_pd(fptr+4); /* current fptr[4..7] */\
+    _t6   = _mm256_loadu_pd(fptr+8); /* current fptr[8..11] */\
+    _t2   = _mm256_add_pd(_t2,_t1);\
+    _t4   = _mm256_add_pd(_t4,_t3);\
+    _t6   = _mm256_add_pd(_t6,_t5);\
+    _mm256_storeu_pd(fptr,_t2); /* write the incremented values back */\
+    _mm256_storeu_pd(fptr+4,_t4);\
+    _mm256_storeu_pd(fptr+8,_t6);\
+    _tA   = _mm256_extractf128_pd(_t1,0x1); /* upper half of _t1 (z1 x2 under the assumption above) */\
+    _tB   = _mm256_extractf128_pd(_t3,0x1); /* upper half of _t3 (x3 y3) */\
+    _tC   = _mm256_extractf128_pd(_t5,0x1); /* upper half of _t5 (y4 z4) */\
+    _tB   = _mm_add_pd(_tB,_mm256_castpd256_pd128(_t1)); /* x1+x3  y1+y3 */\
+    _tA   = _mm_add_pd(_tA,_mm256_castpd256_pd128(_t5)); /* z1+z3  x2+x4 */\
+    _tC   = _mm_add_pd(_tC,_mm256_castpd256_pd128(_t3)); /* y2+y4  z2+z4 */\
+    _tD   = _mm_shuffle_pd(_tA,_tC,_MM_SHUFFLE2(0,1)); /* upper element of _tA, lower element of _tC */\
+    _tB   = _mm_add_pd(_tB,_tD); /* x and y totals over the four atoms */\
+    _tC   = _mm_permute_pd(_tC,_GMX_MM_PERMUTE128D(1,1)); /* project macro; presumably moves the upper element down - TODO confirm */\
+    _tC   = _mm_add_sd(_tC,_tA); /* low element: z total */\
+    _tA   = _mm_loadu_pd(fshiftptr); /* current fshiftptr[0..1] */\
+    _tD   = _mm_load_sd(fshiftptr+2); /* current fshiftptr[2] */\
+    _tA   = _mm_add_pd(_tA,_tB);\
+    _tD   = _mm_add_sd(_tD,_tC);\
+    _mm_storeu_pd(fshiftptr,_tA);\
+    _mm_store_sd(fshiftptr+2,_tD); /* only the low double is stored */\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
-                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
-                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
-                                         __m256d fix4, __m256d fiy4, __m256d fiz4,
-                                         double * gmx_restrict fptr,
-                                         double * gmx_restrict fshiftptr)
+        __m256d fix2, __m256d fiy2, __m256d fiz2,
+        __m256d fix3, __m256d fiy3, __m256d fiz3,
+        __m256d fix4, __m256d fiy4, __m256d fiz4,
+        double * gmx_restrict fptr,
+        double * gmx_restrict fshiftptr)
  {
      __m256d t1,t2,t3,t4,t5,t6;
      __m128d tA,tB,tC,tD;
@@ -1530,6 +1124,7 @@ gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
      _mm_storeu_pd(fshiftptr,tA);
      _mm_store_sd(fshiftptr+2,tD);
  }
+#endif
  
  
  
@@ -1547,7 +1142,7 @@ gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
  
  static void
  gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
-                      __m256d pot2, double * gmx_restrict ptrB)
+                         __m256d pot2, double * gmx_restrict ptrB)
  {
      __m128d t1,t2;
  
@@ -1561,49 +1156,4 @@ gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
  }
  
  
-static void
-gmx_mm256_update_4pot_pd(__m256d pot1, double * gmx_restrict ptrA,
-                         __m256d pot2, double * gmx_restrict ptrB,
-                         __m256d pot3, double * gmx_restrict ptrC,
-                         __m256d pot4, double * gmx_restrict ptrD)
-{
-    __m256d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF,tG,tH;
-
-    tA   = _mm_load_sd(ptrA);
-    tB   = _mm_load_sd(ptrB);
-    tC   = _mm_load_sd(ptrC);
-    tD   = _mm_load_sd(ptrD);
-
-    /* do a transpose */
-    t1   = _mm256_unpacklo_pd(pot1, pot2);   /* p2c p1c | p2a p1a */
-    t2   = _mm256_unpackhi_pd(pot1, pot2);   /* p2d p1d | p2b p1b */
-    t3   = _mm256_unpacklo_pd(pot3, pot4);   /* p4c p3c | p4a p3a */
-    t4   = _mm256_unpackhi_pd(pot3, pot4);   /* p4d p3d | p4b p3b */
-    pot1 = _mm256_permute2f128_pd(t1, t3, 0x20);   /* p4a p3a | p2a p1a */
-    pot2 = _mm256_permute2f128_pd(t2, t4, 0x20);   /* p4b p3b | p2b p1b */
-    pot3 = _mm256_permute2f128_pd(t1, t3, 0x31);   /* p4c p3c | p2c p1c */
-    pot4 = _mm256_permute2f128_pd(t2, t4, 0x31);   /* p4d p3d | p2d p1d */
-
-    pot1 = _mm256_add_pd(pot1,pot2);
-    pot3 = _mm256_add_pd(pot3,pot4);
-    pot1 = _mm256_add_pd(pot1,pot3);  /* Sum in the four elements */
-
-    tE   = _mm256_castpd256_pd128(pot1);
-    tF   = _mm_permute_pd(tE,_GMX_MM_PERMUTE128D(1,1));
-    tG   = _mm256_extractf128_pd(pot1,0x1);
-    tH   = _mm_permute_pd(tG,_GMX_MM_PERMUTE128D(1,1));
-
-    tA   = _mm_add_sd(tA,tE);
-    tB   = _mm_add_sd(tB,tF);
-    tC   = _mm_add_sd(tC,tG);
-    tD   = _mm_add_sd(tD,tH);
-
-       _mm_store_sd(ptrA,tA);
-       _mm_store_sd(ptrB,tB);
-       _mm_store_sd(ptrC,tC);
-       _mm_store_sd(ptrD,tD);
-}
-
-
  #endif /* _kernelutil_x86_avx_256_double_h_ */
diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h

index c35f1a409ea45261c163c1df97da8f047131d0a1..f3a1f6740a1b5f7d512fde1006d0b8f0005290e7 100644 (file)
--- a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h
+++ b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h
@@ -199,10 +199,10 @@ gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx
  
  static gmx_inline void
  gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                            const float * gmx_restrict xyz,
-                                            __m256 * gmx_restrict x1,
-                                            __m256 * gmx_restrict y1,
-                                            __m256 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m256 * gmx_restrict x1,
+        __m256 * gmx_restrict y1,
+        __m256 * gmx_restrict z1)
  {
      __m128 t1,t2,t3,t4;
  
@@ -225,10 +225,10 @@ gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift
  
  static gmx_inline void
  gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                            const float * gmx_restrict xyz,
-                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
-                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
-                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
+        __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
+        __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
  {
      __m128 tA,tB;
      __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9;
@@ -273,11 +273,11 @@ gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift
  
  static gmx_inline void
  gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                            const float * gmx_restrict xyz,
-                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
-                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
-                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
-                                            __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
+        __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
+        __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
+        __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
  {
      __m128 tA,tB;
      __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
@@ -503,7 +503,7 @@ gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
  
      t1           = _mm256_unpacklo_ps(t1,t3);  /*  -   -  z3g z3e |  -   -  z3c z3a */
      t2           = _mm256_unpacklo_ps(t2,t4);  /*  -   -  z3h z3f |  -   -  z3d z3b */
-    
+
      *z3          = _mm256_unpacklo_ps(t1,t2);
  }
  
@@ -567,7 +567,7 @@ gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
      t6           = _mm256_unpackhi_ps(t1,t2); /* z4f z4e y4f y4e | z4b z4a y4b y4a */
      t7           = _mm256_unpacklo_ps(t3,t4); /* x4h x4g z3h z3g | x4d x4c z3d z3c */
      t8           = _mm256_unpackhi_ps(t3,t4); /* z4h z4g y4h y4g | z4d z4c y4d y4c */
-    
+
      *z3          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); /* z3h z3g z3f z3e | z3d z3c z3b z3a */
      *x4          = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); /* x4h x4g x4f x4e | x4d x4c x4b x4a */
      *y4          = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(1,0,1,0)); /* y4h y4g y4f y4e | y4d y4c y4b y4a */
@@ -577,8 +577,8 @@ gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
  
  static gmx_inline void
  gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC,float * gmx_restrict ptrD,
-                                          __m256 x1, __m256 y1, __m256 z1)
+        float * gmx_restrict ptrC,float * gmx_restrict ptrD,
+        __m256 x1, __m256 y1, __m256 z1)
  {
      __m128 t1,t2,t3,t4,t5,t6,t7,t8;
      __m128i mask;
@@ -610,14 +610,63 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      gmx_mm_maskstore_ps(ptrD,mask,t8);
  }
  
-
-
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. Subtracts from 9 floats at each of ptrA-ptrD, using only the lower four elements of the inputs; unlike the by-value function form it overwrites the x1, y1, z1, x2, y2, z2 and x3 arguments. */
+#define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                                  x1,y1,z1,x2,y2,z2,x3,y3,z3) \
+{\
+    __m256 _t1,_t2,_t3,_t4,_t5,_t6;\
+    __m128 _tA,_tB,_tC,_tD;\
+\
+    _t1         = _mm256_loadu_ps(ptrA); /* current floats 0..7 behind each pointer (unaligned) */\
+    _t2         = _mm256_loadu_ps(ptrB);\
+    _t3         = _mm256_loadu_ps(ptrC);\
+    _t4         = _mm256_loadu_ps(ptrD);\
+    _tA         = _mm_load_ss(ptrA+8); /* current float 8 behind each pointer */\
+    _tB         = _mm_load_ss(ptrB+8);\
+    _tC         = _mm_load_ss(ptrC+8);\
+    _tD         = _mm_load_ss(ptrD+8);\
+    _t5         = _mm256_unpacklo_ps(x1,y1); /* interleave x1/y1, elements 0-1 of each 128-bit lane */\
+    x1          = _mm256_unpackhi_ps(x1,y1); /* same for elements 2-3; the argument is reused as scratch */\
+    y1          = _mm256_unpacklo_ps(z1,x2);\
+    z1          = _mm256_unpackhi_ps(z1,x2);\
+    x2          = _mm256_unpacklo_ps(y2,z2);\
+    y2          = _mm256_unpackhi_ps(y2,z2);\
+    _t6         = _mm256_unpacklo_ps(x3,y3);\
+    x3          = _mm256_unpackhi_ps(x3,y3);\
+    _t5         = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); /* keep the low lane, replace the high lane with the low lane of x2 */\
+    x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);\
+    y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1);\
+    z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);\
+    z2          = _mm256_shuffle_ps(_t5,y1,_MM_SHUFFLE(1,0,1,0)); /* element 0 of x1 y1 z1 x2 | y2 z2 x3 y3 */\
+    _t5         = _mm256_shuffle_ps(_t5,y1,_MM_SHUFFLE(3,2,3,2)); /* element 1, same order */\
+    y1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0)); /* element 2, same order */\
+    x1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2)); /* element 3, same order */\
+    _t1         = _mm256_sub_ps(_t1,z2); /* ptrA data minus element 0 */\
+    _t2         = _mm256_sub_ps(_t2,_t5); /* ptrB data minus element 1 */\
+    _t3         = _mm256_sub_ps(_t3,y1); /* ptrC data minus element 2 */\
+    _t4         = _mm256_sub_ps(_t4,x1); /* ptrD data minus element 3 */\
+    _tA         = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3)); /* float 8 minus element 0 of z3 */\
+    _tB         = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(1,1,1,1))); /* element 1 of z3 */\
+    _tC         = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(2,2,2,2))); /* element 2 of z3 */\
+    _tD         = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(3,3,3,3))); /* element 3 of z3 */\
+    _mm256_storeu_ps(ptrA,_t1); /* write floats 0..7 back */\
+    _mm256_storeu_ps(ptrB,_t2);\
+    _mm256_storeu_ps(ptrC,_t3);\
+    _mm256_storeu_ps(ptrD,_t4);\
+    _mm_store_ss(ptrA+8,_tA); /* write float 8 back */\
+    _mm_store_ss(ptrB+8,_tB);\
+    _mm_store_ss(ptrC+8,_tC);\
+    _mm_store_ss(ptrD+8,_tD);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3)
  {
      __m256 t1,t2,t3,t4,t5,t6;
      __m128 tA,tB,tC,tD;
@@ -672,15 +721,76 @@ gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      _mm_store_ss(ptrC+8,tC);
      _mm_store_ss(ptrD+8,tD);
  }
-
-
+#endif
+
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. Subtracts from 12 floats at each of ptrA-ptrD, using only the lower four elements of the inputs; unlike the by-value function form it overwrites every coordinate argument except z4. */
+#define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+                                                  x1,y1,z1,x2,y2,z2,x3,y3,z3,x4,y4,z4) \
+{\
+    __m256 _t1,_t2,_t3,_t4,_t5;\
+    __m128 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH;\
+\
+    _t1         = _mm256_loadu_ps(ptrA); /* current floats 0..7 behind each pointer (unaligned) */\
+    _t2         = _mm256_loadu_ps(ptrB);\
+    _t3         = _mm256_loadu_ps(ptrC);\
+    _t4         = _mm256_loadu_ps(ptrD);\
+    _tA         = _mm_loadu_ps(ptrA+8); /* current floats 8..11 behind each pointer */\
+    _tB         = _mm_loadu_ps(ptrB+8);\
+    _tC         = _mm_loadu_ps(ptrC+8);\
+    _tD         = _mm_loadu_ps(ptrD+8);\
+    _t5         = _mm256_unpacklo_ps(x1,y1); /* interleave x1/y1, elements 0-1 of each 128-bit lane */\
+    x1          = _mm256_unpackhi_ps(x1,y1); /* same for elements 2-3; the arguments are reused as scratch from here on */\
+    y1          = _mm256_unpacklo_ps(z1,x2);\
+    z1          = _mm256_unpackhi_ps(z1,x2);\
+    x2          = _mm256_unpacklo_ps(y2,z2);\
+    y2          = _mm256_unpackhi_ps(y2,z2);\
+    z2          = _mm256_unpacklo_ps(x3,y3);\
+    x3          = _mm256_unpackhi_ps(x3,y3);\
+    y3          = _mm256_unpacklo_ps(z3,x4);\
+    z3          = _mm256_unpackhi_ps(z3,x4);\
+    x4          = _mm256_unpacklo_ps(y4,z4);\
+    y4          = _mm256_unpackhi_ps(y4,z4);\
+    x2          = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); /* low lane of _t5 below, low lane of x2 on top */\
+    x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);\
+    y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1);\
+    z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);\
+    z2          = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(1,0,1,0)); /* element 0 of x1 y1 z1 x2 | y2 z2 x3 y3 */\
+    _t5         = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(3,2,3,2)); /* element 1, same order */\
+    y1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0)); /* element 2, same order */\
+    x1          = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2)); /* element 3, same order */\
+    _tE         = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(1,0,1,0)); /* element 0 of z3 x4 y4 z4 */\
+    _tF         = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(3,2,3,2)); /* element 1 of z3 x4 y4 z4 */\
+    _tG         = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(1,0,1,0)); /* element 2 of z3 x4 y4 z4 */\
+    _tH         = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(3,2,3,2)); /* element 3 of z3 x4 y4 z4 */\
+    _t1         = _mm256_sub_ps(_t1,z2); /* ptrA data minus element 0 */\
+    _t2         = _mm256_sub_ps(_t2,_t5); /* ptrB data minus element 1 */\
+    _t3         = _mm256_sub_ps(_t3,y1); /* ptrC data minus element 2 */\
+    _t4         = _mm256_sub_ps(_t4,x1); /* ptrD data minus element 3 */\
+    _tA         = _mm_sub_ps(_tA,_tE);\
+    _tB         = _mm_sub_ps(_tB,_tF);\
+    _tC         = _mm_sub_ps(_tC,_tG);\
+    _tD         = _mm_sub_ps(_tD,_tH);\
+    _mm256_storeu_ps(ptrA,_t1); /* write floats 0..7 back */\
+    _mm256_storeu_ps(ptrB,_t2);\
+    _mm256_storeu_ps(ptrC,_t3);\
+    _mm256_storeu_ps(ptrD,_t4);\
+    _mm_storeu_ps(ptrA+8,_tA); /* write floats 8..11 back */\
+    _mm_storeu_ps(ptrB+8,_tB);\
+    _mm_storeu_ps(ptrC+8,_tC);\
+    _mm_storeu_ps(ptrD+8,_tD);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3,
-                                          __m256 x4, __m256 y4, __m256 z4)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3,
+        __m256 x4, __m256 y4, __m256 z4)
  {
      __m256 t1,t2,t3,t4,t5;
      __m128 tA,tB,tC,tD,tE,tF,tG,tH;
@@ -745,15 +855,15 @@ gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      _mm_storeu_ps(ptrC+8,tC);
      _mm_storeu_ps(ptrD+8,tD);
  }
-
+#endif
  
  
  static gmx_inline void
  gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
-                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
-                                          __m256 x1, __m256 y1, __m256 z1)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+        float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+        __m256 x1, __m256 y1, __m256 z1)
  {
      __m256 t1,t2,t3,t4,t5,t6;
      __m256 tA,tB,tC,tD;
@@ -791,14 +901,91 @@ gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
  
  
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. Subtracts from 9 floats at each of ptrA-ptrH; the _x1.._z3 arguments are only read, but every pointer argument is expanded several times, so pass side-effect-free expressions. */
+#define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD,ptrE,ptrF,ptrG,ptrH,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{ \
+    __m256 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+    __m256 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+\
+    _tA         = _mm256_loadu_ps(ptrA); /* current floats 0..7 behind each pointer (unaligned) */\
+    _tB         = _mm256_loadu_ps(ptrB);\
+    _tC         = _mm256_loadu_ps(ptrC);\
+    _tD         = _mm256_loadu_ps(ptrD);\
+    _tE         = _mm256_loadu_ps(ptrE);\
+    _tF         = _mm256_loadu_ps(ptrF);\
+    _tG         = _mm256_loadu_ps(ptrG);\
+    _tH         = _mm256_loadu_ps(ptrH);\
+    _t1         = _mm256_unpacklo_ps(_x1,_y1); /* interleave pairs, elements 0-1 of each 128-bit lane */\
+    _t2         = _mm256_unpackhi_ps(_x1,_y1); /* interleave pairs, elements 2-3 of each 128-bit lane */\
+    _t3         = _mm256_unpacklo_ps(_z1,_x2);\
+    _t4         = _mm256_unpackhi_ps(_z1,_x2);\
+    _t5         = _mm256_unpacklo_ps(_y2,_z2);\
+    _t6         = _mm256_unpackhi_ps(_y2,_z2);\
+    _t7         = _mm256_unpacklo_ps(_x3,_y3);\
+    _t8         = _mm256_unpackhi_ps(_x3,_y3);\
+    _t9         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0)); /* x1 y1 z1 x2 of element 0 | element 4 */\
+    _t10        = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2)); /* x1 y1 z1 x2 of element 1 | element 5 */\
+    _t11        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0)); /* x1 y1 z1 x2 of element 2 | element 6 */\
+    _t12        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2)); /* x1 y1 z1 x2 of element 3 | element 7 */\
+    _t1         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(1,0,1,0)); /* y2 z2 x3 y3 of element 0 | element 4 */\
+    _t2         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(3,2,3,2)); /* y2 z2 x3 y3 of element 1 | element 5 */\
+    _t3         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(1,0,1,0)); /* y2 z2 x3 y3 of element 2 | element 6 */\
+    _t4         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(3,2,3,2)); /* y2 z2 x3 y3 of element 3 | element 7 */\
+    _t5         = gmx_mm256_unpack128lo_ps(_t9,_t1); /* helper defined elsewhere; presumably joins the two low lanes (element 0) - TODO confirm */\
+    _t6         = gmx_mm256_unpack128hi_ps(_t9,_t1); /* presumably joins the two high lanes (element 4) */\
+    _t7         = gmx_mm256_unpack128lo_ps(_t10,_t2);\
+    _t8         = gmx_mm256_unpack128hi_ps(_t10,_t2);\
+    _t1         = gmx_mm256_unpack128lo_ps(_t11,_t3);\
+    _t2         = gmx_mm256_unpack128hi_ps(_t11,_t3);\
+    _t9         = gmx_mm256_unpack128lo_ps(_t12,_t4);\
+    _t10        = gmx_mm256_unpack128hi_ps(_t12,_t4);\
+    _tA         = _mm256_sub_ps(_tA,_t5); /* under that assumption ptrA..ptrH receive elements 0..7 in order */\
+    _tB         = _mm256_sub_ps(_tB,_t7);\
+    _tC         = _mm256_sub_ps(_tC,_t1);\
+    _tD         = _mm256_sub_ps(_tD,_t9);\
+    _tE         = _mm256_sub_ps(_tE,_t6);\
+    _tF         = _mm256_sub_ps(_tF,_t8);\
+    _tG         = _mm256_sub_ps(_tG,_t2);\
+    _tH         = _mm256_sub_ps(_tH,_t10);\
+    _mm256_storeu_ps(ptrA,_tA); /* write floats 0..7 back */\
+    _mm256_storeu_ps(ptrB,_tB);\
+    _mm256_storeu_ps(ptrC,_tC);\
+    _mm256_storeu_ps(ptrD,_tD);\
+    _mm256_storeu_ps(ptrE,_tE);\
+    _mm256_storeu_ps(ptrF,_tF);\
+    _mm256_storeu_ps(ptrG,_tG);\
+    _mm256_storeu_ps(ptrH,_tH);\
+    _tI         = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8)); /* float 8 of two pointers; helper presumably takes (high,low) halves - TODO confirm */\
+    _tJ         = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));\
+    _tK         = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));\
+    _tL         = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));\
+    _tI         = _mm256_unpacklo_ps(_tI,_tK); /* gather the eight scalars so they line up with _z3 */\
+    _tJ         = _mm256_unpacklo_ps(_tJ,_tL);\
+    _tI         = _mm256_unpacklo_ps(_tI,_tJ);\
+    _tI         = _mm256_sub_ps(_tI,_z3);\
+    _tJ         = _mm256_permute_ps(_tI,_MM_SHUFFLE(1,1,1,1)); /* broadcast element 1 within each lane */\
+    _tK         = _mm256_permute_ps(_tI,_MM_SHUFFLE(2,2,2,2)); /* broadcast element 2 within each lane */\
+    _tL         = _mm256_permute_ps(_tI,_MM_SHUFFLE(3,3,3,3)); /* broadcast element 3 within each lane */\
+    _mm_store_ss(ptrA+8,_mm256_castps256_ps128(_tI)); /* low lane: scatter float 8 back to ptrA..ptrD */\
+    _mm_store_ss(ptrB+8,_mm256_castps256_ps128(_tJ));\
+    _mm_store_ss(ptrC+8,_mm256_castps256_ps128(_tK));\
+    _mm_store_ss(ptrD+8,_mm256_castps256_ps128(_tL));\
+    _mm_store_ss(ptrE+8,_mm256_extractf128_ps(_tI,0x1)); /* high lane: scatter float 8 back to ptrE..ptrH */\
+    _mm_store_ss(ptrF+8,_mm256_extractf128_ps(_tJ,0x1));\
+    _mm_store_ss(ptrG+8,_mm256_extractf128_ps(_tK,0x1));\
+    _mm_store_ss(ptrH+8,_mm256_extractf128_ps(_tL,0x1));\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
-                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+        float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3)
  {
      __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
      __m256 tA,tB,tC,tD,tE,tF,tG,tH;
@@ -859,12 +1046,12 @@ gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      _mm256_storeu_ps(ptrF,tF);
      _mm256_storeu_ps(ptrG,tG);
      _mm256_storeu_ps(ptrH,tH);
-    
+
      tI          = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));
      tJ          = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));
      tK          = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));
      tL          = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));
-    
+
      tI          = _mm256_unpacklo_ps(tI,tK);  /*  -  - zG zE |  -  - zC zA */
      tJ          = _mm256_unpacklo_ps(tJ,tL);  /*  -  - zH zF |  -  - zD zB */
      tI          = _mm256_unpacklo_ps(tI,tJ);  /* zH zG zF zE | zD zC zB zA */
@@ -883,17 +1070,102 @@ gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      _mm_store_ss(ptrG+8,_mm256_extractf128_ps(tK,0x1));
      _mm_store_ss(ptrH+8,_mm256_extractf128_ps(tL,0x1));
  }
-
-
+#endif
+
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. Subtracts from 12 floats at each of ptrA-ptrH; the _x1.._z4 arguments are only read, but every pointer argument is expanded several times, so pass side-effect-free expressions. */
+#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD,ptrE,ptrF,ptrG,ptrH, \
+                                                  _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m256 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+    __m256 _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+\
+    _tA         = _mm256_loadu_ps(ptrA); /* current floats 0..7 behind each pointer (unaligned) */\
+    _tB         = _mm256_loadu_ps(ptrB);\
+    _tC         = _mm256_loadu_ps(ptrC);\
+    _tD         = _mm256_loadu_ps(ptrD);\
+    _tE         = _mm256_loadu_ps(ptrE);\
+    _tF         = _mm256_loadu_ps(ptrF);\
+    _tG         = _mm256_loadu_ps(ptrG);\
+    _tH         = _mm256_loadu_ps(ptrH);\
+    _t1         = _mm256_unpacklo_ps(_x1,_y1); /* interleave pairs, elements 0-1 of each 128-bit lane */\
+    _t2         = _mm256_unpackhi_ps(_x1,_y1); /* interleave pairs, elements 2-3 of each 128-bit lane */\
+    _t3         = _mm256_unpacklo_ps(_z1,_x2);\
+    _t4         = _mm256_unpackhi_ps(_z1,_x2);\
+    _t5         = _mm256_unpacklo_ps(_y2,_z2);\
+    _t6         = _mm256_unpackhi_ps(_y2,_z2);\
+    _t7         = _mm256_unpacklo_ps(_x3,_y3);\
+    _t8         = _mm256_unpackhi_ps(_x3,_y3);\
+    _t9         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0)); /* x1 y1 z1 x2 of element 0 | element 4 */\
+    _t10        = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2)); /* x1 y1 z1 x2 of element 1 | element 5 */\
+    _t11        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0)); /* x1 y1 z1 x2 of element 2 | element 6 */\
+    _t12        = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2)); /* x1 y1 z1 x2 of element 3 | element 7 */\
+    _t1         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(1,0,1,0)); /* y2 z2 x3 y3 of element 0 | element 4 */\
+    _t2         = _mm256_shuffle_ps(_t5,_t7,_MM_SHUFFLE(3,2,3,2)); /* y2 z2 x3 y3 of element 1 | element 5 */\
+    _t3         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(1,0,1,0)); /* y2 z2 x3 y3 of element 2 | element 6 */\
+    _t4         = _mm256_shuffle_ps(_t6,_t8,_MM_SHUFFLE(3,2,3,2)); /* y2 z2 x3 y3 of element 3 | element 7 */\
+    _t5         = gmx_mm256_unpack128lo_ps(_t9,_t1); /* helper defined elsewhere; presumably joins the two low lanes (element 0) - TODO confirm */\
+    _t6         = gmx_mm256_unpack128hi_ps(_t9,_t1); /* presumably joins the two high lanes (element 4) */\
+    _t7         = gmx_mm256_unpack128lo_ps(_t10,_t2);\
+    _t8         = gmx_mm256_unpack128hi_ps(_t10,_t2);\
+    _t1         = gmx_mm256_unpack128lo_ps(_t11,_t3);\
+    _t2         = gmx_mm256_unpack128hi_ps(_t11,_t3);\
+    _t9         = gmx_mm256_unpack128lo_ps(_t12,_t4);\
+    _t10        = gmx_mm256_unpack128hi_ps(_t12,_t4);\
+    _tA         = _mm256_sub_ps(_tA,_t5); /* under that assumption ptrA..ptrH receive elements 0..7 in order */\
+    _tB         = _mm256_sub_ps(_tB,_t7);\
+    _tC         = _mm256_sub_ps(_tC,_t1);\
+    _tD         = _mm256_sub_ps(_tD,_t9);\
+    _tE         = _mm256_sub_ps(_tE,_t6);\
+    _tF         = _mm256_sub_ps(_tF,_t8);\
+    _tG         = _mm256_sub_ps(_tG,_t2);\
+    _tH         = _mm256_sub_ps(_tH,_t10);\
+    _mm256_storeu_ps(ptrA,_tA); /* write floats 0..7 back */\
+    _mm256_storeu_ps(ptrB,_tB);\
+    _mm256_storeu_ps(ptrC,_tC);\
+    _mm256_storeu_ps(ptrD,_tD);\
+    _mm256_storeu_ps(ptrE,_tE);\
+    _mm256_storeu_ps(ptrF,_tF);\
+    _mm256_storeu_ps(ptrG,_tG);\
+    _mm256_storeu_ps(ptrH,_tH);\
+    _tI         = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8),_mm_loadu_ps(ptrA+8)); /* floats 8..11 of two pointers; helper presumably takes (high,low) halves - TODO confirm */\
+    _tJ         = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));\
+    _tK         = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));\
+    _tL         = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));\
+    _t1         = _mm256_unpacklo_ps(_z3,_x4); /* same interleave/shuffle transpose for the remaining z3 x4 y4 z4 */\
+    _t2         = _mm256_unpackhi_ps(_z3,_x4);\
+    _t3         = _mm256_unpacklo_ps(_y4,_z4);\
+    _t4         = _mm256_unpackhi_ps(_y4,_z4);\
+    _t5         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(1,0,1,0)); /* z3 x4 y4 z4 of element 0 | element 4 */\
+    _t6         = _mm256_shuffle_ps(_t1,_t3,_MM_SHUFFLE(3,2,3,2)); /* z3 x4 y4 z4 of element 1 | element 5 */\
+    _t7         = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(1,0,1,0)); /* z3 x4 y4 z4 of element 2 | element 6 */\
+    _t8         = _mm256_shuffle_ps(_t2,_t4,_MM_SHUFFLE(3,2,3,2)); /* z3 x4 y4 z4 of element 3 | element 7 */\
+    _tI         = _mm256_sub_ps(_tI,_t5);\
+    _tJ         = _mm256_sub_ps(_tJ,_t6);\
+    _tK         = _mm256_sub_ps(_tK,_t7);\
+    _tL         = _mm256_sub_ps(_tL,_t8);\
+    _mm_storeu_ps(ptrA+8,_mm256_castps256_ps128(_tI)); /* low lanes: write floats 8..11 back to ptrA..ptrD */\
+    _mm_storeu_ps(ptrB+8,_mm256_castps256_ps128(_tJ));\
+    _mm_storeu_ps(ptrC+8,_mm256_castps256_ps128(_tK));\
+    _mm_storeu_ps(ptrD+8,_mm256_castps256_ps128(_tL));\
+    _mm_storeu_ps(ptrE+8,_mm256_extractf128_ps(_tI,0x1)); /* high lanes: write floats 8..11 back to ptrE..ptrH */\
+    _mm_storeu_ps(ptrF+8,_mm256_extractf128_ps(_tJ,0x1));\
+    _mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(_tK,0x1));\
+    _mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(_tL,0x1));\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
-                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
-                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
-                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
-                                          __m256 x1, __m256 y1, __m256 z1,
-                                          __m256 x2, __m256 y2, __m256 z2,
-                                          __m256 x3, __m256 y3, __m256 z3,
-                                          __m256 x4, __m256 y4, __m256 z4)
+        float * gmx_restrict ptrC, float * gmx_restrict ptrD,
+        float * gmx_restrict ptrE, float * gmx_restrict ptrF,
+        float * gmx_restrict ptrG, float * gmx_restrict ptrH,
+        __m256 x1, __m256 y1, __m256 z1,
+        __m256 x2, __m256 y2, __m256 z2,
+        __m256 x3, __m256 y3, __m256 z3,
+        __m256 x4, __m256 y4, __m256 z4)
  {
      __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
      __m256 tA,tB,tC,tD,tE,tF,tG,tH;
@@ -959,7 +1231,7 @@ gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      tJ          = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));
      tK          = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));
      tL          = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));
-    
+
      t1          = _mm256_unpacklo_ps(z3,x4); /* x4f z3f x4e z3e | x4b z3b x4a z3a */
      t2          = _mm256_unpackhi_ps(z3,x4); /* x4h z3h x4g z3g | x4d z3d x4c z3c */
      t3          = _mm256_unpacklo_ps(y4,z4); /* z4f y4f z4e y4e | z4b y4b z4a y4a */
@@ -984,13 +1256,13 @@ gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      _mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(tK,0x1));
      _mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(tL,0x1));
  }
-
+#endif
  
  
  static gmx_inline void
  gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
-                                      float * gmx_restrict fptr,
-                                      float * gmx_restrict fshiftptr)
+        float * gmx_restrict fptr,
+        float * gmx_restrict fshiftptr)
  {
      __m128 t1,t2,t3;
  
@@ -1000,7 +1272,7 @@ gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
  
      /* Add across the two lanes */
      t1   = _mm_add_ps(_mm256_castps256_ps128(fix1),_mm256_extractf128_ps(fix1,0x1));
-    
+
      t2 = _mm_load_ss(fptr);
      t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
      t3 = _mm_load_ss(fshiftptr);
@@ -1015,12 +1287,53 @@ gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
      _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
  }
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. NOTE: unlike the function version, this overwrites fix1, fiz1, fiy2, fix3 and fiz3 in the caller. */
+#define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                                 fptr,fshiftptr) \
+{ \
+    __m256 _t1,_t2,_t3;\
+    __m128 _tA,_tB,_tC;\
+\
+    fix1 = _mm256_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm256_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm256_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm256_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());\
+    fix1 = _mm256_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm256_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());\
+\
+    _t1  = gmx_mm256_unpack128lo_ps(fix1,fiy2);\
+    _t2  = gmx_mm256_unpack128hi_ps(fix1,fiy2);\
+    _t1  = _mm256_add_ps(_t1,_t2);\
+    _tA  = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));\
+    _t3  = _mm256_loadu_ps(fptr);\
+    _t3  = _mm256_add_ps(_t3,_t1);\
+    _mm256_storeu_ps(fptr,_t3);\
+    _tB  = _mm_load_ss(fptr+8);\
+    _tB  = _mm_add_ss(_tB,_tA);\
+    _mm_store_ss(fptr+8,_tB);\
+\
+    _tB  = _mm256_extractf128_ps(_t1,0x1);\
+    _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1),_tB,_MM_SHUFFLE(1,0,3,3));\
+    _tB  = _mm_shuffle_ps(_tB,_tA,_MM_SHUFFLE(1,0,3,2));\
+    _tC  = _mm_permute_ps(_tC,_MM_SHUFFLE(3,3,2,0));\
+    _tB  = _mm_add_ps(_tB,_mm256_castps256_ps128(_t1));\
+    _tA  = _mm_add_ps(_tB,_tC);\
+    _tA  = _mm_blend_ps(_mm_setzero_ps(),_tA,0x7);\
+    _tC  = _mm_loadu_ps(fshiftptr);\
+    _tC  = _mm_add_ps(_tC,_tA);\
+    _mm_storeu_ps(fshiftptr,_tC);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
-                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
-                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
-                                         float * gmx_restrict fptr,
-                                         float * gmx_restrict fshiftptr)
+        __m256 fix2, __m256 fiy2, __m256 fiz2,
+        __m256 fix3, __m256 fiy3, __m256 fiz3,
+        float * gmx_restrict fptr,
+        float * gmx_restrict fshiftptr)
  {
      __m256 t1,t2,t3;
      __m128 tA,tB,tC;
@@ -1057,22 +1370,68 @@ gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
  
      tB   = _mm_add_ps(tB,_mm256_castps256_ps128(t1));
      tA   = _mm_add_ps(tB,tC); /*  - z y x */
-    
+
      tA   = _mm_blend_ps(_mm_setzero_ps(),tA,0x7); /* 0 z y x */
  
      tC   = _mm_loadu_ps(fshiftptr);
      tC   = _mm_add_ps(tC,tA);
      _mm_storeu_ps(fshiftptr,tC);
  }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
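+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. NOTE: unlike the function version, this overwrites fix1, fiz1, fiy2, fix3, fiz3 and fiy4 in the caller. */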
+#define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                                fptr,fshiftptr) \
+{ \
+    __m256 _t1,_t2,_t3; \
+    __m128 _tA,_tB,_tC; \
+\
+    fix1 = _mm256_hadd_ps(fix1,fiy1);\
+    fiz1 = _mm256_hadd_ps(fiz1,fix2);\
+    fiy2 = _mm256_hadd_ps(fiy2,fiz2);\
+    fix3 = _mm256_hadd_ps(fix3,fiy3);\
+    fiz3 = _mm256_hadd_ps(fiz3,fix4);\
+    fiy4 = _mm256_hadd_ps(fiy4,fiz4);\
+\
+    fix1 = _mm256_hadd_ps(fix1,fiz1);\
+    fiy2 = _mm256_hadd_ps(fiy2,fix3);\
+    fiz3 = _mm256_hadd_ps(fiz3,fiy4);\
+\
+    _t1  = gmx_mm256_unpack128lo_ps(fix1,fiy2);\
+    _t2  = gmx_mm256_unpack128hi_ps(fix1,fiy2);\
+    _t1  = _mm256_add_ps(_t1,_t2);\
+    _tA  = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));\
+    _t3  = _mm256_loadu_ps(fptr);\
+    _t3  = _mm256_add_ps(_t3,_t1);\
+    _mm256_storeu_ps(fptr,_t3);\
+    _tB  = _mm_loadu_ps(fptr+8);\
+    _tB  = _mm_add_ps(_tB,_tA);\
+    _mm_storeu_ps(fptr+8,_tB);\
+\
+    _tB  = _mm256_extractf128_ps(_t1,0x1);\
+    _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1),_tB,_MM_SHUFFLE(1,0,3,3));\
+    _tB  = _mm_shuffle_ps(_tB,_tA,_MM_SHUFFLE(1,0,3,2));\
+    _tC  = _mm_permute_ps(_tC,_MM_SHUFFLE(3,3,2,0));\
+    _tA  = _mm_permute_ps(_tA,_MM_SHUFFLE(0,3,2,1));\
+    _tB  = _mm_add_ps(_tB,_mm256_castps256_ps128(_t1));\
+    _tA  = _mm_add_ps(_tA,_tC);\
+    _tA  = _mm_add_ps(_tA,_tB);\
+    _tA  = _mm_blend_ps(_mm_setzero_ps(),_tA,0x7);\
+    _tC  = _mm_loadu_ps(fshiftptr);\
+    _tC  = _mm_add_ps(_tC,_tA);\
+    _mm_storeu_ps(fshiftptr,_tC);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
-                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
-                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
-                                         __m256 fix4, __m256 fiy4, __m256 fiz4,
-                                         float * gmx_restrict fptr,
-                                         float * gmx_restrict fshiftptr)
+        __m256 fix2, __m256 fiy2, __m256 fiz2,
+        __m256 fix3, __m256 fiy3, __m256 fiz3,
+        __m256 fix4, __m256 fiy4, __m256 fiz4,
+        float * gmx_restrict fptr,
+        float * gmx_restrict fshiftptr)
  {
      __m256 t1,t2,t3;
      __m128 tA,tB,tC;
@@ -1120,6 +1479,7 @@ gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
      tC   = _mm_add_ps(tC,tA);
      _mm_storeu_ps(fshiftptr,tC);
  }
+#endif
  
  
  
@@ -1153,26 +1513,4 @@ gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
-gmx_mm256_update_4pot_ps(__m256 pot1, float * gmx_restrict ptrA,
-                         __m256 pot2, float * gmx_restrict ptrB,
-                         __m256 pot3, float * gmx_restrict ptrC,
-                         __m256 pot4, float * gmx_restrict ptrD)
-{
-    __m128 t1,t2,t3,t4;
-
-    pot1 = _mm256_hadd_ps(pot1,pot2);
-    pot3 = _mm256_hadd_ps(pot3,pot4);
-    pot1 = _mm256_hadd_ps(pot1,pot3);
-    t1   = _mm_add_ps(_mm256_castps256_ps128(pot1),_mm256_extractf128_ps(pot1,0x1));
-    t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
-    t3   = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
-    t4   = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
-    _mm_store_ss(ptrA,_mm_add_ss(_mm_load_ss(ptrA),t1));
-    _mm_store_ss(ptrB,_mm_add_ss(_mm_load_ss(ptrB),t2));
-    _mm_store_ss(ptrC,_mm_add_ss(_mm_load_ss(ptrC),t3));
-    _mm_store_ss(ptrD,_mm_add_ss(_mm_load_ss(ptrD),t4));
-}
-
-
  #endif /* _kernelutil_x86_avx_256_single_h_ */
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h b/src/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h

index 006439173d4e8011d395e3195bbdd33580354c0a..35fb80eafc4936c869567813feb2868270545f5b 100644 (file)
--- a/src/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h
+++ b/src/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h
@@ -138,10 +138,10 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
  
  static gmx_inline void
  gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1,
-                                         __m128d * gmx_restrict y1,
-                                         __m128d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1,
+        __m128d * gmx_restrict y1,
+        __m128d * gmx_restrict z1)
  {
      __m128d mem_xy,mem_z,mem_sxy,mem_sz;
  
@@ -161,10 +161,10 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
  {
      __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
  
@@ -199,11 +199,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
-                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+        __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
  {
      __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
  
@@ -247,9 +247,9 @@ static gmx_inline void
  gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
  {
-        *x            = _mm_load_sd(p1);
-     *y            = _mm_load_sd(p1+1);
-     *z            = _mm_load_sd(p1+2);
+    *x            = _mm_load_sd(p1);
+    *y            = _mm_load_sd(p1+1);
+    *z            = _mm_load_sd(p1+2);
  }
  
  static gmx_inline void
@@ -258,15 +258,15 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                    __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
  {
-        *x1            = _mm_load_sd(p1);
-     *y1            = _mm_load_sd(p1+1);
-     *z1            = _mm_load_sd(p1+2);
-        *x2            = _mm_load_sd(p1+3);
-     *y2            = _mm_load_sd(p1+4);
-     *z2            = _mm_load_sd(p1+5);
-        *x3            = _mm_load_sd(p1+6);
-     *y3            = _mm_load_sd(p1+7);
-     *z3            = _mm_load_sd(p1+8);
+    *x1            = _mm_load_sd(p1);
+    *y1            = _mm_load_sd(p1+1);
+    *z1            = _mm_load_sd(p1+2);
+    *x2            = _mm_load_sd(p1+3);
+    *y2            = _mm_load_sd(p1+4);
+    *z2            = _mm_load_sd(p1+5);
+    *x3            = _mm_load_sd(p1+6);
+    *y3            = _mm_load_sd(p1+7);
+    *z3            = _mm_load_sd(p1+8);
  }
  
  static gmx_inline void
@@ -385,7 +385,7 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
  /* Routines to decrement rvec in memory, typically use for j particle force updates */
  static gmx_inline void
  gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy, __m128d z)
+        __m128d xy, __m128d z)
  {
      __m128d t1,t2;
  
@@ -399,77 +399,6 @@ gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
      _mm_store_sd(ptrA+2,t2);
  }
  
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3)
-{
-    __m128d t1,t2;
-    __m128d tA,tB,tC,tD,tE;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_load_sd(ptrA+8);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_sd(tE,z3);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_store_sd(ptrA+8,tE);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3,
-                                         __m128d xy4, __m128d z4)
-{
-    __m128d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_loadu_pd(ptrA+8);
-    tF   = _mm_loadu_pd(ptrA+10);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    t3   = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
-    t4   = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_pd(tE,t3);
-    tF   = _mm_sub_pd(tF,t4);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_storeu_pd(ptrA+8,tE);
-    _mm_storeu_pd(ptrA+10,tF);
-}
  
  static gmx_inline void
  gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
@@ -490,6 +419,33 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
  }
  
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. NOTE: unlike the function version, this overwrites the x1, z1, y2 and x3 arguments in the caller. */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_load_sd(ptrA+8);\
+    _x1          = _mm_unpacklo_pd(_x1,_y1);\
+    _z1          = _mm_unpacklo_pd(_z1,_x2);\
+    _y2          = _mm_unpacklo_pd(_y2,_z2);\
+    _x3          = _mm_unpacklo_pd(_x3,_y3);\
+    _t1          = _mm_sub_pd(_t1,_x1);\
+    _t2          = _mm_sub_pd(_t2,_z1);\
+    _t3          = _mm_sub_pd(_t3,_y2);\
+    _t4          = _mm_sub_pd(_t4,_x3);\
+    _t5          = _mm_sub_sd(_t5,_z3);\
+    _mm_storeu_pd(ptrA,_t1);\
+    _mm_storeu_pd(ptrA+2,_t2);\
+    _mm_storeu_pd(ptrA+4,_t3);\
+    _mm_storeu_pd(ptrA+6,_t4);\
+    _mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
@@ -521,8 +477,35 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+6,t4);
      _mm_store_sd(ptrA+8,t5);
  }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
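+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. NOTE: unlike the function version, this overwrites the x1, z1, y2, x3, z3 and y4 arguments in the caller. */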
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_loadu_pd(ptrA+8);\
+    _t6          = _mm_loadu_pd(ptrA+10);\
+    _x1          = _mm_unpacklo_pd(_x1,_y1);\
+    _z1          = _mm_unpacklo_pd(_z1,_x2);\
+    _y2          = _mm_unpacklo_pd(_y2,_z2);\
+    _x3          = _mm_unpacklo_pd(_x3,_y3);\
+    _z3          = _mm_unpacklo_pd(_z3,_x4);\
+    _y4          = _mm_unpacklo_pd(_y4,_z4);\
+    _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
+    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
+    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
+    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
+    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
+    _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
@@ -553,6 +536,8 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5,z3 ));
      _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
  }
+#endif
+
  
  static gmx_inline void
  gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
@@ -581,6 +566,54 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_store_sd(ptrB+2,t4);
  }
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+    __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_load_sd(ptrA+8);\
+    _t6          = _mm_loadu_pd(ptrB);\
+    _t7          = _mm_loadu_pd(ptrB+2);\
+    _t8          = _mm_loadu_pd(ptrB+4);\
+    _t9          = _mm_loadu_pd(ptrB+6);\
+    _t10         = _mm_load_sd(ptrB+8);\
+    _tA          = _mm_unpacklo_pd(_x1,_y1);\
+    _tB          = _mm_unpackhi_pd(_x1,_y1);\
+    _tC          = _mm_unpacklo_pd(_z1,_x2);\
+    _tD          = _mm_unpackhi_pd(_z1,_x2);\
+    _tE          = _mm_unpacklo_pd(_y2,_z2);\
+    _tF          = _mm_unpackhi_pd(_y2,_z2);\
+    _tG          = _mm_unpacklo_pd(_x3,_y3);\
+    _tH          = _mm_unpackhi_pd(_x3,_y3);\
+    _tI          = _mm_unpackhi_pd(_z3,_z3);\
+    _t1          = _mm_sub_pd(_t1,_tA);\
+    _t2          = _mm_sub_pd(_t2,_tC);\
+    _t3          = _mm_sub_pd(_t3,_tE);\
+    _t4          = _mm_sub_pd(_t4,_tG);\
+    _t5          = _mm_sub_sd(_t5,_z3);\
+    _t6          = _mm_sub_pd(_t6,_tB);\
+    _t7          = _mm_sub_pd(_t7,_tD);\
+    _t8          = _mm_sub_pd(_t8,_tF);\
+    _t9          = _mm_sub_pd(_t9,_tH);\
+    _t10         = _mm_sub_sd(_t10,_tI);\
+    _mm_storeu_pd(ptrA,_t1);\
+    _mm_storeu_pd(ptrA+2,_t2);\
+    _mm_storeu_pd(ptrA+4,_t3);\
+    _mm_storeu_pd(ptrA+6,_t4);\
+    _mm_store_sd(ptrA+8,_t5);\
+    _mm_storeu_pd(ptrB,_t6);\
+    _mm_storeu_pd(ptrB+2,_t7);\
+    _mm_storeu_pd(ptrB+4,_t8);\
+    _mm_storeu_pd(ptrB+6,_t9);\
+    _mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
@@ -634,8 +667,66 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_storeu_pd(ptrB+6,t9);
      _mm_store_sd(ptrB+8,t10);
  }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+    __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+    __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+    _t1          = _mm_loadu_pd(ptrA);\
+    _t2          = _mm_loadu_pd(ptrA+2);\
+    _t3          = _mm_loadu_pd(ptrA+4);\
+    _t4          = _mm_loadu_pd(ptrA+6);\
+    _t5          = _mm_loadu_pd(ptrA+8);\
+    _t6          = _mm_loadu_pd(ptrA+10);\
+    _t7          = _mm_loadu_pd(ptrB);\
+    _t8          = _mm_loadu_pd(ptrB+2);\
+    _t9          = _mm_loadu_pd(ptrB+4);\
+    _t10         = _mm_loadu_pd(ptrB+6);\
+    _t11         = _mm_loadu_pd(ptrB+8);\
+    _t12         = _mm_loadu_pd(ptrB+10);\
+    _tA          = _mm_unpacklo_pd(_x1,_y1);\
+    _tB          = _mm_unpackhi_pd(_x1,_y1);\
+    _tC          = _mm_unpacklo_pd(_z1,_x2);\
+    _tD          = _mm_unpackhi_pd(_z1,_x2);\
+    _tE          = _mm_unpacklo_pd(_y2,_z2);\
+    _tF          = _mm_unpackhi_pd(_y2,_z2);\
+    _tG          = _mm_unpacklo_pd(_x3,_y3);\
+    _tH          = _mm_unpackhi_pd(_x3,_y3);\
+    _tI          = _mm_unpacklo_pd(_z3,_x4);\
+    _tJ          = _mm_unpackhi_pd(_z3,_x4);\
+    _tK          = _mm_unpacklo_pd(_y4,_z4);\
+    _tL          = _mm_unpackhi_pd(_y4,_z4);\
+    _t1          = _mm_sub_pd(_t1,_tA);\
+    _t2          = _mm_sub_pd(_t2,_tC);\
+    _t3          = _mm_sub_pd(_t3,_tE);\
+    _t4          = _mm_sub_pd(_t4,_tG);\
+    _t5          = _mm_sub_pd(_t5,_tI);\
+    _t6          = _mm_sub_pd(_t6,_tK);\
+    _t7          = _mm_sub_pd(_t7,_tB);\
+    _t8          = _mm_sub_pd(_t8,_tD);\
+    _t9          = _mm_sub_pd(_t9,_tF);\
+    _t10         = _mm_sub_pd(_t10,_tH);\
+    _t11         = _mm_sub_pd(_t11,_tJ);\
+    _t12         = _mm_sub_pd(_t12,_tL);\
+    _mm_storeu_pd(ptrA,  _t1);\
+    _mm_storeu_pd(ptrA+2,_t2);\
+    _mm_storeu_pd(ptrA+4,_t3);\
+    _mm_storeu_pd(ptrA+6,_t4);\
+    _mm_storeu_pd(ptrA+8,_t5);\
+    _mm_storeu_pd(ptrA+10,_t6);\
+    _mm_storeu_pd(ptrB,  _t7);\
+    _mm_storeu_pd(ptrB+2,_t8);\
+    _mm_storeu_pd(ptrB+4,_t9);\
+    _mm_storeu_pd(ptrB+6,_t10);\
+    _mm_storeu_pd(ptrB+8,_t11);\
+    _mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
@@ -699,6 +790,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_storeu_pd(ptrB+8,t11);
      _mm_storeu_pd(ptrB+10,t12);
  }
+#endif
  
  
  
@@ -726,6 +818,39 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
  
+#if defined (_MSC_VER) && defined(_M_IX86)
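+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. NOTE: unlike the function version, this modifies the force arguments (e.g. fix1, fiz1, fiy2, fix3, fiy3, fiz3) in the caller. */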
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
+    GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
+    GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
+    _t1 = fix3;\
+    fix3 = _mm_unpacklo_pd(fix3,fiy3);\
+    fiy3 = _mm_unpackhi_pd(_t1,fiy3);\
+    fix1 = _mm_add_pd(fix1,fiy1);\
+    fiz1 = _mm_add_pd(fiz1,fix2);\
+    fiy2 = _mm_add_pd(fiy2,fiz2);\
+    fix3 = _mm_add_pd(fix3,fiy3);\
+    fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3,fiz3));\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    _t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    fiz1 = _mm_add_sd(fiz1,_t2);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -767,8 +892,46 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
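+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters. NOTE: unlike the function version, this modifies the force arguments (e.g. fix1, fiz1, fiy2, fix3, fiz3, fiy4) in the caller. */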
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
+    GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
+    GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
+    GMX_MM_TRANSPOSE2_PD(fix3,fiy3);\
+    GMX_MM_TRANSPOSE2_PD(fiz3,fix4);\
+    GMX_MM_TRANSPOSE2_PD(fiy4,fiz4);\
+    fix1 = _mm_add_pd(fix1,fiy1);\
+    fiz1 = _mm_add_pd(fiz1,fix2);\
+    fiy2 = _mm_add_pd(fiy2,fiz2);\
+    fix3 = _mm_add_pd(fix3,fiy3);\
+    fiz3 = _mm_add_pd(fiz3,fix4);\
+    fiy4 = _mm_add_pd(fiy4,fiz4);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));\
+    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));\
+    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+    _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+    fix3 = _mm_add_pd(fix3,_t2);\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -793,7 +956,7 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      fix3 = _mm_add_pd(fix3,fiy3);
      fiz3 = _mm_add_pd(fiz3,fix4);
      fiy4 = _mm_add_pd(fiy4,fiz4);
-    
+
      _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));
      _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));
      _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));
@@ -814,7 +977,7 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
-
+#endif
  
  
  static gmx_inline void
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h b/src/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h

index 8174970457d3fcfe9bfe4d6d2d0893b831c9f24f..7d3ff2ab7aaeabad285036343c73a249819b2958 100644 (file)
--- a/src/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
+++ b/src/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
@@ -38,7 +38,7 @@
  
  /* We require SSE2 now! */
  
-#include <math.h> 
+#include <math.h>
  
  #include "gmx_x86_sse2.h"
  
@@ -135,20 +135,20 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
  
  static gmx_inline void
  gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1,
-                                         __m128 * gmx_restrict y1,
-                                         __m128 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1,
+        __m128 * gmx_restrict y1,
+        __m128 * gmx_restrict z1)
  {
      __m128 t1,t2,t3,t4;
-    
+
      t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
      t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
      t3   = _mm_load_ss(xyz_shift+2);
      t4   = _mm_load_ss(xyz+2);
      t1   = _mm_add_ps(t1,t2);
      t3   = _mm_add_ss(t3,t4);
-    
+
      *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
      *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
      *z1  = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
@@ -157,30 +157,30 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
  {
      __m128 tA,tB;
      __m128 t1,t2,t3,t4,t5,t6;
-    
+
      tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
      tB   = _mm_load_ss(xyz_shift+2);
-    
+
      t1   = _mm_loadu_ps(xyz);
      t2   = _mm_loadu_ps(xyz+4);
      t3   = _mm_load_ss(xyz+8);
-    
+
      tA   = _mm_movelh_ps(tA,tB);
      t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
      t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
      t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
      t1   = _mm_add_ps(t1,t4);
      t2   = _mm_add_ps(t2,t5);
      t3   = _mm_add_ss(t3,t6);
-    
+
      *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
      *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
      *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
@@ -195,31 +195,31 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+        __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
  {
      __m128 tA,tB;
      __m128 t1,t2,t3,t4,t5,t6;
-    
+
      tA   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
      tB   = _mm_load_ss(xyz_shift+2);
-    
+
      t1   = _mm_loadu_ps(xyz);
      t2   = _mm_loadu_ps(xyz+4);
      t3   = _mm_loadu_ps(xyz+8);
-    
+
      tA   = _mm_movelh_ps(tA,tB);
      t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
      t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
      t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
      t1   = _mm_add_ps(t1,t4);
      t2   = _mm_add_ps(t2,t5);
      t3   = _mm_add_ps(t3,t6);
-    
+
      *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
      *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
      *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
@@ -270,7 +270,7 @@ gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                    const float * gmx_restrict ptrD,
                                    __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                    __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3) 
+                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
  {
      __m128 t1,t2,t3,t4;
      t1            = _mm_loadu_ps(ptrA);
@@ -309,7 +309,7 @@ gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                    __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                    __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                    __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4) 
+                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
  {
      __m128 t1,t2,t3,t4;
      t1            = _mm_loadu_ps(ptrA);
@@ -380,12 +380,78 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
  
  
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot pass more than 3 __m128 arguments by value. NOTE: unlike the function, this macro overwrites _x1, _z1, _y2 and _x3. */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+__m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_shuffle_ps(_z3,_z3,_MM_SHUFFLE(0,0,0,1));\
+_t18         = _mm_movehl_ps(_z3,_z3);\
+_t19         = _mm_shuffle_ps(_t18,_t18,_MM_SHUFFLE(0,0,0,1));\
+_t20         = _mm_movelh_ps(_x1,_z1);\
+_t21         = _mm_movehl_ps(_z1,_x1);\
+_t22         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t23         = _mm_movelh_ps(_y2,_x3);\
+_t24         = _mm_movehl_ps(_x3,_y2);\
+_t25         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_load_ss(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t20);\
+_t2          = _mm_sub_ps(_t2,_t23);\
+_t3          = _mm_sub_ss(_t3,_z3);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_store_ss(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_load_ss(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_t21);\
+_t5          = _mm_sub_ps(_t5,_t24);\
+_t6          = _mm_sub_ss(_t6,_t17);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_store_ss(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_load_ss(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t22);\
+_t8          = _mm_sub_ps(_t8,_t25);\
+_t9          = _mm_sub_ss(_t9,_t18);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_store_ss(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_load_ss(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ss(_t12,_t19);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
  static void
  gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                         __m128 x1, __m128 y1, __m128 z1,
                                         __m128 x2, __m128 y2, __m128 z2,
-                                       __m128 x3, __m128 y3, __m128 z3) 
+                                       __m128 x3, __m128 y3, __m128 z3)
  {
      __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
      __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
@@ -447,15 +513,87 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4,t11);
      _mm_store_ss(ptrD+8,t12);
  }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot pass more than 3 __m128 arguments by value. NOTE: unlike the function, this macro overwrites _x1, _z1, _y2, _x3, _z3 and _y4. */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+__m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+__m128 _t23,_t24;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_unpackhi_ps(_z3,_x4);\
+_z3          = _mm_unpacklo_ps(_z3,_x4);\
+_t18         = _mm_unpackhi_ps(_y4,_z4);\
+_y4          = _mm_unpacklo_ps(_y4,_z4);\
+_t19         = _mm_movelh_ps(_x1,_z1);\
+_z1          = _mm_movehl_ps(_z1,_x1);\
+_t20         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t21         = _mm_movelh_ps(_y2,_x3);\
+_x3          = _mm_movehl_ps(_x3,_y2);\
+_t22         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t23         = _mm_movelh_ps(_z3,_y4);\
+_y4          = _mm_movehl_ps(_y4,_z3);\
+_t24         = _mm_movelh_ps(_t17,_t18);\
+_t18         = _mm_movehl_ps(_t18,_t17);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_loadu_ps(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t19);\
+_t2          = _mm_sub_ps(_t2,_t21);\
+_t3          = _mm_sub_ps(_t3,_t23);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_storeu_ps(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_loadu_ps(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_z1);\
+_t5          = _mm_sub_ps(_t5,_x3);\
+_t6          = _mm_sub_ps(_t6,_y4);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_storeu_ps(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_loadu_ps(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t20);\
+_t8          = _mm_sub_ps(_t8,_t22);\
+_t9          = _mm_sub_ps(_t9,_t24);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_storeu_ps(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_loadu_ps(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ps(_t12,_t18);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
  static void
  gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                         __m128 x1, __m128 y1, __m128 z1,
                                         __m128 x2, __m128 y2, __m128 z2,
                                         __m128 x3, __m128 y3, __m128 z3,
-                                       __m128 x4, __m128 y4, __m128 z4) 
+                                       __m128 x4, __m128 y4, __m128 z4)
  {
      __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
      __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
@@ -521,7 +659,7 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4,t11);
      _mm_storeu_ps(ptrD+8,t12);
  }
-
+#endif
  
  
  static gmx_inline void
@@ -550,6 +688,38 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
  }
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot pass more than 3 __m128 arguments by value. NOTE: unlike the function, this macro overwrites its fix1..fiz3 arguments. */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4;\
+\
+    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);\
+    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);\
+    _t2   = _mm_movehl_ps(_mm_setzero_ps(),fiz3);\
+    _t1   = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));\
+    _t3   = _mm_shuffle_ps(_t2,_t2,_MM_SHUFFLE(0,0,0,1));\
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));\
+    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));\
+    fiz3 = _mm_add_ss(_mm_add_ps(fiz3,_t1)  , _mm_add_ps(_t2,_t3));\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+    _t4 = _mm_load_ss(fshiftptr+2);\
+    _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+    _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+    _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+    _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+    _t3 = _mm_shuffle_ps(_t3  ,_t3  ,_MM_SHUFFLE(1,2,0,0));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _mm_store_ss(fshiftptr+2,_t1);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -589,8 +759,39 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_store_ss(fshiftptr+2,t1);
      _mm_storeh_pi((__m64 *)(fshiftptr),t1);
  }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot pass more than 3 __m128 arguments by value. NOTE: unlike the function, this macro overwrites its fix1..fiz4 arguments. */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128 _t1,_t2,_t3,_t4,_t5;\
+    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);\
+    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);\
+    _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);\
+    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));\
+    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));\
+    fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));\
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+    _t5 = _mm_load_ss(fshiftptr+2);\
+    _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+    _t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));\
+    _t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));\
+    _t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));\
+    _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+    _t4 = _mm_shuffle_ps(fiz3,_t4  ,_MM_SHUFFLE(2,0,3,3));\
+    _t1 = _mm_add_ps(_t1,_t2);\
+    _t3 = _mm_add_ps(_t3,_t4);\
+    _t1 = _mm_add_ps(_t1,_t3);\
+    _t5 = _mm_add_ps(_t5,_t1);\
+    _mm_store_ss(fshiftptr+2,_t5);\
+    _mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -631,7 +832,7 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_store_ss(fshiftptr+2,t5);
      _mm_storeh_pi((__m64 *)(fshiftptr),t5);
  }
-
+#endif
  
  
  static void
@@ -658,22 +859,4 @@ gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
  }
  
  
-static void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
-                      __m128 pot2, float * gmx_restrict ptrB,
-                      __m128 pot3, float * gmx_restrict ptrC,
-                      __m128 pot4, float * gmx_restrict ptrD)
-{
-    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
-    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
-    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
-    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
-    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
-    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-    _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
-    _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
-}
-
-
  #endif /* _kernelutil_x86_sse2_single_h_ */
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h b/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h

index e7bb484515c65505b2963a05b71e98c29613664f..f304aa5d222f9cbefedebd244abba4041f7974de 100644 (file)
--- a/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h
+++ b/src/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h
@@ -138,10 +138,10 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
  
  static gmx_inline void
  gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1,
-                                         __m128d * gmx_restrict y1,
-                                         __m128d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1,
+        __m128d * gmx_restrict y1,
+        __m128d * gmx_restrict z1)
  {
      __m128d mem_xy,mem_z,mem_sxy,mem_sz;
  
@@ -161,10 +161,10 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
  {
      __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
  
@@ -199,11 +199,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
-                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+        __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
  {
      __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
  
@@ -247,9 +247,9 @@ static gmx_inline void
  gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
  {
-        *x            = _mm_load_sd(p1);
-     *y            = _mm_load_sd(p1+1);
-     *z            = _mm_load_sd(p1+2);
+    *x            = _mm_load_sd(p1);
+    *y            = _mm_load_sd(p1+1);
+    *z            = _mm_load_sd(p1+2);
  }
  
  static gmx_inline void
@@ -258,15 +258,15 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                    __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
  {
-        *x1            = _mm_load_sd(p1);
-     *y1            = _mm_load_sd(p1+1);
-     *z1            = _mm_load_sd(p1+2);
-        *x2            = _mm_load_sd(p1+3);
-     *y2            = _mm_load_sd(p1+4);
-     *z2            = _mm_load_sd(p1+5);
-        *x3            = _mm_load_sd(p1+6);
-     *y3            = _mm_load_sd(p1+7);
-     *z3            = _mm_load_sd(p1+8);
+    *x1            = _mm_load_sd(p1);
+    *y1            = _mm_load_sd(p1+1);
+    *z1            = _mm_load_sd(p1+2);
+    *x2            = _mm_load_sd(p1+3);
+    *y2            = _mm_load_sd(p1+4);
+    *z2            = _mm_load_sd(p1+5);
+    *x3            = _mm_load_sd(p1+6);
+    *y3            = _mm_load_sd(p1+7);
+    *z3            = _mm_load_sd(p1+8);
  }
  
  static gmx_inline void
@@ -385,7 +385,7 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
  /* Routines to decrement rvec in memory, typically use for j particle force updates */
  static gmx_inline void
  gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy, __m128d z)
+        __m128d xy, __m128d z)
  {
      __m128d t1,t2;
  
@@ -399,77 +399,6 @@ gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
      _mm_store_sd(ptrA+2,t2);
  }
  
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3)
-{
-    __m128d t1,t2;
-    __m128d tA,tB,tC,tD,tE;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_load_sd(ptrA+8);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_sd(tE,z3);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_store_sd(ptrA+8,tE);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3,
-                                         __m128d xy4, __m128d z4)
-{
-    __m128d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_loadu_pd(ptrA+8);
-    tF   = _mm_loadu_pd(ptrA+10);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    t3   = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
-    t4   = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_pd(tE,t3);
-    tF   = _mm_sub_pd(tF,t4);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_storeu_pd(ptrA+8,tE);
-    _mm_storeu_pd(ptrA+10,tF);
-}
  
  static gmx_inline void
  gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
@@ -490,6 +419,33 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
  }
  
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot pass more than 3 __m128d arguments by value. NOTE: unlike the function, this macro overwrites _x1, _z1, _y2 and _x3. */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_t1          = _mm_sub_pd(_t1,_x1);\
+_t2          = _mm_sub_pd(_t2,_z1);\
+_t3          = _mm_sub_pd(_t3,_y2);\
+_t4          = _mm_sub_pd(_t4,_x3);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
@@ -521,8 +477,35 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+6,t4);
      _mm_store_sd(ptrA+8,t5);
  }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around: 32-bit MSVC cannot pass more than 3 __m128d arguments by value. NOTE: unlike the function, this macro overwrites _x1, _z1, _y2, _x3, _z3 and _y4. */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_z3          = _mm_unpacklo_pd(_z3,_x4);\
+_y4          = _mm_unpacklo_pd(_y4,_z4);\
+_mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
+_mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
+_mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
+_mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
+_mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
+_mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
@@ -553,6 +536,8 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5,z3 ));
      _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
  }
+#endif
+
  
  static gmx_inline void
  gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
@@ -581,6 +566,54 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_store_sd(ptrB+2,t4);
  }
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrB);\
+_t7          = _mm_loadu_pd(ptrB+2);\
+_t8          = _mm_loadu_pd(ptrB+4);\
+_t9          = _mm_loadu_pd(ptrB+6);\
+_t10         = _mm_load_sd(ptrB+8);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpackhi_pd(_z3,_z3);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_t6          = _mm_sub_pd(_t6,_tB);\
+_t7          = _mm_sub_pd(_t7,_tD);\
+_t8          = _mm_sub_pd(_t8,_tF);\
+_t9          = _mm_sub_pd(_t9,_tH);\
+_t10         = _mm_sub_sd(_t10,_tI);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrB,_t6);\
+_mm_storeu_pd(ptrB+2,_t7);\
+_mm_storeu_pd(ptrB+4,_t8);\
+_mm_storeu_pd(ptrB+6,_t9);\
+_mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
@@ -634,8 +667,66 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_storeu_pd(ptrB+6,t9);
      _mm_store_sd(ptrB+8,t10);
  }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_t7          = _mm_loadu_pd(ptrB);\
+_t8          = _mm_loadu_pd(ptrB+2);\
+_t9          = _mm_loadu_pd(ptrB+4);\
+_t10         = _mm_loadu_pd(ptrB+6);\
+_t11         = _mm_loadu_pd(ptrB+8);\
+_t12         = _mm_loadu_pd(ptrB+10);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpacklo_pd(_z3,_x4);\
+_tJ          = _mm_unpackhi_pd(_z3,_x4);\
+_tK          = _mm_unpacklo_pd(_y4,_z4);\
+_tL          = _mm_unpackhi_pd(_y4,_z4);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_pd(_t5,_tI);\
+_t6          = _mm_sub_pd(_t6,_tK);\
+_t7          = _mm_sub_pd(_t7,_tB);\
+_t8          = _mm_sub_pd(_t8,_tD);\
+_t9          = _mm_sub_pd(_t9,_tF);\
+_t10         = _mm_sub_pd(_t10,_tH);\
+_t11         = _mm_sub_pd(_t11,_tJ);\
+_t12         = _mm_sub_pd(_t12,_tL);\
+_mm_storeu_pd(ptrA,  _t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_storeu_pd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrA+10,_t6);\
+_mm_storeu_pd(ptrB,  _t7);\
+_mm_storeu_pd(ptrB+2,_t8);\
+_mm_storeu_pd(ptrB+4,_t9);\
+_mm_storeu_pd(ptrB+6,_t10);\
+_mm_storeu_pd(ptrB+8,_t11);\
+_mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
@@ -699,7 +790,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_storeu_pd(ptrB+8,t11);
      _mm_storeu_pd(ptrB+10,t12);
  }
-
+#endif
  
  
  
@@ -719,6 +810,34 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
  
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    fix1 = _mm_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm_hadd_pd(fiz3,fiz3);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    _t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    fiz1 = _mm_add_sd(fiz1,_t2);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -751,8 +870,39 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    fix1 = _mm_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm_hadd_pd(fiz3,fix4);\
+    fiy4 = _mm_hadd_pd(fiy4,fiz4);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));\
+    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));\
+    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+    _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+    fix3 = _mm_add_pd(fix3,_t2);\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -790,8 +940,7 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
-
-
+#endif
  
  static gmx_inline void
  gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
diff --git a/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h b/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h

index 761febfada665f306549e3ac3101c3e82df1964f..ef8362c1630fe12a5c73d643e55f8596b0f7f5c5 100644 (file)
--- a/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h
+++ b/src/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h
@@ -36,12 +36,12 @@
  #ifndef _kernelutil_x86_sse4_1_single_h_
  #define _kernelutil_x86_sse4_1_single_h_
  
-#include <math.h> 
+#include <math.h>
  
  #include "gmx_x86_sse4_1.h"
  
  #undef gmx_restrict
-#define gmx_restrict 
+#define gmx_restrict
  
  /* Normal sum of four xmm registers */
  #define gmx_mm_sum4_ps(t0,t1,t2,t3)  _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
@@ -67,7 +67,7 @@ gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                               const float * gmx_restrict ptrD)
  {
      __m128 t1,t2;
-    
+
      t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
      t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
      return _mm_unpacklo_ps(t1,t2);
@@ -81,14 +81,14 @@ gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                                __m128 xmm1)
  {
      __m128 t2,t3,t4;
-    
-    t3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);               
-    t2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));     
-    t4       = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1)); 
-    _mm_store_ss(ptrA,xmm1);                                           
-    _mm_store_ss(ptrB,t2);                                         
-    _mm_store_ss(ptrC,t3);                                         
-    _mm_store_ss(ptrD,t4);                                         
+
+    t3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);
+    t2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+    t4       = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
+    _mm_store_ss(ptrA,xmm1);
+    _mm_store_ss(ptrB,t2);
+    _mm_store_ss(ptrC,t3);
+    _mm_store_ss(ptrD,t4);
  }
  
  /* Similar to store, but increments value in memory */
@@ -99,7 +99,7 @@ gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                    float * gmx_restrict ptrD, __m128 xmm1)
  {
      __m128 tmp;
-    
+
      tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
      tmp = _mm_add_ps(tmp,xmm1);
      gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
@@ -115,7 +115,7 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                               __m128 * gmx_restrict c12)
  {
      __m128 t1,t2,t3,t4;
-    
+
      t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);   /* - - c12a  c6a */
      t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);   /* - - c12b  c6b */
      t3   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);   /* - - c12c  c6c */
@@ -129,20 +129,20 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
  
  static gmx_inline void
  gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1,
-                                         __m128 * gmx_restrict y1,
-                                         __m128 * gmx_restrict z1)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1,
+        __m128 * gmx_restrict y1,
+        __m128 * gmx_restrict z1)
  {
      __m128 t1,t2,t3,t4;
-    
+
      t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
      t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
      t3   = _mm_load_ss(xyz_shift+2);
      t4   = _mm_load_ss(xyz+2);
      t1   = _mm_add_ps(t1,t2);
      t3   = _mm_add_ss(t3,t4);
-    
+
      *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
      *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
      *z1  = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
@@ -151,14 +151,14 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
  {
      __m128 tA,tB;
      __m128 t1,t2,t3,t4,t5,t6;
-    
+
      tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
      tB   = _mm_load_ss(xyz_shift+2);
  
@@ -170,11 +170,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
      t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
      t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
      t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
      t1   = _mm_add_ps(t1,t4);
      t2   = _mm_add_ps(t2,t5);
      t3   = _mm_add_ss(t3,t6);
-    
+
      *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
      *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
      *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
@@ -189,31 +189,31 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  
  static gmx_inline void
  gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
-                                         const float * gmx_restrict xyz,
-                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
-                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+        const float * gmx_restrict xyz,
+        __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+        __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+        __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+        __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
  {
      __m128 tA,tB;
      __m128 t1,t2,t3,t4,t5,t6;
-    
+
      tA   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
      tB   = _mm_load_ss(xyz_shift+2);
-    
+
      t1   = _mm_loadu_ps(xyz);
      t2   = _mm_loadu_ps(xyz+4);
      t3   = _mm_loadu_ps(xyz+8);
-    
+
      tA   = _mm_movelh_ps(tA,tB);
      t4   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
      t5   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
      t6   = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
-    
+
      t1   = _mm_add_ps(t1,t4);
      t2   = _mm_add_ps(t2,t5);
      t3   = _mm_add_ps(t3,t6);
-    
+
      *x1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
      *y1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
      *z1  = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
@@ -264,7 +264,7 @@ gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                    const float * gmx_restrict ptrD,
                                    __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                    __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
-                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3) 
+                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
  {
      __m128 t1,t2,t3,t4;
      t1            = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)ptrA ) );
@@ -303,7 +303,7 @@ gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                    __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                    __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                    __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
-                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4) 
+                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
  {
      __m128 t1,t2,t3,t4;
      t1            = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrA) ) );
@@ -375,12 +375,78 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * ptrA,
  
  
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
+__m128 _t20,_t21,_t22,_t23,_t24,_t25;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_shuffle_ps(_z3,_z3,_MM_SHUFFLE(0,0,0,1));\
+_t18         = _mm_movehl_ps(_z3,_z3);\
+_t19         = _mm_shuffle_ps(_t18,_t18,_MM_SHUFFLE(0,0,0,1));\
+_t20         = _mm_movelh_ps(_x1,_z1);\
+_t21         = _mm_movehl_ps(_z1,_x1);\
+_t22         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t23         = _mm_movelh_ps(_y2,_x3);\
+_t24         = _mm_movehl_ps(_x3,_y2);\
+_t25         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_load_ss(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t20);\
+_t2          = _mm_sub_ps(_t2,_t23);\
+_t3          = _mm_sub_ss(_t3,_z3);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_store_ss(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_load_ss(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_t21);\
+_t5          = _mm_sub_ps(_t5,_t24);\
+_t6          = _mm_sub_ss(_t6,_t17);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_store_ss(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_load_ss(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t22);\
+_t8          = _mm_sub_ps(_t8,_t25);\
+_t9          = _mm_sub_ss(_t9,_t18);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_store_ss(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_load_ss(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ss(_t12,_t19);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_store_ss(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                         __m128 x1, __m128 y1, __m128 z1,
                                         __m128 x2, __m128 y2, __m128 z2,
-                                       __m128 x3, __m128 y3, __m128 z3) 
+                                       __m128 x3, __m128 y3, __m128 z3)
  {
      __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
      __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
@@ -417,7 +483,7 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      t10         = _mm_loadu_ps(ptrD);
      t11         = _mm_loadu_ps(ptrD+4);
      t12         = _mm_load_ss(ptrD+8);
-    
+
      t1          = _mm_sub_ps(t1,t20);
      t2          = _mm_sub_ps(t2,t23);
      t3          = _mm_sub_ss(t3,z3);
@@ -443,15 +509,86 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4,t11);
      _mm_store_ss(ptrD+8,t12);
  }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
+_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
+__m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
+__m128 _t23,_t24;\
+_t13         = _mm_unpackhi_ps(_x1,_y1);\
+_x1          = _mm_unpacklo_ps(_x1,_y1);\
+_t14         = _mm_unpackhi_ps(_z1,_x2);\
+_z1          = _mm_unpacklo_ps(_z1,_x2);\
+_t15         = _mm_unpackhi_ps(_y2,_z2);\
+_y2          = _mm_unpacklo_ps(_y2,_z2);\
+_t16         = _mm_unpackhi_ps(_x3,_y3);\
+_x3          = _mm_unpacklo_ps(_x3,_y3);\
+_t17         = _mm_unpackhi_ps(_z3,_x4);\
+_z3          = _mm_unpacklo_ps(_z3,_x4);\
+_t18         = _mm_unpackhi_ps(_y4,_z4);\
+_y4          = _mm_unpacklo_ps(_y4,_z4);\
+_t19         = _mm_movelh_ps(_x1,_z1);\
+_z1          = _mm_movehl_ps(_z1,_x1);\
+_t20         = _mm_movelh_ps(_t13,_t14);\
+_t14         = _mm_movehl_ps(_t14,_t13);\
+_t21         = _mm_movelh_ps(_y2,_x3);\
+_x3          = _mm_movehl_ps(_x3,_y2);\
+_t22         = _mm_movelh_ps(_t15,_t16);\
+_t16         = _mm_movehl_ps(_t16,_t15);\
+_t23         = _mm_movelh_ps(_z3,_y4);\
+_y4          = _mm_movehl_ps(_y4,_z3);\
+_t24         = _mm_movelh_ps(_t17,_t18);\
+_t18         = _mm_movehl_ps(_t18,_t17);\
+_t1          = _mm_loadu_ps(ptrA);\
+_t2          = _mm_loadu_ps(ptrA+4);\
+_t3          = _mm_loadu_ps(ptrA+8);\
+_t1          = _mm_sub_ps(_t1,_t19);\
+_t2          = _mm_sub_ps(_t2,_t21);\
+_t3          = _mm_sub_ps(_t3,_t23);\
+_mm_storeu_ps(ptrA,_t1);\
+_mm_storeu_ps(ptrA+4,_t2);\
+_mm_storeu_ps(ptrA+8,_t3);\
+_t4          = _mm_loadu_ps(ptrB);\
+_t5          = _mm_loadu_ps(ptrB+4);\
+_t6          = _mm_loadu_ps(ptrB+8);\
+_t4          = _mm_sub_ps(_t4,_z1);\
+_t5          = _mm_sub_ps(_t5,_x3);\
+_t6          = _mm_sub_ps(_t6,_y4);\
+_mm_storeu_ps(ptrB,_t4);\
+_mm_storeu_ps(ptrB+4,_t5);\
+_mm_storeu_ps(ptrB+8,_t6);\
+_t7          = _mm_loadu_ps(ptrC);\
+_t8          = _mm_loadu_ps(ptrC+4);\
+_t9          = _mm_loadu_ps(ptrC+8);\
+_t7          = _mm_sub_ps(_t7,_t20);\
+_t8          = _mm_sub_ps(_t8,_t22);\
+_t9          = _mm_sub_ps(_t9,_t24);\
+_mm_storeu_ps(ptrC,_t7);\
+_mm_storeu_ps(ptrC+4,_t8);\
+_mm_storeu_ps(ptrC+8,_t9);\
+_t10         = _mm_loadu_ps(ptrD);\
+_t11         = _mm_loadu_ps(ptrD+4);\
+_t12         = _mm_loadu_ps(ptrD+8);\
+_t10         = _mm_sub_ps(_t10,_t14);\
+_t11         = _mm_sub_ps(_t11,_t16);\
+_t12         = _mm_sub_ps(_t12,_t18);\
+_mm_storeu_ps(ptrD,_t10);\
+_mm_storeu_ps(ptrD+4,_t11);\
+_mm_storeu_ps(ptrD+8,_t12);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                         __m128 x1, __m128 y1, __m128 z1,
                                         __m128 x2, __m128 y2, __m128 z2,
                                         __m128 x3, __m128 y3, __m128 z3,
-                                       __m128 x4, __m128 y4, __m128 z4) 
+                                       __m128 x4, __m128 y4, __m128 z4)
  {
      __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
      __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
@@ -517,7 +654,7 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4,t11);
      _mm_storeu_ps(ptrD+8,t12);
  }
-
+#endif
  
  
  static gmx_inline void
@@ -525,27 +662,59 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        float * gmx_restrict fptr,
                                        float * gmx_restrict fshiftptr)
  {
-       __m128 t2,t3;
-       
+    __m128 t2,t3;
+
      fix1 = _mm_hadd_ps(fix1,fix1);
-       fiy1 = _mm_hadd_ps(fiy1,fiz1);
-       
-       fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
-    
-       t2 = _mm_load_ss(fptr);
-       t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
-       t3 = _mm_load_ss(fshiftptr);
-       t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
-       
-       t2 = _mm_add_ps(t2,fix1);
-       t3 = _mm_add_ps(t3,fix1);
-       
-       _mm_store_ss(fptr,t2);
-       _mm_storeh_pi((__m64 *)(fptr+1),t2);
-       _mm_store_ss(fshiftptr,t3);
-       _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
+    fiy1 = _mm_hadd_ps(fiy1,fiz1);
+
+    fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
+
+    t2 = _mm_load_ss(fptr);
+    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
+    t3 = _mm_load_ss(fshiftptr);
+    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
+
+    t2 = _mm_add_ps(t2,fix1);
+    t3 = _mm_add_ps(t3,fix1);
+
+    _mm_store_ss(fptr,t2);
+    _mm_storeh_pi((__m64 *)(fptr+1),t2);
+    _mm_store_ss(fshiftptr,t3);
+    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
  }
  
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+fptr,fshiftptr) \
+{\
+__m128 _t1,_t2,_t3,_t4;\
+\
+fix1 = _mm_hadd_ps(fix1,fiy1);\
+fiz1 = _mm_hadd_ps(fiz1,fix2);\
+fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+fix3 = _mm_hadd_ps(fix3,fiy3);\
+fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+fix1 = _mm_hadd_ps(fix1,fiz1);\
+fiy2 = _mm_hadd_ps(fiy2,fix3);\
+fiz3 = _mm_hadd_ps(fiz3,fiz3);\
+_mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+_mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+_mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
+_t4 = _mm_load_ss(fshiftptr+2);\
+_t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
+_t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
+_t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
+_t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
+_t3 = _mm_shuffle_ps(_t3,_t3,_MM_SHUFFLE(1,2,0,0));\
+_t1 = _mm_add_ps(_t1,_t2);\
+_t3 = _mm_add_ps(_t3,_t4);\
+_t1 = _mm_add_ps(_t1,_t3);\
+_mm_store_ss(fshiftptr+2,_t1);\
+_mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -553,39 +722,74 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        float * gmx_restrict fptr,
                                        float * gmx_restrict fshiftptr)
  {
-       __m128 t1,t2,t3,t4;
-       
-       fix1 = _mm_hadd_ps(fix1,fiy1);
-       fiz1 = _mm_hadd_ps(fiz1,fix2);
-       fiy2 = _mm_hadd_ps(fiy2,fiz2);
-       fix3 = _mm_hadd_ps(fix3,fiy3);
-       fiz3 = _mm_hadd_ps(fiz3,fiz3);
-       
-       fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
-       fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
-       fiz3 = _mm_hadd_ps(fiz3,fiz3); /*  -    -    -   fiz3 */
-    
-       _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
-       _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
-       _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
-       
-       t4 = _mm_load_ss(fshiftptr+2);
-       t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
-       
-       t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
-       t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
-       t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
-       t3 = _mm_shuffle_ps(t3  ,t3  ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2  -   fiz2 */
-    
-       t1 = _mm_add_ps(t1,t2);
-       t3 = _mm_add_ps(t3,t4);
-       t1 = _mm_add_ps(t1,t3); /* y x - z */
-       
-       _mm_store_ss(fshiftptr+2,t1);
-       _mm_storeh_pi((__m64 *)(fshiftptr),t1);
-}
+    __m128 t1,t2,t3,t4;
+
+    fix1 = _mm_hadd_ps(fix1,fiy1);
+    fiz1 = _mm_hadd_ps(fiz1,fix2);
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);
+    fix3 = _mm_hadd_ps(fix3,fiy3);
+    fiz3 = _mm_hadd_ps(fiz3,fiz3);
+
+    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
+    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
+    fiz3 = _mm_hadd_ps(fiz3,fiz3); /*  -    -    -   fiz3 */
+
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
+
+    t4 = _mm_load_ss(fshiftptr+2);
+    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
+
+    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
+    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
+    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
+    t3 = _mm_shuffle_ps(t3  ,t3  ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2  -   fiz2 */
  
+    t1 = _mm_add_ps(t1,t2);
+    t3 = _mm_add_ps(t3,t4);
+    t1 = _mm_add_ps(t1,t3); /* y x - z */
  
+    _mm_store_ss(fshiftptr+2,t1);
+    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
+}
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+fptr,fshiftptr) \
+{\
+__m128 _t1,_t2,_t3,_t4,_t5;\
+\
+fix1 = _mm_hadd_ps(fix1,fiy1);\
+fiz1 = _mm_hadd_ps(fiz1,fix2);\
+fiy2 = _mm_hadd_ps(fiy2,fiz2);\
+fix3 = _mm_hadd_ps(fix3,fiy3);\
+fiz3 = _mm_hadd_ps(fiz3,fix4);\
+fiy4 = _mm_hadd_ps(fiy4,fiz4);\
+fix1 = _mm_hadd_ps(fix1,fiz1);\
+fiy2 = _mm_hadd_ps(fiy2,fix3);\
+fiz3 = _mm_hadd_ps(fiz3,fiy4);\
+_mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
+_mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
+_mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
+_t5 = _mm_load_ss(fshiftptr+2);\
+_t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
+_t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));\
+_t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));\
+_t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));\
+_t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
+_t4 = _mm_shuffle_ps(fiz3,_t4  ,_MM_SHUFFLE(2,0,3,3));\
+_t1 = _mm_add_ps(_t1,_t2);\
+_t3 = _mm_add_ps(_t3,_t4);\
+_t1 = _mm_add_ps(_t1,_t3);\
+_t5 = _mm_add_ps(_t5,_t1);\
+_mm_store_ss(fshiftptr+2,_t5);\
+_mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
+}
+#else
+/* Real function for sane compilers */
  static gmx_inline void
  gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
@@ -594,41 +798,41 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        float * gmx_restrict fptr,
                                        float * gmx_restrict fshiftptr)
  {
-       __m128 t1,t2,t3,t4,t5;
-       
-       fix1 = _mm_hadd_ps(fix1,fiy1);
-       fiz1 = _mm_hadd_ps(fiz1,fix2);
-       fiy2 = _mm_hadd_ps(fiy2,fiz2);
-       fix3 = _mm_hadd_ps(fix3,fiy3);
-       fiz3 = _mm_hadd_ps(fiz3,fix4);
-       fiy4 = _mm_hadd_ps(fiy4,fiz4);
-       
-       fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
-       fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
-       fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
-    
-       _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
-       _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
-       _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
-       
-       t5 = _mm_load_ss(fshiftptr+2);
-       t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
-       
-       t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
-       t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
-       t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
-       t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
-       t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));
-       
-       t1 = _mm_add_ps(t1,t2);
-       t3 = _mm_add_ps(t3,t4);
-       t1 = _mm_add_ps(t1,t3);
-       t5 = _mm_add_ps(t5,t1);
-       
-       _mm_store_ss(fshiftptr+2,t5);
-       _mm_storeh_pi((__m64 *)(fshiftptr),t5);
+    __m128 t1,t2,t3,t4,t5;
+
+    fix1 = _mm_hadd_ps(fix1,fiy1);
+    fiz1 = _mm_hadd_ps(fiz1,fix2);
+    fiy2 = _mm_hadd_ps(fiy2,fiz2);
+    fix3 = _mm_hadd_ps(fix3,fiy3);
+    fiz3 = _mm_hadd_ps(fiz3,fix4);
+    fiy4 = _mm_hadd_ps(fiy4,fiz4);
+
+    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
+    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
+    fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
+
+    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
+    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
+    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
+
+    t5 = _mm_load_ss(fshiftptr+2);
+    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
+
+    t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
+    t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
+    t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
+    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
+    t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));
+
+    t1 = _mm_add_ps(t1,t2);
+    t3 = _mm_add_ps(t3,t4);
+    t1 = _mm_add_ps(t1,t3);
+    t5 = _mm_add_ps(t5,t1);
+
+    _mm_store_ss(fshiftptr+2,t5);
+    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
  }
-
+#endif
  
  
  static gmx_inline void
@@ -643,33 +847,15 @@ static gmx_inline void
  gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                        __m128 pot2, float * gmx_restrict ptrB)
  {
-       __m128 t1,t2;
-       t1   = _mm_movehl_ps(pot2,pot1); 
-       t2   = _mm_movelh_ps(pot1,pot2); 
-       t1   = _mm_add_ps(t1,t2);       
-       t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
-       pot1 = _mm_add_ps(t1,t2);       
-       pot2 = _mm_movehl_ps(t2,pot1);
-       _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-       _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-}
-
-
-static gmx_inline void
-gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
-                      __m128 pot2, float * gmx_restrict ptrB,
-                      __m128 pot3, float * gmx_restrict ptrC,
-                      __m128 pot4, float * gmx_restrict ptrD)
-{
-    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
-    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
-    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
-    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
-    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
-       _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
-       _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
-       _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
-       _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
+    __m128 t1,t2;
+    t1   = _mm_movehl_ps(pot2,pot1);
+    t2   = _mm_movelh_ps(pot1,pot2);
+    t1   = _mm_add_ps(t1,t2);
+    t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
+    pot1 = _mm_add_ps(t1,t2);
+    pot2 = _mm_movehl_ps(t2,pot1);
+    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
+    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
  }
  
  
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h

index 63a2e6a95eca23d52fcbb66d32b701cdb67e6717..a8e068699952102704e734250a68343b9603dd6d 100644 (file)
--- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
@@ -118,7 +118,7 @@
      in0 = _mm256_hadd_ps(in0,_mm256_setzero_ps());                      \
      in2 = _mm256_hadd_ps(in2,_mm256_setzero_ps());                      \
      in0 = _mm256_hadd_ps(in0,in2);                                      \
-    in2 = _mm256_permute_ps(in0,0b10110001);                            \
+    in2 = _mm256_permute_ps(in0,_MM_SHUFFLE(2,3,0,1));                  \
      out = _mm_add_ps(_mm256_castps256_ps128(in0),_mm256_extractf128_ps(in2,1)); \
  }
  #else
author	Erik Lindahl <erik@kth.se>
	Fri, 28 Dec 2012 18:40:53 +0000 (19:40 +0100)
committer	Erik Lindahl <erik@kth.se>
	Thu, 10 Jan 2013 07:50:03 +0000 (08:50 +0100)
CMakeLists.txt		patch \| blob \| history
include/gmx_math_x86_avx_128_fma_double.h		patch \| blob \| history
include/gmx_x86_avx_128_fma.h		patch \| blob \| history
src/gmxlib/copyrite.c		patch \| blob \| history
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h		patch \| blob \| history
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h		patch \| blob \| history
src/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h		patch \| blob \| history
src/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h		patch \| blob \| history
src/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h		patch \| blob \| history
src/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h		patch \| blob \| history
src/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h		patch \| blob \| history
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h		patch \| blob \| history
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h		patch \| blob \| history