Fixes SSE/AVX compilation under Windows
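
With the default x86 calling convention, 32-bit MSVC can pass at most three __m128/__m128d arguments by value; a fourth 16-byte-aligned parameter would have to go on the stack and the compiler rejects it (commonly error C2719). The kernel helpers below take up to twelve __m128d arguments, so under _MSC_VER + _M_IX86 each such helper becomes a macro that expands in place, while every other compiler keeps the original inline function. The now-unneeded 3- and 4-rvec noswizzle decrement helpers are dropped and the remaining signatures are re-indented. A minimal sketch of the pattern, using illustrative names that are not part of this patch:

    #include <emmintrin.h>

    #if defined(_MSC_VER) && defined(_M_IX86)
    /* 32-bit MSVC: expand in place, no by-value __m128d parameters */
    #define example_add4_pd(dst, a, b, c, d)                          \
    {                                                                 \
        _mm_storeu_pd((dst), _mm_add_pd(_mm_add_pd((a), (b)),         \
                                        _mm_add_pd((c), (d))));       \
    }
    #else
    /* Everyone else: keep a real inline function */
    static inline void
    example_add4_pd(double *dst, __m128d a, __m128d b, __m128d c, __m128d d)
    {
        _mm_storeu_pd(dst, _mm_add_pd(_mm_add_pd(a, b), _mm_add_pd(c, d)));
    }
    #endif

Because the macro arguments are plain expressions, the temporaries in the macro bodies carry a leading underscore (_t1, _tA, ...), presumably to avoid clashing with names at the expansion site.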
alexxy/gromacs.git: src/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h
index e7bb484515c65505b2963a05b71e98c29613664f..f304aa5d222f9cbefedebd244abba4041f7974de 100644
@@ -138,10 +138,10 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
 
 static gmx_inline void
 gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1,
-                                         __m128d * gmx_restrict y1,
-                                         __m128d * gmx_restrict z1)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1,
+        __m128d * gmx_restrict y1,
+        __m128d * gmx_restrict z1)
 {
     __m128d mem_xy,mem_z,mem_sxy,mem_sz;
 
@@ -161,10 +161,10 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
     __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;
 
@@ -199,11 +199,11 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 static gmx_inline void
 gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
-                                         const double * gmx_restrict xyz,
-                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
-                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
-                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
+        const double * gmx_restrict xyz,
+        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
+        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
+        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
+        __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
 {
     __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
 
@@ -247,9 +247,9 @@ static gmx_inline void
 gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
 {
-        *x            = _mm_load_sd(p1);
-     *y            = _mm_load_sd(p1+1);
-     *z            = _mm_load_sd(p1+2);
+    *x            = _mm_load_sd(p1);
+    *y            = _mm_load_sd(p1+1);
+    *z            = _mm_load_sd(p1+2);
 }
 
 static gmx_inline void
@@ -258,15 +258,15 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                   __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
 {
-        *x1            = _mm_load_sd(p1);
-     *y1            = _mm_load_sd(p1+1);
-     *z1            = _mm_load_sd(p1+2);
-        *x2            = _mm_load_sd(p1+3);
-     *y2            = _mm_load_sd(p1+4);
-     *z2            = _mm_load_sd(p1+5);
-        *x3            = _mm_load_sd(p1+6);
-     *y3            = _mm_load_sd(p1+7);
-     *z3            = _mm_load_sd(p1+8);
+    *x1            = _mm_load_sd(p1);
+    *y1            = _mm_load_sd(p1+1);
+    *z1            = _mm_load_sd(p1+2);
+    *x2            = _mm_load_sd(p1+3);
+    *y2            = _mm_load_sd(p1+4);
+    *z2            = _mm_load_sd(p1+5);
+    *x3            = _mm_load_sd(p1+6);
+    *y3            = _mm_load_sd(p1+7);
+    *z3            = _mm_load_sd(p1+8);
 }
 
 static gmx_inline void
@@ -385,7 +385,7 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
 /* Routines to decrement rvec in memory, typically used for j particle force updates */
 static gmx_inline void
 gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy, __m128d z)
+        __m128d xy, __m128d z)
 {
     __m128d t1,t2;
 
@@ -399,77 +399,6 @@ gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
     _mm_store_sd(ptrA+2,t2);
 }
 
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3)
-{
-    __m128d t1,t2;
-    __m128d tA,tB,tC,tD,tE;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_load_sd(ptrA+8);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,1)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_sd(tE,z3);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_store_sd(ptrA+8,tE);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
-                                         __m128d xy1, __m128d z1,
-                                         __m128d xy2, __m128d z2,
-                                         __m128d xy3, __m128d z3,
-                                         __m128d xy4, __m128d z4)
-{
-    __m128d t1,t2,t3,t4;
-    __m128d tA,tB,tC,tD,tE,tF;
-
-    tA   = _mm_loadu_pd(ptrA);
-    tB   = _mm_loadu_pd(ptrA+2);
-    tC   = _mm_loadu_pd(ptrA+4);
-    tD   = _mm_loadu_pd(ptrA+6);
-    tE   = _mm_loadu_pd(ptrA+8);
-    tF   = _mm_loadu_pd(ptrA+10);
-
-    /* xy1: y1 x1 */
-    t1   = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 */
-    t2   = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */
-    /* xy3: y3 x3 */
-    t3   = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */
-    t4   = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */
-
-    tA   = _mm_sub_pd(tA,xy1);
-    tB   = _mm_sub_pd(tB,t1);
-    tC   = _mm_sub_pd(tC,t2);
-    tD   = _mm_sub_pd(tD,xy3);
-    tE   = _mm_sub_pd(tE,t3);
-    tF   = _mm_sub_pd(tF,t4);
-
-    _mm_storeu_pd(ptrA,tA);
-    _mm_storeu_pd(ptrA+2,tB);
-    _mm_storeu_pd(ptrA+4,tC);
-    _mm_storeu_pd(ptrA+6,tD);
-    _mm_storeu_pd(ptrA+8,tE);
-    _mm_storeu_pd(ptrA+10,tF);
-}
 
 static gmx_inline void
 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
@@ -490,6 +419,33 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 }
 
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_t1          = _mm_sub_pd(_t1,_x1);\
+_t2          = _mm_sub_pd(_t2,_z1);\
+_t3          = _mm_sub_pd(_t3,_y2);\
+_t4          = _mm_sub_pd(_t4,_x3);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -521,8 +477,35 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+6,t4);
     _mm_store_sd(ptrA+8,t5);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_x1          = _mm_unpacklo_pd(_x1,_y1);\
+_z1          = _mm_unpacklo_pd(_z1,_x2);\
+_y2          = _mm_unpacklo_pd(_y2,_z2);\
+_x3          = _mm_unpacklo_pd(_x3,_y3);\
+_z3          = _mm_unpacklo_pd(_z3,_x4);\
+_y4          = _mm_unpacklo_pd(_y4,_z4);\
+_mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
+_mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
+_mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
+_mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
+_mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
+_mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -553,6 +536,8 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5,z3 ));
     _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
 }
+#endif
+
 
 static gmx_inline void
 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
@@ -581,6 +566,54 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_store_sd(ptrB+2,t4);
 }
 
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_load_sd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrB);\
+_t7          = _mm_loadu_pd(ptrB+2);\
+_t8          = _mm_loadu_pd(ptrB+4);\
+_t9          = _mm_loadu_pd(ptrB+6);\
+_t10         = _mm_load_sd(ptrB+8);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpackhi_pd(_z3,_z3);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_sd(_t5,_z3);\
+_t6          = _mm_sub_pd(_t6,_tB);\
+_t7          = _mm_sub_pd(_t7,_tD);\
+_t8          = _mm_sub_pd(_t8,_tF);\
+_t9          = _mm_sub_pd(_t9,_tH);\
+_t10         = _mm_sub_sd(_t10,_tI);\
+_mm_storeu_pd(ptrA,_t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_store_sd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrB,_t6);\
+_mm_storeu_pd(ptrB+2,_t7);\
+_mm_storeu_pd(ptrB+4,_t8);\
+_mm_storeu_pd(ptrB+6,_t9);\
+_mm_store_sd(ptrB+8,_t10);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -634,8 +667,66 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+6,t9);
     _mm_store_sd(ptrB+8,t10);
 }
-
-
+#endif
+
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
+{\
+__m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
+__m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
+_t1          = _mm_loadu_pd(ptrA);\
+_t2          = _mm_loadu_pd(ptrA+2);\
+_t3          = _mm_loadu_pd(ptrA+4);\
+_t4          = _mm_loadu_pd(ptrA+6);\
+_t5          = _mm_loadu_pd(ptrA+8);\
+_t6          = _mm_loadu_pd(ptrA+10);\
+_t7          = _mm_loadu_pd(ptrB);\
+_t8          = _mm_loadu_pd(ptrB+2);\
+_t9          = _mm_loadu_pd(ptrB+4);\
+_t10         = _mm_loadu_pd(ptrB+6);\
+_t11         = _mm_loadu_pd(ptrB+8);\
+_t12         = _mm_loadu_pd(ptrB+10);\
+_tA          = _mm_unpacklo_pd(_x1,_y1);\
+_tB          = _mm_unpackhi_pd(_x1,_y1);\
+_tC          = _mm_unpacklo_pd(_z1,_x2);\
+_tD          = _mm_unpackhi_pd(_z1,_x2);\
+_tE          = _mm_unpacklo_pd(_y2,_z2);\
+_tF          = _mm_unpackhi_pd(_y2,_z2);\
+_tG          = _mm_unpacklo_pd(_x3,_y3);\
+_tH          = _mm_unpackhi_pd(_x3,_y3);\
+_tI          = _mm_unpacklo_pd(_z3,_x4);\
+_tJ          = _mm_unpackhi_pd(_z3,_x4);\
+_tK          = _mm_unpacklo_pd(_y4,_z4);\
+_tL          = _mm_unpackhi_pd(_y4,_z4);\
+_t1          = _mm_sub_pd(_t1,_tA);\
+_t2          = _mm_sub_pd(_t2,_tC);\
+_t3          = _mm_sub_pd(_t3,_tE);\
+_t4          = _mm_sub_pd(_t4,_tG);\
+_t5          = _mm_sub_pd(_t5,_tI);\
+_t6          = _mm_sub_pd(_t6,_tK);\
+_t7          = _mm_sub_pd(_t7,_tB);\
+_t8          = _mm_sub_pd(_t8,_tD);\
+_t9          = _mm_sub_pd(_t9,_tF);\
+_t10         = _mm_sub_pd(_t10,_tH);\
+_t11         = _mm_sub_pd(_t11,_tJ);\
+_t12         = _mm_sub_pd(_t12,_tL);\
+_mm_storeu_pd(ptrA,  _t1);\
+_mm_storeu_pd(ptrA+2,_t2);\
+_mm_storeu_pd(ptrA+4,_t3);\
+_mm_storeu_pd(ptrA+6,_t4);\
+_mm_storeu_pd(ptrA+8,_t5);\
+_mm_storeu_pd(ptrA+10,_t6);\
+_mm_storeu_pd(ptrB,  _t7);\
+_mm_storeu_pd(ptrB+2,_t8);\
+_mm_storeu_pd(ptrB+4,_t9);\
+_mm_storeu_pd(ptrB+6,_t10);\
+_mm_storeu_pd(ptrB+8,_t11);\
+_mm_storeu_pd(ptrB+10,_t12);\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
@@ -699,7 +790,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+8,t11);
     _mm_storeu_pd(ptrB+10,t12);
 }
-
+#endif
 
 
 
@@ -719,6 +810,34 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
 
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    fix1 = _mm_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm_hadd_pd(fiz3,fiz3);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
+    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    _t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    fiz1 = _mm_add_sd(fiz1,_t2);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -751,8 +870,39 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
-
+#endif
+
+#if defined (_MSC_VER) && defined(_M_IX86)
+/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
+#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
+                                              fptr,fshiftptr) \
+{\
+    __m128d _t1,_t2;\
+    fix1 = _mm_hadd_pd(fix1,fiy1);\
+    fiz1 = _mm_hadd_pd(fiz1,fix2);\
+    fiy2 = _mm_hadd_pd(fiy2,fiz2);\
+    fix3 = _mm_hadd_pd(fix3,fiy3);\
+    fiz3 = _mm_hadd_pd(fiz3,fix4);\
+    fiy4 = _mm_hadd_pd(fiy4,fiz4);\
+    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));\
+    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));\
+    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));\
+    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));\
+    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));\
+    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
+    _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
+    fix1 = _mm_add_pd(fix1,_t1);\
+    _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
+    fix3 = _mm_add_pd(fix3,_t2);\
+    fix1 = _mm_add_pd(fix1,fix3);\
+    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
+    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
+    fiz1 = _mm_add_sd(fiz1,fiz3);\
+    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
+    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
+}
+#else
+/* Real function for sane compilers */
 static gmx_inline void
 gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
@@ -790,8 +940,7 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-
-
+#endif
 
 static gmx_inline void
 gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)