Sort all includes in src/gromacs
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h
index c841834edd43e5c415c8af48f5cc50727bea0cdf..dba59bbf0e948b19edd68776a319007681bfb816 100644
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -42,17 +42,34 @@
 #ifndef _kernelutil_x86_avx_128_fma_double_h_
 #define _kernelutil_x86_avx_128_fma_double_h_
 
-#include "gromacs/simd/general_x86_avx_128_fma.h"
+#include <math.h>
 
+#include <immintrin.h>
+#ifdef _MSC_VER
+#    include <intrin.h>
+#else
+#    include <x86intrin.h>
+#endif
+
+#include "config.h"
+
+#define gmx_mm_castsi128_pd   _mm_castsi128_pd
+#define gmx_mm_extract_epi32  _mm_extract_epi32
+
+#define GMX_MM_TRANSPOSE2_PD(row0, row1) {           \
+        __m128d __gmx_t1 = row0;                         \
+        row0           = _mm_unpacklo_pd(row0, row1);     \
+        row1           = _mm_unpackhi_pd(__gmx_t1, row1); \
+}
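
For reference, the transpose macro above swaps the cross lanes of two __m128d rows: {a0, a1}, {b0, b1} becomes {a0, b0}, {a1, b1}. A minimal standalone sketch of the same shuffle, assuming only SSE2 (the test harness is hypothetical, not part of GROMACS):

#include <stdio.h>
#include <emmintrin.h>

/* Same unpack pattern as GMX_MM_TRANSPOSE2_PD, written out inline */
int main(void)
{
    __m128d row0 = _mm_set_pd(2.0, 1.0);        /* lanes {1.0, 2.0} */
    __m128d row1 = _mm_set_pd(4.0, 3.0);        /* lanes {3.0, 4.0} */
    __m128d t    = row0;
    row0 = _mm_unpacklo_pd(row0, row1);         /* {1.0, 3.0} */
    row1 = _mm_unpackhi_pd(t,    row1);         /* {2.0, 4.0} */
    double out[4];
    _mm_storeu_pd(out,     row0);
    _mm_storeu_pd(out + 2, row1);
    printf("%g %g | %g %g\n", out[0], out[1], out[2], out[3]);   /* 1 3 | 2 4 */
    return 0;
}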
 
-static int
+static gmx_inline int gmx_simdcall
 gmx_mm_any_lt(__m128d a, __m128d b)
 {
     return _mm_movemask_pd(_mm_cmplt_pd(a, b));
 }
 
 
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
 gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
 {
     return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
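
gmx_mm_calc_rsq_pd() fuses dx*dx + dy*dy + dz*dz using the FMA4 form _mm_macc_pd(a, b, c) = a*b + c, and gmx_mm_any_lt() above reports whether any lane passes a less-than comparison. A plain-SSE2 sketch of how a kernel can combine the two for a cutoff test (the helper names and cutoff parameter are hypothetical):

#include <emmintrin.h>

/* r^2 = dx*dx + dy*dy + dz*dz for two particle pairs at once (no FMA) */
static __m128d calc_rsq_sse2(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_add_pd(_mm_mul_pd(dx, dx),
                      _mm_add_pd(_mm_mul_pd(dy, dy), _mm_mul_pd(dz, dz)));
}

/* Nonzero if any lane satisfies rsq < rcut2, like gmx_mm_any_lt() */
static int any_within_cutoff(__m128d rsq, double rcut2)
{
    return _mm_movemask_pd(_mm_cmplt_pd(rsq, _mm_set1_pd(rcut2)));
}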
@@ -59,21 +76,21 @@ gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
 /* Load a double value from 1-2 places, merge into xmm register */
 
 
-static __m128d
+static gmx_inline __m128d gmx_simdcall
 gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                              const double * gmx_restrict ptrB)
 {
     return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
 }
 
-static __m128d
+static gmx_inline __m128d gmx_simdcall
 gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
 {
     return _mm_load_sd(ptrA);
 }
 
 
-static void
+static gmx_inline void gmx_simdcall
 gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                               double * gmx_restrict ptrB,
                               __m128d               xmm1)
@@ -85,7 +102,7 @@ gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
     _mm_store_sd(ptrB, t2);
 }
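
The load/store swizzle pair gathers one double from each of two unrelated addresses into the two lanes of one register, and scatters the lanes back on store. A standalone SSE2 sketch of that round trip (helper names hypothetical):

#include <emmintrin.h>

/* Gather: lane 0 from *pA, lane 1 from *pB, as in gmx_mm_load_2real_swizzle_pd() */
static __m128d load2(const double *pA, const double *pB)
{
    return _mm_unpacklo_pd(_mm_load_sd(pA), _mm_load_sd(pB));
}

/* Scatter: lane 0 to *pA, lane 1 to *pB, as in gmx_mm_store_2real_swizzle_pd() */
static void store2(double *pA, double *pB, __m128d v)
{
    _mm_store_sd(pA, v);
    _mm_store_sd(pB, _mm_unpackhi_pd(v, v));
}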
 
-static void
+static gmx_inline void gmx_simdcall
 gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
 {
     _mm_store_sd(ptrA, xmm1);
@@ -93,7 +110,7 @@ gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
 
 
 /* Similar to store, but increments value in memory */
-static void
+static gmx_inline void gmx_simdcall
 gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                   double * gmx_restrict ptrB, __m128d xmm1)
 {
@@ -106,7 +123,7 @@ gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
     _mm_store_sd(ptrB, t1);
 }
 
-static void
+static gmx_inline void gmx_simdcall
 gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
 {
     __m128d tmp;
@@ -118,7 +135,7 @@ gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
 
 
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                              const double * gmx_restrict p2,
                              __m128d * gmx_restrict      c6,
@@ -133,7 +150,7 @@ gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
     *c12 = _mm_unpackhi_pd(t1, t2);
 }
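
gmx_mm_load_2pair_swizzle_pd() expects each pointer to address a {c6, c12} Lennard-Jones parameter record and transposes two such records into one register per parameter. A sketch under that layout assumption:

#include <emmintrin.h>

/* Transpose two {c6, c12} records into per-parameter registers */
static void load_2pair(const double *pairA, const double *pairB,
                       __m128d *c6, __m128d *c12)
{
    __m128d t1 = _mm_loadu_pd(pairA);      /* {c6_A, c12_A} */
    __m128d t2 = _mm_loadu_pd(pairB);      /* {c6_B, c12_B} */
    *c6  = _mm_unpacklo_pd(t1, t2);        /* {c6_A,  c6_B}  */
    *c12 = _mm_unpackhi_pd(t1, t2);        /* {c12_A, c12_B} */
}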
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                              __m128d * gmx_restrict      c6,
                              __m128d * gmx_restrict      c12)
@@ -143,7 +160,7 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
 }
 
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                          const double * gmx_restrict xyz,
                                          __m128d * gmx_restrict      x1,
@@ -166,7 +183,7 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 }
 
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                          const double * gmx_restrict xyz,
                                          __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
@@ -204,7 +221,7 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 }
 
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                          const double * gmx_restrict xyz,
                                          __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
@@ -249,7 +266,7 @@ gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
 
 
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
 {
@@ -258,7 +275,7 @@ gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
     *z            = _mm_load_sd(p1+2);
 }
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -275,7 +292,7 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
     *z3            = _mm_load_sd(p1+8);
 }
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                   __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -297,7 +314,7 @@ gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
 }
 
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                   const double * gmx_restrict ptrB,
                                   __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
@@ -313,7 +330,7 @@ gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
     *z1          = _mm_unpacklo_pd(t3, t4);
 }
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                   __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -346,7 +363,7 @@ gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
 }
 
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                   __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                   __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -388,7 +405,7 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
 
 
 /* Routines to decrement rvec in memory, typically used for j particle force updates */
-static void
+static gmx_inline void gmx_simdcall
 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1)
 {
@@ -407,34 +424,8 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
 }
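
The body of gmx_mm_decrement_1rvec_1ptr_swizzle_pd() is elided above; the pattern is a read-modify-write that subtracts a force triplet from an rvec in memory, with x/y packed into one register and z handled as a scalar. A sketch in that spirit (not necessarily the exact GROMACS lane shuffling):

#include <emmintrin.h>

/* f[0:2] -= {x, y, z}, using packed x/y and a scalar z */
static void decrement_rvec(double *f, __m128d x, __m128d y, __m128d z)
{
    __m128d xy = _mm_unpacklo_pd(x, y);                    /* {fx, fy} */
    _mm_storeu_pd(f, _mm_sub_pd(_mm_loadu_pd(f), xy));
    _mm_store_sd(f + 2, _mm_sub_sd(_mm_load_sd(f + 2), z));
}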
 
 
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_load_sd(ptrA+8); \
-        _x1          = _mm_unpacklo_pd(_x1, _y1); \
-        _z1          = _mm_unpacklo_pd(_z1, _x2); \
-        _y2          = _mm_unpacklo_pd(_y2, _z2); \
-        _x3          = _mm_unpacklo_pd(_x3, _y3); \
-        _t1          = _mm_sub_pd(_t1, _x1); \
-        _t2          = _mm_sub_pd(_t2, _z1); \
-        _t3          = _mm_sub_pd(_t3, _y2); \
-        _t4          = _mm_sub_pd(_t4, _x3); \
-        _t5          = _mm_sub_sd(_t5, _z3); \
-        _mm_storeu_pd(ptrA, _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_store_sd(ptrA+8, _t5); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
@@ -465,36 +456,9 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+6, t4);
     _mm_store_sd(ptrA+8, t5);
 }
-#endif
 
 
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_loadu_pd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrA+10); \
-        _x1          = _mm_unpacklo_pd(_x1, _y1); \
-        _z1          = _mm_unpacklo_pd(_z1, _x2); \
-        _y2          = _mm_unpacklo_pd(_y2, _z2); \
-        _x3          = _mm_unpacklo_pd(_x3, _y3); \
-        _z3          = _mm_unpacklo_pd(_z3, _x4); \
-        _y4          = _mm_unpacklo_pd(_y4, _z4); \
-        _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1, _x1 )); \
-        _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2, _z1 )); \
-        _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3, _y2 )); \
-        _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4, _x3 )); \
-        _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5, _z3 )); \
-        _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
@@ -524,10 +488,9 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
     _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
     _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
 }
-#endif
 
 
-static void
+static gmx_inline void gmx_simdcall
 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1)
 {
@@ -555,55 +518,8 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
 }
 
 
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
-        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_load_sd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrB); \
-        _t7          = _mm_loadu_pd(ptrB+2); \
-        _t8          = _mm_loadu_pd(ptrB+4); \
-        _t9          = _mm_loadu_pd(ptrB+6); \
-        _t10         = _mm_load_sd(ptrB+8); \
-        _tA          = _mm_unpacklo_pd(_x1, _y1); \
-        _tB          = _mm_unpackhi_pd(_x1, _y1); \
-        _tC          = _mm_unpacklo_pd(_z1, _x2); \
-        _tD          = _mm_unpackhi_pd(_z1, _x2); \
-        _tE          = _mm_unpacklo_pd(_y2, _z2); \
-        _tF          = _mm_unpackhi_pd(_y2, _z2); \
-        _tG          = _mm_unpacklo_pd(_x3, _y3); \
-        _tH          = _mm_unpackhi_pd(_x3, _y3); \
-        _tI          = _mm_unpackhi_pd(_z3, _z3); \
-        _t1          = _mm_sub_pd(_t1, _tA); \
-        _t2          = _mm_sub_pd(_t2, _tC); \
-        _t3          = _mm_sub_pd(_t3, _tE); \
-        _t4          = _mm_sub_pd(_t4, _tG); \
-        _t5          = _mm_sub_sd(_t5, _z3); \
-        _t6          = _mm_sub_pd(_t6, _tB); \
-        _t7          = _mm_sub_pd(_t7, _tD); \
-        _t8          = _mm_sub_pd(_t8, _tF); \
-        _t9          = _mm_sub_pd(_t9, _tH); \
-        _t10         = _mm_sub_sd(_t10, _tI); \
-        _mm_storeu_pd(ptrA, _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_store_sd(ptrA+8, _t5); \
-        _mm_storeu_pd(ptrB, _t6); \
-        _mm_storeu_pd(ptrB+2, _t7); \
-        _mm_storeu_pd(ptrB+4, _t8); \
-        _mm_storeu_pd(ptrB+6, _t9); \
-        _mm_store_sd(ptrB+8, _t10); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
@@ -656,67 +572,9 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+6, t9);
     _mm_store_sd(ptrB+8, t10);
 }
-#endif
 
 
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
-        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_loadu_pd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrA+10); \
-        _t7          = _mm_loadu_pd(ptrB); \
-        _t8          = _mm_loadu_pd(ptrB+2); \
-        _t9          = _mm_loadu_pd(ptrB+4); \
-        _t10         = _mm_loadu_pd(ptrB+6); \
-        _t11         = _mm_loadu_pd(ptrB+8); \
-        _t12         = _mm_loadu_pd(ptrB+10); \
-        _tA          = _mm_unpacklo_pd(_x1, _y1); \
-        _tB          = _mm_unpackhi_pd(_x1, _y1); \
-        _tC          = _mm_unpacklo_pd(_z1, _x2); \
-        _tD          = _mm_unpackhi_pd(_z1, _x2); \
-        _tE          = _mm_unpacklo_pd(_y2, _z2); \
-        _tF          = _mm_unpackhi_pd(_y2, _z2); \
-        _tG          = _mm_unpacklo_pd(_x3, _y3); \
-        _tH          = _mm_unpackhi_pd(_x3, _y3); \
-        _tI          = _mm_unpacklo_pd(_z3, _x4); \
-        _tJ          = _mm_unpackhi_pd(_z3, _x4); \
-        _tK          = _mm_unpacklo_pd(_y4, _z4); \
-        _tL          = _mm_unpackhi_pd(_y4, _z4); \
-        _t1          = _mm_sub_pd(_t1, _tA); \
-        _t2          = _mm_sub_pd(_t2, _tC); \
-        _t3          = _mm_sub_pd(_t3, _tE); \
-        _t4          = _mm_sub_pd(_t4, _tG); \
-        _t5          = _mm_sub_pd(_t5, _tI); \
-        _t6          = _mm_sub_pd(_t6, _tK); \
-        _t7          = _mm_sub_pd(_t7, _tB); \
-        _t8          = _mm_sub_pd(_t8, _tD); \
-        _t9          = _mm_sub_pd(_t9, _tF); \
-        _t10         = _mm_sub_pd(_t10, _tH); \
-        _t11         = _mm_sub_pd(_t11, _tJ); \
-        _t12         = _mm_sub_pd(_t12, _tL); \
-        _mm_storeu_pd(ptrA,  _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_storeu_pd(ptrA+8, _t5); \
-        _mm_storeu_pd(ptrA+10, _t6); \
-        _mm_storeu_pd(ptrB,  _t7); \
-        _mm_storeu_pd(ptrB+2, _t8); \
-        _mm_storeu_pd(ptrB+4, _t9); \
-        _mm_storeu_pd(ptrB+6, _t10); \
-        _mm_storeu_pd(ptrB+8, _t11); \
-        _mm_storeu_pd(ptrB+10, _t12); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                        __m128d x1, __m128d y1, __m128d z1,
                                        __m128d x2, __m128d y2, __m128d z2,
@@ -779,10 +637,9 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
     _mm_storeu_pd(ptrB+8, t11);
     _mm_storeu_pd(ptrB+10, t12);
 }
-#endif
 
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       double * gmx_restrict fptr,
                                       double * gmx_restrict fshiftptr)
@@ -797,34 +654,8 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
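
Only the tail of gmx_mm_update_iforce_1atom_swizzle_pd() is visible above; the routine reduces the two j-lane partial forces to one i-atom force with horizontal adds, then accumulates the result into both the force and the shift-force arrays. A sketch of that reduction (SSE3 _mm_hadd_pd; the exact GROMACS body may differ):

#include <pmmintrin.h>          /* SSE3 for _mm_hadd_pd */

static void update_iforce_1atom(__m128d fix, __m128d fiy, __m128d fiz,
                                double *fptr, double *fshiftptr)
{
    __m128d fxy = _mm_hadd_pd(fix, fiy);    /* {sum(fix), sum(fiy)} */
    fiz         = _mm_hadd_pd(fiz, fiz);    /* {sum(fiz), sum(fiz)} */

    /* f[i] += reduced force */
    _mm_storeu_pd(fptr,    _mm_add_pd(_mm_loadu_pd(fptr), fxy));
    _mm_store_sd(fptr + 2, _mm_add_sd(_mm_load_sd(fptr + 2), fiz));

    /* fshift[] gets the same contribution, as in the visible tail above */
    _mm_storeu_pd(fshiftptr,    _mm_add_pd(_mm_loadu_pd(fshiftptr), fxy));
    _mm_store_sd(fshiftptr + 2, _mm_add_sd(_mm_load_sd(fshiftptr + 2), fiz));
}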
 
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128d _t1, _t2; \
-        fix1 = _mm_hadd_pd(fix1, fiy1); \
-        fiz1 = _mm_hadd_pd(fiz1, fix2); \
-        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
-        fix3 = _mm_hadd_pd(fix3, fiy3); \
-        fiz3 = _mm_hadd_pd(fiz3, fiz3); \
-        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
-        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
-        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
-        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
-        _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
-        fix1  = _mm_add_pd(fix1, fix3); \
-        _t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
-        fix1  = _mm_add_pd(fix1, _t1); \
-        _t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
-        fiz1  = _mm_add_sd(fiz1, fiz3); \
-        fiz1  = _mm_add_sd(fiz1, _t2); \
-        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
-        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
 gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
                                       __m128d fix3, __m128d fiy3, __m128d fiz3,
@@ -856,40 +687,9 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-#endif
 
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128d _t1, _t2; \
-        fix1 = _mm_hadd_pd(fix1, fiy1); \
-        fiz1 = _mm_hadd_pd(fiz1, fix2); \
-        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
-        fix3 = _mm_hadd_pd(fix3, fiy3); \
-        fiz3 = _mm_hadd_pd(fiz3, fix4); \
-        fiy4 = _mm_hadd_pd(fiy4, fiz4); \
-        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 )); \
-        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 )); \
-        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 )); \
-        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 )); \
-        _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 )); \
-        _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
-        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
-        fix1 = _mm_add_pd(fix1, _t1); \
-        _t2  = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
-        fix3 = _mm_add_pd(fix3, _t2); \
-        fix1 = _mm_add_pd(fix1, fix3); \
-        fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
-        fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
-        fiz1 = _mm_add_sd(fiz1, fiz3); \
-        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
-        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
 gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                       __m128d fix2, __m128d fiy2, __m128d fiz2,
                                       __m128d fix3, __m128d fiy3, __m128d fiz3,
@@ -926,17 +726,16 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
     _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
     _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
 }
-#endif
 
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
 {
     pot1 = _mm_hadd_pd(pot1, pot1);
     _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
 }
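
gmx_mm_update_1pot_pd() applies the same horizontal-add reduction to an energy term, accumulating both lanes into one scalar in memory. A standalone usage sketch (sample values arbitrary):

#include <stdio.h>
#include <pmmintrin.h>

int main(void)
{
    double  vvdw = 0.0;
    __m128d vsum = _mm_set_pd(0.25, 0.75);   /* per-lane partial energies */
    /* hadd then accumulate, exactly the pattern of gmx_mm_update_1pot_pd() */
    vsum = _mm_hadd_pd(vsum, vsum);
    _mm_store_sd(&vvdw, _mm_add_sd(vsum, _mm_load_sd(&vvdw)));
    printf("%g\n", vvdw);                    /* prints 1 */
    return 0;
}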
 
-static gmx_inline void
+static gmx_inline void gmx_simdcall
 gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                       __m128d pot2, double * gmx_restrict ptrB)
 {