Enable SIMD register calling convention with gmx_simdcall

author Erik Lindahl <erik@kth.se>

Mon, 28 Jul 2014 18:33:42 +0000 (20:33 +0200)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Wed, 20 Aug 2014 16:20:57 +0000 (18:20 +0200)
author Erik Lindahl <erik@kth.se>
Mon, 28 Jul 2014 18:33:42 +0000 (20:33 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Wed, 20 Aug 2014 16:20:57 +0000 (18:20 +0200)
diff --git a/cmake/gmxTestSimd.cmake b/cmake/gmxTestSimd.cmake

index f99a43f6de8303caa41c2f5da0f4b108cdceedc9..49ea326b12b88a1a24e4ab648f75712aac0b8e20 100644 (file)
--- a/cmake/gmxTestSimd.cmake
+++ b/cmake/gmxTestSimd.cmake
@@ -293,5 +293,24 @@ if (SIMD_CHANGED AND DEFINED SIMD_STATUS_MESSAGE)
      message(STATUS "${SIMD_STATUS_MESSAGE}")
  endif()
  
+# By default, 32-bit windows cannot pass SIMD (SSE/AVX) arguments in registers,
+# and even on 64-bit (all platforms) it is only used for a handful of arguments.
+# The __vectorcall (MSVC, from MSVC2013) or __regcall (ICC) calling conventions
+# enable this, which is critical to enable 32-bit SIMD and improves performance
+# for 64-bit SIMD.
+# Check if the compiler supports one of these, and in that case set gmx_simdcall
+# to that string. If we do not have any such calling convention modifier, set it
+# to an empty string.
+if(NOT DEFINED GMX_SIMD_CALLING_CONVENTION)
+    foreach(callconv __vectorcall __regcall "")
+        set(callconv_compile_var "_callconv_${callconv}")
+        check_c_source_compiles("int ${callconv} f(int i) {return i;} int main(void) {return f(0);}" ${callconv_compile_var})
+        if(${callconv_compile_var})
+            set(GMX_SIMD_CALLING_CONVENTION "${callconv}" CACHE INTERNAL "Calling convention for SIMD routines" FORCE)
+            break()
+        endif()
+    endforeach()
+endif()
+
  endmacro()
  
diff --git a/src/config.h.cmakein b/src/config.h.cmakein

index 897dae65df052fb935ebf1db01222fcae3deca94..4aa43dc6121e4166fc097fa00ef82fec439c3acf 100644 (file)
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -123,6 +123,9 @@
  /* String for SIMD instruction choice (for writing to log files and stdout) */
  #define GMX_SIMD_STRING "@GMX_SIMD@"
  
+/* Calling convention string (if any) for routines with SIMD variable args */
+#define gmx_simdcall @GMX_SIMD_CALLING_CONVENTION@
+
  /* Integer byte order is big endian. */
  #cmakedefine GMX_INTEGER_BIG_ENDIAN
  
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h

index d6772ac8192682847ae77bbca4e94eec30341ab8..bc4b3a6e78dc0da93d43348925527e013853cd73 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h
@@ -52,14 +52,14 @@
          row1           = _mm_unpackhi_pd(__gmx_t1, row1); \
  }
  
-static int
+static gmx_inline int gmx_simdcall
  gmx_mm_any_lt(__m128d a, __m128d b)
  {
      return _mm_movemask_pd(_mm_cmplt_pd(a, b));
  }
  
  
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
  {
      return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
@@ -73,21 +73,21 @@ gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
  /* Load a double value from 1-2 places, merge into xmm register */
  
  
-static __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                               const double * gmx_restrict ptrB)
  {
      return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
  }
  
-static __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
  {
      return _mm_load_sd(ptrA);
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                                double * gmx_restrict ptrB,
                                __m128d               xmm1)
@@ -99,7 +99,7 @@ gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
      _mm_store_sd(ptrB, t2);
  }
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  {
      _mm_store_sd(ptrA, xmm1);
@@ -107,7 +107,7 @@ gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  
  
  /* Similar to store, but increments value in memory */
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                    double * gmx_restrict ptrB, __m128d xmm1)
  {
@@ -120,7 +120,7 @@ gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
      _mm_store_sd(ptrB, t1);
  }
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  {
      __m128d tmp;
@@ -132,7 +132,7 @@ gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                               const double * gmx_restrict p2,
                               __m128d * gmx_restrict      c6,
@@ -147,7 +147,7 @@ gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
      *c12 = _mm_unpackhi_pd(t1, t2);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                               __m128d * gmx_restrict      c6,
                               __m128d * gmx_restrict      c12)
@@ -157,7 +157,7 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                           const double * gmx_restrict xyz,
                                           __m128d * gmx_restrict      x1,
@@ -180,7 +180,7 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                           const double * gmx_restrict xyz,
                                           __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
@@ -218,7 +218,7 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                           const double * gmx_restrict xyz,
                                           __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
@@ -263,7 +263,7 @@ gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
  {
@@ -272,7 +272,7 @@ gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
      *z            = _mm_load_sd(p1+2);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -289,7 +289,7 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
      *z3            = _mm_load_sd(p1+8);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -311,7 +311,7 @@ gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                    const double * gmx_restrict ptrB,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
@@ -327,7 +327,7 @@ gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
      *z1          = _mm_unpacklo_pd(t3, t4);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -360,7 +360,7 @@ gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -402,7 +402,7 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
  
  
  /* Routines to decrement rvec in memory, typically use for j particle force updates */
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1)
  {
@@ -421,34 +421,8 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
  }
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_load_sd(ptrA+8); \
-        _x1          = _mm_unpacklo_pd(_x1, _y1); \
-        _z1          = _mm_unpacklo_pd(_z1, _x2); \
-        _y2          = _mm_unpacklo_pd(_y2, _z2); \
-        _x3          = _mm_unpacklo_pd(_x3, _y3); \
-        _t1          = _mm_sub_pd(_t1, _x1); \
-        _t2          = _mm_sub_pd(_t2, _z1); \
-        _t3          = _mm_sub_pd(_t3, _y2); \
-        _t4          = _mm_sub_pd(_t4, _x3); \
-        _t5          = _mm_sub_sd(_t5, _z3); \
-        _mm_storeu_pd(ptrA, _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_store_sd(ptrA+8, _t5); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -479,36 +453,9 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+6, t4);
      _mm_store_sd(ptrA+8, t5);
  }
-#endif
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_loadu_pd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrA+10); \
-        _x1          = _mm_unpacklo_pd(_x1, _y1); \
-        _z1          = _mm_unpacklo_pd(_z1, _x2); \
-        _y2          = _mm_unpacklo_pd(_y2, _z2); \
-        _x3          = _mm_unpacklo_pd(_x3, _y3); \
-        _z3          = _mm_unpacklo_pd(_z3, _x4); \
-        _y4          = _mm_unpacklo_pd(_y4, _z4); \
-        _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1, _x1 )); \
-        _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2, _z1 )); \
-        _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3, _y2 )); \
-        _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4, _x3 )); \
-        _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5, _z3 )); \
-        _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -538,10 +485,9 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
      _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
  }
-#endif
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1)
  {
@@ -569,55 +515,8 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
  }
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
-        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_load_sd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrB); \
-        _t7          = _mm_loadu_pd(ptrB+2); \
-        _t8          = _mm_loadu_pd(ptrB+4); \
-        _t9          = _mm_loadu_pd(ptrB+6); \
-        _t10         = _mm_load_sd(ptrB+8); \
-        _tA          = _mm_unpacklo_pd(_x1, _y1); \
-        _tB          = _mm_unpackhi_pd(_x1, _y1); \
-        _tC          = _mm_unpacklo_pd(_z1, _x2); \
-        _tD          = _mm_unpackhi_pd(_z1, _x2); \
-        _tE          = _mm_unpacklo_pd(_y2, _z2); \
-        _tF          = _mm_unpackhi_pd(_y2, _z2); \
-        _tG          = _mm_unpacklo_pd(_x3, _y3); \
-        _tH          = _mm_unpackhi_pd(_x3, _y3); \
-        _tI          = _mm_unpackhi_pd(_z3, _z3); \
-        _t1          = _mm_sub_pd(_t1, _tA); \
-        _t2          = _mm_sub_pd(_t2, _tC); \
-        _t3          = _mm_sub_pd(_t3, _tE); \
-        _t4          = _mm_sub_pd(_t4, _tG); \
-        _t5          = _mm_sub_sd(_t5, _z3); \
-        _t6          = _mm_sub_pd(_t6, _tB); \
-        _t7          = _mm_sub_pd(_t7, _tD); \
-        _t8          = _mm_sub_pd(_t8, _tF); \
-        _t9          = _mm_sub_pd(_t9, _tH); \
-        _t10         = _mm_sub_sd(_t10, _tI); \
-        _mm_storeu_pd(ptrA, _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_store_sd(ptrA+8, _t5); \
-        _mm_storeu_pd(ptrB, _t6); \
-        _mm_storeu_pd(ptrB+2, _t7); \
-        _mm_storeu_pd(ptrB+4, _t8); \
-        _mm_storeu_pd(ptrB+6, _t9); \
-        _mm_store_sd(ptrB+8, _t10); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -670,67 +569,9 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_storeu_pd(ptrB+6, t9);
      _mm_store_sd(ptrB+8, t10);
  }
-#endif
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
-        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_loadu_pd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrA+10); \
-        _t7          = _mm_loadu_pd(ptrB); \
-        _t8          = _mm_loadu_pd(ptrB+2); \
-        _t9          = _mm_loadu_pd(ptrB+4); \
-        _t10         = _mm_loadu_pd(ptrB+6); \
-        _t11         = _mm_loadu_pd(ptrB+8); \
-        _t12         = _mm_loadu_pd(ptrB+10); \
-        _tA          = _mm_unpacklo_pd(_x1, _y1); \
-        _tB          = _mm_unpackhi_pd(_x1, _y1); \
-        _tC          = _mm_unpacklo_pd(_z1, _x2); \
-        _tD          = _mm_unpackhi_pd(_z1, _x2); \
-        _tE          = _mm_unpacklo_pd(_y2, _z2); \
-        _tF          = _mm_unpackhi_pd(_y2, _z2); \
-        _tG          = _mm_unpacklo_pd(_x3, _y3); \
-        _tH          = _mm_unpackhi_pd(_x3, _y3); \
-        _tI          = _mm_unpacklo_pd(_z3, _x4); \
-        _tJ          = _mm_unpackhi_pd(_z3, _x4); \
-        _tK          = _mm_unpacklo_pd(_y4, _z4); \
-        _tL          = _mm_unpackhi_pd(_y4, _z4); \
-        _t1          = _mm_sub_pd(_t1, _tA); \
-        _t2          = _mm_sub_pd(_t2, _tC); \
-        _t3          = _mm_sub_pd(_t3, _tE); \
-        _t4          = _mm_sub_pd(_t4, _tG); \
-        _t5          = _mm_sub_pd(_t5, _tI); \
-        _t6          = _mm_sub_pd(_t6, _tK); \
-        _t7          = _mm_sub_pd(_t7, _tB); \
-        _t8          = _mm_sub_pd(_t8, _tD); \
-        _t9          = _mm_sub_pd(_t9, _tF); \
-        _t10         = _mm_sub_pd(_t10, _tH); \
-        _t11         = _mm_sub_pd(_t11, _tJ); \
-        _t12         = _mm_sub_pd(_t12, _tL); \
-        _mm_storeu_pd(ptrA,  _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_storeu_pd(ptrA+8, _t5); \
-        _mm_storeu_pd(ptrA+10, _t6); \
-        _mm_storeu_pd(ptrB,  _t7); \
-        _mm_storeu_pd(ptrB+2, _t8); \
-        _mm_storeu_pd(ptrB+4, _t9); \
-        _mm_storeu_pd(ptrB+6, _t10); \
-        _mm_storeu_pd(ptrB+8, _t11); \
-        _mm_storeu_pd(ptrB+10, _t12); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -796,7 +637,7 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
  #endif
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        double * gmx_restrict fptr,
                                        double * gmx_restrict fshiftptr)
@@ -811,34 +652,8 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128d _t1, _t2; \
-        fix1 = _mm_hadd_pd(fix1, fiy1); \
-        fiz1 = _mm_hadd_pd(fiz1, fix2); \
-        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
-        fix3 = _mm_hadd_pd(fix3, fiy3); \
-        fiz3 = _mm_hadd_pd(fiz3, fiz3); \
-        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
-        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
-        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
-        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
-        _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
-        fix1  = _mm_add_pd(fix1, fix3); \
-        _t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
-        fix1  = _mm_add_pd(fix1, _t1); \
-        _t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
-        fiz1  = _mm_add_sd(fiz1, fiz3); \
-        fiz1  = _mm_add_sd(fiz1, _t2); \
-        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
-        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
                                        __m128d fix3, __m128d fiy3, __m128d fiz3,
@@ -870,40 +685,9 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
-#endif
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128d _t1, _t2; \
-        fix1 = _mm_hadd_pd(fix1, fiy1); \
-        fiz1 = _mm_hadd_pd(fiz1, fix2); \
-        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
-        fix3 = _mm_hadd_pd(fix3, fiy3); \
-        fiz3 = _mm_hadd_pd(fiz3, fix4); \
-        fiy4 = _mm_hadd_pd(fiy4, fiz4); \
-        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 )); \
-        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 )); \
-        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 )); \
-        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 )); \
-        _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 )); \
-        _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
-        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
-        fix1 = _mm_add_pd(fix1, _t1); \
-        _t2  = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
-        fix3 = _mm_add_pd(fix3, _t2); \
-        fix1 = _mm_add_pd(fix1, fix3); \
-        fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
-        fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
-        fiz1 = _mm_add_sd(fiz1, fiz3); \
-        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
-        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
                                        __m128d fix3, __m128d fiy3, __m128d fiz3,
@@ -943,14 +727,14 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
  #endif
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
  {
      pot1 = _mm_hadd_pd(pot1, pot1);
      _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                        __m128d pot2, double * gmx_restrict ptrB)
  {
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h

index b9ef14ef6a41c3d1489912ae6b640362b8bf5870..9af5bb410a96b90af5e454907d9937d46d2e130b 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
@@ -63,13 +63,13 @@
  /* Normal sum of four xmm registers */
  #define gmx_mm_sum4_ps(t0, t1, t2, t3)  _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
  
-static gmx_inline int
+static gmx_inline int gmx_simdcall
  gmx_mm_any_lt(__m128 a, __m128 b)
  {
      return _mm_movemask_ps(_mm_cmplt_ps(a, b));
  }
  
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
  {
      return _mm_macc_ps(dx, dx, _mm_macc_ps(dy, dy, _mm_mul_ps(dz, dz)));
@@ -77,7 +77,7 @@ gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
  
  /* Load a single value from 1-4 places, merge into xmm register */
  
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                               const float * gmx_restrict ptrB,
                               const float * gmx_restrict ptrC,
@@ -91,7 +91,7 @@ gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                                float * gmx_restrict ptrB,
                                float * gmx_restrict ptrC,
@@ -109,7 +109,7 @@ gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                    float * gmx_restrict ptrB,
                                    float * gmx_restrict ptrC,
@@ -123,7 +123,7 @@ gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                               const float * gmx_restrict p2,
                               const float * gmx_restrict p3,
@@ -144,7 +144,7 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                           const float * gmx_restrict xyz,
                                           __m128 * gmx_restrict      x1,
@@ -166,7 +166,7 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                           const float * gmx_restrict xyz,
                                           __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
@@ -204,7 +204,7 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                           const float * gmx_restrict xyz,
                                           __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
@@ -246,7 +246,7 @@ gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                    const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                    __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1)
@@ -264,7 +264,7 @@ gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float *
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                    const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                    __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
@@ -300,7 +300,7 @@ gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float *
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                    const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                    __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
@@ -339,7 +339,7 @@ gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float *
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                         __m128 x1, __m128 y1, __m128 z1)
@@ -374,73 +374,7 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
  }
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
-                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
-        __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
-        __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
-        _t13         = _mm_unpackhi_ps(_x1, _y1); \
-        _x1          = _mm_unpacklo_ps(_x1, _y1); \
-        _t14         = _mm_unpackhi_ps(_z1, _x2); \
-        _z1          = _mm_unpacklo_ps(_z1, _x2); \
-        _t15         = _mm_unpackhi_ps(_y2, _z2); \
-        _y2          = _mm_unpacklo_ps(_y2, _z2); \
-        _t16         = _mm_unpackhi_ps(_x3, _y3); \
-        _x3          = _mm_unpacklo_ps(_x3, _y3); \
-        _t17         = _mm_permute_ps(_z3, _MM_SHUFFLE(0, 0, 0, 1)); \
-        _t18         = _mm_movehl_ps(_z3, _z3); \
-        _t19         = _mm_permute_ps(_t18, _MM_SHUFFLE(0, 0, 0, 1)); \
-        _t20         = _mm_movelh_ps(_x1, _z1); \
-        _t21         = _mm_movehl_ps(_z1, _x1); \
-        _t22         = _mm_movelh_ps(_t13, _t14); \
-        _t14         = _mm_movehl_ps(_t14, _t13); \
-        _t23         = _mm_movelh_ps(_y2, _x3); \
-        _t24         = _mm_movehl_ps(_x3, _y2); \
-        _t25         = _mm_movelh_ps(_t15, _t16); \
-        _t16         = _mm_movehl_ps(_t16, _t15); \
-        _t1          = _mm_loadu_ps(ptrA); \
-        _t2          = _mm_loadu_ps(ptrA+4); \
-        _t3          = _mm_load_ss(ptrA+8); \
-        _t1          = _mm_sub_ps(_t1, _t20); \
-        _t2          = _mm_sub_ps(_t2, _t23); \
-        _t3          = _mm_sub_ss(_t3, _z3); \
-        _mm_storeu_ps(ptrA, _t1); \
-        _mm_storeu_ps(ptrA+4, _t2); \
-        _mm_store_ss(ptrA+8, _t3); \
-        _t4          = _mm_loadu_ps(ptrB); \
-        _t5          = _mm_loadu_ps(ptrB+4); \
-        _t6          = _mm_load_ss(ptrB+8); \
-        _t4          = _mm_sub_ps(_t4, _t21); \
-        _t5          = _mm_sub_ps(_t5, _t24); \
-        _t6          = _mm_sub_ss(_t6, _t17); \
-        _mm_storeu_ps(ptrB, _t4); \
-        _mm_storeu_ps(ptrB+4, _t5); \
-        _mm_store_ss(ptrB+8, _t6); \
-        _t7          = _mm_loadu_ps(ptrC); \
-        _t8          = _mm_loadu_ps(ptrC+4); \
-        _t9          = _mm_load_ss(ptrC+8); \
-        _t7          = _mm_sub_ps(_t7, _t22); \
-        _t8          = _mm_sub_ps(_t8, _t25); \
-        _t9          = _mm_sub_ss(_t9, _t18); \
-        _mm_storeu_ps(ptrC, _t7); \
-        _mm_storeu_ps(ptrC+4, _t8); \
-        _mm_store_ss(ptrC+8, _t9); \
-        _t10         = _mm_loadu_ps(ptrD); \
-        _t11         = _mm_loadu_ps(ptrD+4); \
-        _t12         = _mm_load_ss(ptrD+8); \
-        _t10         = _mm_sub_ps(_t10, _t14); \
-        _t11         = _mm_sub_ps(_t11, _t16); \
-        _t12         = _mm_sub_ss(_t12, _t19); \
-        _mm_storeu_ps(ptrD, _t10); \
-        _mm_storeu_ps(ptrD+4, _t11); \
-        _mm_store_ss(ptrD+8, _t12); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                         __m128 x1, __m128 y1, __m128 z1,
@@ -506,80 +440,9 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4, t11);
      _mm_store_ss(ptrD+8, t12);
  }
-#endif
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
-                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
-        __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
-        __m128 _t23, _t24; \
-        _t13         = _mm_unpackhi_ps(_x1, _y1); \
-        _x1          = _mm_unpacklo_ps(_x1, _y1); \
-        _t14         = _mm_unpackhi_ps(_z1, _x2); \
-        _z1          = _mm_unpacklo_ps(_z1, _x2); \
-        _t15         = _mm_unpackhi_ps(_y2, _z2); \
-        _y2          = _mm_unpacklo_ps(_y2, _z2); \
-        _t16         = _mm_unpackhi_ps(_x3, _y3); \
-        _x3          = _mm_unpacklo_ps(_x3, _y3); \
-        _t17         = _mm_unpackhi_ps(_z3, _x4); \
-        _z3          = _mm_unpacklo_ps(_z3, _x4); \
-        _t18         = _mm_unpackhi_ps(_y4, _z4); \
-        _y4          = _mm_unpacklo_ps(_y4, _z4); \
-        _t19         = _mm_movelh_ps(_x1, _z1); \
-        _z1          = _mm_movehl_ps(_z1, _x1); \
-        _t20         = _mm_movelh_ps(_t13, _t14); \
-        _t14         = _mm_movehl_ps(_t14, _t13); \
-        _t21         = _mm_movelh_ps(_y2, _x3); \
-        _x3          = _mm_movehl_ps(_x3, _y2); \
-        _t22         = _mm_movelh_ps(_t15, _t16); \
-        _t16         = _mm_movehl_ps(_t16, _t15); \
-        _t23         = _mm_movelh_ps(_z3, _y4); \
-        _y4          = _mm_movehl_ps(_y4, _z3); \
-        _t24         = _mm_movelh_ps(_t17, _t18); \
-        _t18         = _mm_movehl_ps(_t18, _t17); \
-        _t1          = _mm_loadu_ps(ptrA); \
-        _t2          = _mm_loadu_ps(ptrA+4); \
-        _t3          = _mm_loadu_ps(ptrA+8); \
-        _t1          = _mm_sub_ps(_t1, _t19); \
-        _t2          = _mm_sub_ps(_t2, _t21); \
-        _t3          = _mm_sub_ps(_t3, _t23); \
-        _mm_storeu_ps(ptrA, _t1); \
-        _mm_storeu_ps(ptrA+4, _t2); \
-        _mm_storeu_ps(ptrA+8, _t3); \
-        _t4          = _mm_loadu_ps(ptrB); \
-        _t5          = _mm_loadu_ps(ptrB+4); \
-        _t6          = _mm_loadu_ps(ptrB+8); \
-        _t4          = _mm_sub_ps(_t4, _z1); \
-        _t5          = _mm_sub_ps(_t5, _x3); \
-        _t6          = _mm_sub_ps(_t6, _y4); \
-        _mm_storeu_ps(ptrB, _t4); \
-        _mm_storeu_ps(ptrB+4, _t5); \
-        _mm_storeu_ps(ptrB+8, _t6); \
-        _t7          = _mm_loadu_ps(ptrC); \
-        _t8          = _mm_loadu_ps(ptrC+4); \
-        _t9          = _mm_loadu_ps(ptrC+8); \
-        _t7          = _mm_sub_ps(_t7, _t20); \
-        _t8          = _mm_sub_ps(_t8, _t22); \
-        _t9          = _mm_sub_ps(_t9, _t24); \
-        _mm_storeu_ps(ptrC, _t7); \
-        _mm_storeu_ps(ptrC+4, _t8); \
-        _mm_storeu_ps(ptrC+8, _t9); \
-        _t10         = _mm_loadu_ps(ptrD); \
-        _t11         = _mm_loadu_ps(ptrD+4); \
-        _t12         = _mm_loadu_ps(ptrD+8); \
-        _t10         = _mm_sub_ps(_t10, _t14); \
-        _t11         = _mm_sub_ps(_t11, _t16); \
-        _t12         = _mm_sub_ps(_t12, _t18); \
-        _mm_storeu_ps(ptrD, _t10); \
-        _mm_storeu_ps(ptrD+4, _t11); \
-        _mm_storeu_ps(ptrD+8, _t12); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                         __m128 x1, __m128 y1, __m128 z1,
@@ -651,9 +514,9 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4, t11);
      _mm_storeu_ps(ptrD+8, t12);
  }
-#endif
  
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        float * gmx_restrict fptr,
                                        float * gmx_restrict fshiftptr)
@@ -679,39 +542,8 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
  }
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128 _t1, _t2, _t3, _t4; \
-\
-        fix1 = _mm_hadd_ps(fix1, fiy1); \
-        fiz1 = _mm_hadd_ps(fiz1, fix2); \
-        fiy2 = _mm_hadd_ps(fiy2, fiz2); \
-        fix3 = _mm_hadd_ps(fix3, fiy3); \
-        fiz3 = _mm_hadd_ps(fiz3, fiz3); \
-        fix1 = _mm_hadd_ps(fix1, fiz1); \
-        fiy2 = _mm_hadd_ps(fiy2, fix3); \
-        fiz3 = _mm_hadd_ps(fiz3, fiz3); \
-        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
-        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
-        _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
-        _t4 = _mm_load_ss(fshiftptr+2); \
-        _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
-        _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
-        _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
-        _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
-        _t3 = _mm_permute_ps(_t3, _MM_SHUFFLE(1, 2, 0, 0)); \
-        _t1 = _mm_add_ps(_t1, _t2); \
-        _t3 = _mm_add_ps(_t3, _t4); \
-        _t1 = _mm_add_ps(_t1, _t3); \
-        _mm_store_ss(fshiftptr+2, _t1); \
-        _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
                                        __m128 fix3, __m128 fiy3, __m128 fiz3,
@@ -749,44 +581,9 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_store_ss(fshiftptr+2, t1);
      _mm_storeh_pi((__m64 *)(fshiftptr), t1);
  }
-#endif
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128 _t1, _t2, _t3, _t4, _t5; \
-\
-        fix1 = _mm_hadd_ps(fix1, fiy1); \
-        fiz1 = _mm_hadd_ps(fiz1, fix2); \
-        fiy2 = _mm_hadd_ps(fiy2, fiz2); \
-        fix3 = _mm_hadd_ps(fix3, fiy3); \
-        fiz3 = _mm_hadd_ps(fiz3, fix4); \
-        fiy4 = _mm_hadd_ps(fiy4, fiz4); \
-        fix1 = _mm_hadd_ps(fix1, fiz1); \
-        fiy2 = _mm_hadd_ps(fiy2, fix3); \
-        fiz3 = _mm_hadd_ps(fiz3, fiy4); \
-        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
-        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
-        _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
-        _t5 = _mm_load_ss(fshiftptr+2); \
-        _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
-        _t1 = _mm_permute_ps(fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
-        _t2 = _mm_permute_ps(fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
-        _t3 = _mm_permute_ps(fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
-        _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
-        _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
-        _t1 = _mm_add_ps(_t1, _t2); \
-        _t3 = _mm_add_ps(_t3, _t4); \
-        _t1 = _mm_add_ps(_t1, _t3); \
-        _t5 = _mm_add_ps(_t5, _t1); \
-        _mm_store_ss(fshiftptr+2, _t5); \
-        _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
                                        __m128 fix3, __m128 fiy3, __m128 fiz3,
@@ -831,7 +628,7 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
  #endif
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
  {
      pot1 = _mm_hadd_ps(pot1, pot1);
@@ -839,7 +636,7 @@ gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
      _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                        __m128 pot2, float * gmx_restrict ptrB)
  {
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h

index 9c9ef1e9084ba6e84194b164bad77902bb383bc7..e77e39c1d3528350630ab1573ba84f6e95e77f9e 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h
@@ -56,37 +56,37 @@
  
  #define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
  
-static __m256d
+static gmx_inline __m256d gmx_simdcall
  gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
  {
      return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
  }
  
-static __m256d
+static gmx_inline __m256d gmx_simdcall
  gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
  {
      return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
  }
  
-static __m256d
+static gmx_inline __m256d gmx_simdcall
  gmx_mm256_set_m128d(__m128d hi, __m128d lo)
  {
      return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
  }
  
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_mm256_set_m128(__m128 hi, __m128 lo)
  {
      return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
  }
  
-static int
+static gmx_inline int gmx_simdcall
  gmx_mm256_any_lt(__m256d a, __m256d b)
  {
      return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
  }
  
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
  gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
  {
      return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
@@ -97,13 +97,13 @@ gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
  
  
  /* Load a single value from 1-4 places, merge into xmm register */
-static __m256d
+static gmx_inline __m256d gmx_simdcall
  gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
  {
      return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
  }
  
-static __m256d
+static gmx_inline __m256d gmx_simdcall
  gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB)
  {
@@ -116,7 +116,7 @@ gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
  }
  
  
-static __m256d
+static gmx_inline __m256d gmx_simdcall
  gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
  {
@@ -129,14 +129,14 @@ gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double *
  
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
  {
      _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
  {
      __m256d t2;
@@ -149,7 +149,7 @@ gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restri
  
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                   double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
  {
@@ -168,7 +168,7 @@ gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restri
  
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
  {
      __m128d t1;
@@ -180,7 +180,7 @@ gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
  {
      __m128d t1, t2;
@@ -196,7 +196,7 @@ gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_re
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
  {
@@ -220,7 +220,7 @@ gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_re
  
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
  {
      *c6     = _mm256_castpd128_pd256(_mm_load_sd(p1));
@@ -228,7 +228,7 @@ gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
  {
      __m128d t1, t2, t3;
@@ -241,7 +241,7 @@ gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * g
  
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
                                  const double * gmx_restrict p3, const double * gmx_restrict p4,
                                  __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
@@ -256,7 +256,7 @@ gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * g
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                              const double * gmx_restrict xyz,
                                              __m256d * gmx_restrict      x1,
@@ -283,7 +283,7 @@ gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shif
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                              const double * gmx_restrict xyz,
                                              __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
@@ -330,7 +330,7 @@ gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shif
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                              const double * gmx_restrict xyz,
                                              __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
@@ -386,7 +386,7 @@ gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shif
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                       __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
  {
@@ -399,7 +399,7 @@ gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                       __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                       __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
@@ -422,7 +422,7 @@ gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
      *z3           = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
  }
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                       __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                       __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
@@ -455,7 +455,7 @@ gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                       const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                       __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
@@ -479,7 +479,7 @@ gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const dou
  
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                       const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                       __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
@@ -527,7 +527,7 @@ gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const dou
  
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                       const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                       __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
@@ -581,7 +581,7 @@ gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const dou
  
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                            double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                            __m256d x1, __m256d y1, __m256d z1)
@@ -619,70 +619,8 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
  
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
-                                                  _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
-        __m128d _tA, _tB, _tC, _tD, _tE; \
-        _t1          = _mm256_loadu_pd(ptrA); \
-        _t2          = _mm256_loadu_pd(ptrB); \
-        _t3          = _mm256_loadu_pd(ptrC); \
-        _t4          = _mm256_loadu_pd(ptrD); \
-        _t5          = _mm256_loadu_pd(ptrA+4); \
-        _t6          = _mm256_loadu_pd(ptrB+4); \
-        _t7          = _mm256_loadu_pd(ptrC+4); \
-        _t8          = _mm256_loadu_pd(ptrD+4); \
-        _tA          = _mm_load_sd(ptrA+8); \
-        _tB          = _mm_load_sd(ptrB+8); \
-        _tC          = _mm_load_sd(ptrC+8); \
-        _tD          = _mm_load_sd(ptrD+8); \
-        _t9          = _mm256_unpacklo_pd(_x1, _y1); \
-        _x1          = _mm256_unpackhi_pd(_x1, _y1); \
-        _y1          = _mm256_unpacklo_pd(_z1, _x2); \
-        _z1          = _mm256_unpackhi_pd(_z1, _x2); \
-        _x2          = _mm256_unpacklo_pd(_y2, _z2); \
-        _y2          = _mm256_unpackhi_pd(_y2, _z2); \
-        _z2          = _mm256_unpacklo_pd(_x3, _y3); \
-        _x3          = _mm256_unpackhi_pd(_x3, _y3); \
-        _t10         = gmx_mm256_unpack128lo_pd(_t9, _y1); \
-        _y3          = gmx_mm256_unpack128hi_pd(_t9, _y1); \
-        _t9          = gmx_mm256_unpack128lo_pd(_x1, _z1); \
-        _y1          = gmx_mm256_unpack128hi_pd(_x1, _z1); \
-        _x1          = gmx_mm256_unpack128lo_pd(_x2, _z2); \
-        _z1          = gmx_mm256_unpack128hi_pd(_x2, _z2); \
-        _x2          = gmx_mm256_unpack128lo_pd(_y2, _x3); \
-        _z2          = gmx_mm256_unpack128hi_pd(_y2, _x3); \
-        _t1          = _mm256_sub_pd(_t1, _t10); \
-        _t2          = _mm256_sub_pd(_t2, _t9); \
-        _t3          = _mm256_sub_pd(_t3, _y3); \
-        _t4          = _mm256_sub_pd(_t4, _y1); \
-        _t5          = _mm256_sub_pd(_t5, _x1); \
-        _t6          = _mm256_sub_pd(_t6, _x2); \
-        _t7          = _mm256_sub_pd(_t7, _z1); \
-        _t8          = _mm256_sub_pd(_t8, _z2); \
-        _tA          = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3)); \
-        _tB          = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3), _GMX_MM_PERMUTE128D(1, 1))); \
-        _tE          = _mm256_extractf128_pd(_z3, 0x1); \
-        _tC          = _mm_sub_sd(_tC, _tE); \
-        _tD          = _mm_sub_sd(_tD, _mm_permute_pd(_tE, _GMX_MM_PERMUTE128D(1, 1))); \
-        _mm256_storeu_pd(ptrA, _t1); \
-        _mm256_storeu_pd(ptrB, _t2); \
-        _mm256_storeu_pd(ptrC, _t3); \
-        _mm256_storeu_pd(ptrD, _t4); \
-        _mm256_storeu_pd(ptrA+4, _t5); \
-        _mm256_storeu_pd(ptrB+4, _t6); \
-        _mm256_storeu_pd(ptrC+4, _t7); \
-        _mm256_storeu_pd(ptrD+4, _t8); \
-        _mm_store_sd(ptrA+8, _tA); \
-        _mm_store_sd(ptrB+8, _tB); \
-        _mm_store_sd(ptrC+8, _tC); \
-        _mm_store_sd(ptrD+8, _tD); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
  gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                            double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                            __m256d x1, __m256d y1, __m256d z1,
@@ -758,79 +696,9 @@ gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
      _mm_store_sd(ptrC+8, tC);
      _mm_store_sd(ptrD+8, tD);
  }
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
-                                                  _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12, _t13, _t14; \
-        __m128d _tA, _tB, _tC, _tD, _tE; \
-        _t1          = _mm256_loadu_pd(ptrA); \
-        _t2          = _mm256_loadu_pd(ptrB); \
-        _t3          = _mm256_loadu_pd(ptrC); \
-        _t4          = _mm256_loadu_pd(ptrD); \
-        _t5          = _mm256_loadu_pd(ptrA+4); \
-        _t6          = _mm256_loadu_pd(ptrB+4); \
-        _t7          = _mm256_loadu_pd(ptrC+4); \
-        _t8          = _mm256_loadu_pd(ptrD+4); \
-        _t9          = _mm256_loadu_pd(ptrA+8); \
-        _t10         = _mm256_loadu_pd(ptrB+8); \
-        _t11         = _mm256_loadu_pd(ptrC+8); \
-        _t12         = _mm256_loadu_pd(ptrD+8); \
-        _t13         = _mm256_unpacklo_pd(_x1, _y1); \
-        _x1          = _mm256_unpackhi_pd(_x1, _y1); \
-        _y1          = _mm256_unpacklo_pd(_z1, _x2); \
-        _z1          = _mm256_unpackhi_pd(_z1, _x2); \
-        _x2          = _mm256_unpacklo_pd(_y2, _z2); \
-        _y2          = _mm256_unpackhi_pd(_y2, _z2); \
-        _z2          = _mm256_unpacklo_pd(_x3, _y3); \
-        _x3          = _mm256_unpackhi_pd(_x3, _y3); \
-        _y3          = _mm256_unpacklo_pd(_z3, _x4); \
-        _z3          = _mm256_unpackhi_pd(_z3, _x4); \
-        _x4          = _mm256_unpacklo_pd(_y4, _z4); \
-        _y4          = _mm256_unpackhi_pd(_y4, _z4); \
-        _z4          = gmx_mm256_unpack128lo_pd(_t13, _y1); \
-        _t13         = gmx_mm256_unpack128hi_pd(_t13, _y1); \
-        _y1          = gmx_mm256_unpack128lo_pd(_x1, _z1); \
-        _x1          = gmx_mm256_unpack128hi_pd(_x1, _z1); \
-        _z1          = gmx_mm256_unpack128lo_pd(_x2, _z2); \
-        _x2          = gmx_mm256_unpack128hi_pd(_x2, _z2); \
-        _z2          = gmx_mm256_unpack128lo_pd(_y2, _x3); \
-        _y2          = gmx_mm256_unpack128hi_pd(_y2, _x3); \
-        _x3          = gmx_mm256_unpack128lo_pd(_y3, _x4); \
-        _y3          = gmx_mm256_unpack128hi_pd(_y3, _x4); \
-        _x4          = gmx_mm256_unpack128lo_pd(_z3, _y4); \
-        _z3          = gmx_mm256_unpack128hi_pd(_z3, _y4); \
-        _t1          = _mm256_sub_pd(_t1, _z4); \
-        _t2          = _mm256_sub_pd(_t2, _y1); \
-        _t3          = _mm256_sub_pd(_t3, _t13); \
-        _t4          = _mm256_sub_pd(_t4, _x1); \
-        _t5          = _mm256_sub_pd(_t5, _z1); \
-        _t6          = _mm256_sub_pd(_t6, _z2); \
-        _t7          = _mm256_sub_pd(_t7, _x2); \
-        _t8          = _mm256_sub_pd(_t8, _y2); \
-        _t9          = _mm256_sub_pd(_t9, _x3); \
-        _t10         = _mm256_sub_pd(_t10, _x4); \
-        _t11         = _mm256_sub_pd(_t11, _y3); \
-        _t12         = _mm256_sub_pd(_t12, _z3); \
-        _mm256_storeu_pd(ptrA, _t1); \
-        _mm256_storeu_pd(ptrB, _t2); \
-        _mm256_storeu_pd(ptrC, _t3); \
-        _mm256_storeu_pd(ptrD, _t4); \
-        _mm256_storeu_pd(ptrA+4, _t5); \
-        _mm256_storeu_pd(ptrB+4, _t6); \
-        _mm256_storeu_pd(ptrC+4, _t7); \
-        _mm256_storeu_pd(ptrD+4, _t8); \
-        _mm256_storeu_pd(ptrA+8, _t9); \
-        _mm256_storeu_pd(ptrB+8, _t10); \
-        _mm256_storeu_pd(ptrC+8, _t11); \
-        _mm256_storeu_pd(ptrD+8, _t12); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+
+
+static gmx_inline void gmx_simdcall
  gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                            double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                            __m256d x1, __m256d y1, __m256d z1,
@@ -907,13 +775,10 @@ gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * g
      _mm256_storeu_pd(ptrC+8, t11);
      _mm256_storeu_pd(ptrD+8, t12);
  }
-#endif
-
-
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                           double * gmx_restrict fptr,
                                           double * gmx_restrict fshiftptr)
@@ -941,52 +806,8 @@ gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
  
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
-                                                 fptr, fshiftptr) \
-    { \
-        __m256d _t1, _t2, _t3, _t4; \
-        __m128d _tz3, _tA, _tB, _tC, _tD; \
-        fix1  = _mm256_hadd_pd(fix1, fiy1); \
-        fiz1  = _mm256_hadd_pd(fiz1, fix2); \
-        fiy2  = _mm256_hadd_pd(fiy2, fiz2); \
-        fix3  = _mm256_hadd_pd(fix3, fiy3); \
-        fiz3  = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); \
-        _t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
-        _t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
-        _t1   = _mm256_add_pd(_t1, _t2); \
-        _t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
-        _t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
-        _t3   = _mm256_add_pd(_t3, _t4); \
-        _tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); \
-        _t2   = _mm256_loadu_pd(fptr); \
-        _t4   = _mm256_loadu_pd(fptr+4); \
-        _tA   = _mm_load_sd(fptr+8); \
-        _t2   = _mm256_add_pd(_t2, _t1); \
-        _t4   = _mm256_add_pd(_t4, _t3); \
-        _tA   = _mm_add_sd(_tA, _tz3); \
-        _mm256_storeu_pd(fptr, _t2); \
-        _mm256_storeu_pd(fptr+4, _t4); \
-        _mm_store_sd(fptr+8, _tA); \
-        _tB   = _mm256_extractf128_pd(_t1, 0x1); \
-        _tC   = _mm256_extractf128_pd(_t3, 0x1); \
-        _tz3  = _mm_add_sd(_tz3, _tB); \
-        _tD   = _mm_permute_pd(_mm256_castpd256_pd128(_t3), _GMX_MM_PERMUTE128D(1, 1)); \
-        _tz3  = _mm_add_sd(_tz3, _tD); \
-        _tC   = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t1)); \
-        _tD   = _mm_shuffle_pd(_tB, _mm256_castpd256_pd128(_t3), _MM_SHUFFLE2(0, 1)); \
-        _tC   = _mm_add_pd(_tC, _tD); \
-        _tA   = _mm_loadu_pd(fshiftptr); \
-        _tB   = _mm_load_sd(fshiftptr+2); \
-        _tA   = _mm_add_pd(_tA, _tC); \
-        _tB   = _mm_add_sd(_tB, _tz3); \
-        _mm_storeu_pd(fshiftptr, _tA); \
-        _mm_store_sd(fshiftptr+2, _tB); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                           __m256d fix2, __m256d fiy2, __m256d fiz2,
                                           __m256d fix3, __m256d fiy3, __m256d fiz3,
@@ -1049,60 +870,9 @@ gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
      _mm_storeu_pd(fshiftptr, tA);
      _mm_store_sd(fshiftptr+2, tB);
  }
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
-                                                 fptr, fshiftptr) \
-    { \
-        __m256d _t1, _t2, _t3, _t4, _t5, _t6; \
-        __m128d _tA, _tB, _tC, _tD; \
-        fix1  = _mm256_hadd_pd(fix1, fiy1); \
-        fiz1  = _mm256_hadd_pd(fiz1, fix2); \
-        fiy2  = _mm256_hadd_pd(fiy2, fiz2); \
-        fix3  = _mm256_hadd_pd(fix3, fiy3); \
-        fiz3  = _mm256_hadd_pd(fiz3, fix4); \
-        fiy4  = _mm256_hadd_pd(fiy4, fiz4); \
-        _t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
-        _t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
-        _t1   = _mm256_add_pd(_t1, _t2); \
-        _t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
-        _t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
-        _t3   = _mm256_add_pd(_t3, _t4); \
-        _t5   = gmx_mm256_unpack128lo_pd(fiz3, fiy4); \
-        _t6   = gmx_mm256_unpack128hi_pd(fiz3, fiy4); \
-        _t5   = _mm256_add_pd(_t5, _t6); \
-        _t2   = _mm256_loadu_pd(fptr); \
-        _t4   = _mm256_loadu_pd(fptr+4); \
-        _t6   = _mm256_loadu_pd(fptr+8); \
-        _t2   = _mm256_add_pd(_t2, _t1); \
-        _t4   = _mm256_add_pd(_t4, _t3); \
-        _t6   = _mm256_add_pd(_t6, _t5); \
-        _mm256_storeu_pd(fptr, _t2); \
-        _mm256_storeu_pd(fptr+4, _t4); \
-        _mm256_storeu_pd(fptr+8, _t6); \
-        _tA   = _mm256_extractf128_pd(_t1, 0x1); \
-        _tB   = _mm256_extractf128_pd(_t3, 0x1); \
-        _tC   = _mm256_extractf128_pd(_t5, 0x1); \
-        _tB   = _mm_add_pd(_tB, _mm256_castpd256_pd128(_t1)); \
-        _tA   = _mm_add_pd(_tA, _mm256_castpd256_pd128(_t5)); \
-        _tC   = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t3)); \
-        _tD   = _mm_shuffle_pd(_tA, _tC, _MM_SHUFFLE2(0, 1)); \
-        _tB   = _mm_add_pd(_tB, _tD); \
-        _tC   = _mm_permute_pd(_tC, _GMX_MM_PERMUTE128D(1, 1)); \
-        _tC   = _mm_add_sd(_tC, _tA); \
-        _tA   = _mm_loadu_pd(fshiftptr); \
-        _tD   = _mm_load_sd(fshiftptr+2); \
-        _tA   = _mm_add_pd(_tA, _tB); \
-        _tD   = _mm_add_sd(_tD, _tC); \
-        _mm_storeu_pd(fshiftptr, _tA); \
-        _mm_store_sd(fshiftptr+2, _tD); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
  gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                           __m256d fix2, __m256d fiy2, __m256d fiz2,
                                           __m256d fix3, __m256d fiy3, __m256d fiz3,
@@ -1171,11 +941,9 @@ gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz
      _mm_storeu_pd(fshiftptr, tA);
      _mm_store_sd(fshiftptr+2, tD);
  }
-#endif
-
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
  {
      __m128d t1;
@@ -1187,7 +955,7 @@ gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
      _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
  }
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
                           __m256d pot2, double * gmx_restrict ptrB)
  {
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h

index d4f041b119e5755a7ccf02075c82f7d892735d9b..171e1653ecbc0c36e4857d3d394d316bc66614e9 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h
@@ -37,19 +37,19 @@
  
  #define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
  
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_mm256_unpack128lo_ps(__m256 xmm1, __m256 xmm2)
  {
      return _mm256_permute2f128_ps(xmm1, xmm2, 0x20);
  }
  
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_mm256_unpack128hi_ps(__m256 xmm1, __m256 xmm2)
  {
      return _mm256_permute2f128_ps(xmm1, xmm2, 0x31);
  }
  
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_mm256_set_m128(__m128 hi, __m128 lo)
  {
      return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
@@ -83,7 +83,7 @@ gmx_mm256_set_m128(__m128 hi, __m128 lo)
  }
  
  
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz)
  {
      return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy) ), _mm256_mul_ps(dz, dz) );
@@ -93,14 +93,14 @@ gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz)
  #define gmx_mm256_sum4_ps(t0, t1, t2, t3)  _mm256_add_ps(_mm256_add_ps(t0, t1), _mm256_add_ps(t2, t3))
  
  
-static gmx_inline int
+static gmx_inline int gmx_simdcall
  gmx_mm256_any_lt(__m256 a, __m256 b)
  {
      return _mm256_movemask_ps(_mm256_cmp_ps(a, b, _CMP_LT_OQ));
  }
  
  
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_mm256_load_4real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD)
  {
@@ -112,7 +112,7 @@ gmx_mm256_load_4real_swizzle_ps(const float * gmx_restrict ptrA, const float * g
  }
  
  
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_mm256_load_8real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
@@ -128,7 +128,7 @@ gmx_mm256_load_8real_swizzle_ps(const float * gmx_restrict ptrA, const float * g
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_store_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                   float * gmx_restrict ptrC, float * gmx_restrict ptrD, __m256 xmm1)
  {
@@ -144,7 +144,7 @@ gmx_mm256_store_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_store_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                   float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                   float * gmx_restrict ptrE, float * gmx_restrict ptrF,
@@ -159,7 +159,7 @@ gmx_mm256_store_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_increment_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m256 xmm1)
@@ -182,7 +182,7 @@ gmx_mm256_increment_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_rest
      _mm_store_ss(ptrD, t4);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_increment_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       float * gmx_restrict ptrE, float * gmx_restrict ptrF,
@@ -198,7 +198,7 @@ gmx_mm256_increment_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_rest
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_4pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
                                  const float * gmx_restrict p3, const float * gmx_restrict p4,
                                  __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
@@ -217,7 +217,7 @@ gmx_mm256_load_4pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx
      *c12 = _mm256_castps128_ps256(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)));
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
                                  const float * gmx_restrict p3, const float * gmx_restrict p4,
                                  const float * gmx_restrict p5, const float * gmx_restrict p6,
@@ -234,7 +234,7 @@ gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                              const float * gmx_restrict xyz,
                                              __m256 * gmx_restrict      x1,
@@ -260,7 +260,7 @@ gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                              const float * gmx_restrict xyz,
                                              __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
@@ -308,7 +308,7 @@ gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                              const float * gmx_restrict xyz,
                                              __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
@@ -364,7 +364,7 @@ gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                       const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                       __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
@@ -382,7 +382,7 @@ gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                       const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                       __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
@@ -419,7 +419,7 @@ gmx_mm256_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                       const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                       __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
@@ -458,7 +458,7 @@ gmx_mm256_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                       const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                       const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
@@ -484,7 +484,7 @@ gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                       const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                       const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
@@ -546,7 +546,7 @@ gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                       const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                       const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
@@ -612,7 +612,7 @@ gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                            float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                            __m256 x1, __m256 y1, __m256 z1)
@@ -647,58 +647,8 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      gmx_mm_maskstore_ps(ptrD, mask, t8);
  }
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
-                                                  x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-    { \
-        __m256 _t1, _t2, _t3, _t4, _t5, _t6; \
-        __m128 _tA, _tB, _tC, _tD; \
-\
-        _t1         = _mm256_loadu_ps(ptrA); \
-        _t2         = _mm256_loadu_ps(ptrB); \
-        _t3         = _mm256_loadu_ps(ptrC); \
-        _t4         = _mm256_loadu_ps(ptrD); \
-        _tA         = _mm_load_ss(ptrA+8); \
-        _tB         = _mm_load_ss(ptrB+8); \
-        _tC         = _mm_load_ss(ptrC+8); \
-        _tD         = _mm_load_ss(ptrD+8); \
-        _t5         = _mm256_unpacklo_ps(x1, y1); \
-        x1          = _mm256_unpackhi_ps(x1, y1); \
-        y1          = _mm256_unpacklo_ps(z1, x2); \
-        z1          = _mm256_unpackhi_ps(z1, x2); \
-        x2          = _mm256_unpacklo_ps(y2, z2); \
-        y2          = _mm256_unpackhi_ps(y2, z2); \
-        _t6         = _mm256_unpacklo_ps(x3, y3); \
-        x3          = _mm256_unpackhi_ps(x3, y3); \
-        _t5         = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
-        x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
-        y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1); \
-        z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
-        z2          = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t5         = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
-        y1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
-        x1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _t1         = _mm256_sub_ps(_t1, z2); \
-        _t2         = _mm256_sub_ps(_t2, _t5); \
-        _t3         = _mm256_sub_ps(_t3, y1); \
-        _t4         = _mm256_sub_ps(_t4, x1); \
-        _tA         = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3)); \
-        _tB         = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(1, 1, 1, 1))); \
-        _tC         = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(2, 2, 2, 2))); \
-        _tD         = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(3, 3, 3, 3))); \
-        _mm256_storeu_ps(ptrA, _t1); \
-        _mm256_storeu_ps(ptrB, _t2); \
-        _mm256_storeu_ps(ptrC, _t3); \
-        _mm256_storeu_ps(ptrD, _t4); \
-        _mm_store_ss(ptrA+8, _tA); \
-        _mm_store_ss(ptrB+8, _tB); \
-        _mm_store_ss(ptrC+8, _tC); \
-        _mm_store_ss(ptrD+8, _tD); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                            float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                            __m256 x1, __m256 y1, __m256 z1,
@@ -758,70 +708,9 @@ gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      _mm_store_ss(ptrC+8, tC);
      _mm_store_ss(ptrD+8, tD);
  }
-#endif
  
  
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
-                                                  x1, y1, z1, x2, y2, z2, x3, y3, z3, x4, y4, z4) \
-    { \
-        __m256 _t1, _t2, _t3, _t4, _t5; \
-        __m128 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH; \
-\
-        _t1         = _mm256_loadu_ps(ptrA); \
-        _t2         = _mm256_loadu_ps(ptrB); \
-        _t3         = _mm256_loadu_ps(ptrC); \
-        _t4         = _mm256_loadu_ps(ptrD); \
-        _tA         = _mm_loadu_ps(ptrA+8); \
-        _tB         = _mm_loadu_ps(ptrB+8); \
-        _tC         = _mm_loadu_ps(ptrC+8); \
-        _tD         = _mm_loadu_ps(ptrD+8); \
-        _t5         = _mm256_unpacklo_ps(x1, y1); \
-        x1          = _mm256_unpackhi_ps(x1, y1); \
-        y1          = _mm256_unpacklo_ps(z1, x2); \
-        z1          = _mm256_unpackhi_ps(z1, x2); \
-        x2          = _mm256_unpacklo_ps(y2, z2); \
-        y2          = _mm256_unpackhi_ps(y2, z2); \
-        z2          = _mm256_unpacklo_ps(x3, y3); \
-        x3          = _mm256_unpackhi_ps(x3, y3); \
-        y3          = _mm256_unpacklo_ps(z3, x4); \
-        z3          = _mm256_unpackhi_ps(z3, x4); \
-        x4          = _mm256_unpacklo_ps(y4, z4); \
-        y4          = _mm256_unpackhi_ps(y4, z4); \
-        x2          = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
-        x1          = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
-        y1          = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1); \
-        z1          = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
-        z2          = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t5         = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
-        y1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
-        x1          = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _tE         = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(1, 0, 1, 0)); \
-        _tF         = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(3, 2, 3, 2)); \
-        _tG         = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(1, 0, 1, 0)); \
-        _tH         = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(3, 2, 3, 2)); \
-        _t1         = _mm256_sub_ps(_t1, z2); \
-        _t2         = _mm256_sub_ps(_t2, _t5); \
-        _t3         = _mm256_sub_ps(_t3, y1); \
-        _t4         = _mm256_sub_ps(_t4, x1); \
-        _tA         = _mm_sub_ps(_tA, _tE); \
-        _tB         = _mm_sub_ps(_tB, _tF); \
-        _tC         = _mm_sub_ps(_tC, _tG); \
-        _tD         = _mm_sub_ps(_tD, _tH); \
-        _mm256_storeu_ps(ptrA, _t1); \
-        _mm256_storeu_ps(ptrB, _t2); \
-        _mm256_storeu_ps(ptrC, _t3); \
-        _mm256_storeu_ps(ptrD, _t4); \
-        _mm_storeu_ps(ptrA+8, _tA); \
-        _mm_storeu_ps(ptrB+8, _tB); \
-        _mm_storeu_ps(ptrC+8, _tC); \
-        _mm_storeu_ps(ptrD+8, _tD); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                            float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                            __m256 x1, __m256 y1, __m256 z1,
@@ -892,10 +781,9 @@ gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      _mm_storeu_ps(ptrC+8, tC);
      _mm_storeu_ps(ptrD+8, tD);
  }
-#endif
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                            float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                            float * gmx_restrict ptrE, float * gmx_restrict ptrF,
@@ -938,84 +826,7 @@ gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
  
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
-        __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
-\
-        _tA         = _mm256_loadu_ps(ptrA); \
-        _tB         = _mm256_loadu_ps(ptrB); \
-        _tC         = _mm256_loadu_ps(ptrC); \
-        _tD         = _mm256_loadu_ps(ptrD); \
-        _tE         = _mm256_loadu_ps(ptrE); \
-        _tF         = _mm256_loadu_ps(ptrF); \
-        _tG         = _mm256_loadu_ps(ptrG); \
-        _tH         = _mm256_loadu_ps(ptrH); \
-        _t1         = _mm256_unpacklo_ps(_x1, _y1); \
-        _t2         = _mm256_unpackhi_ps(_x1, _y1); \
-        _t3         = _mm256_unpacklo_ps(_z1, _x2); \
-        _t4         = _mm256_unpackhi_ps(_z1, _x2); \
-        _t5         = _mm256_unpacklo_ps(_y2, _z2); \
-        _t6         = _mm256_unpackhi_ps(_y2, _z2); \
-        _t7         = _mm256_unpacklo_ps(_x3, _y3); \
-        _t8         = _mm256_unpackhi_ps(_x3, _y3); \
-        _t9         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t10        = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _t11        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t12        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _t1         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t2         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _t3         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t4         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _t5         = gmx_mm256_unpack128lo_ps(_t9, _t1); \
-        _t6         = gmx_mm256_unpack128hi_ps(_t9, _t1); \
-        _t7         = gmx_mm256_unpack128lo_ps(_t10, _t2); \
-        _t8         = gmx_mm256_unpack128hi_ps(_t10, _t2); \
-        _t1         = gmx_mm256_unpack128lo_ps(_t11, _t3); \
-        _t2         = gmx_mm256_unpack128hi_ps(_t11, _t3); \
-        _t9         = gmx_mm256_unpack128lo_ps(_t12, _t4); \
-        _t10        = gmx_mm256_unpack128hi_ps(_t12, _t4); \
-        _tA         = _mm256_sub_ps(_tA, _t5); \
-        _tB         = _mm256_sub_ps(_tB, _t7); \
-        _tC         = _mm256_sub_ps(_tC, _t1); \
-        _tD         = _mm256_sub_ps(_tD, _t9); \
-        _tE         = _mm256_sub_ps(_tE, _t6); \
-        _tF         = _mm256_sub_ps(_tF, _t8); \
-        _tG         = _mm256_sub_ps(_tG, _t2); \
-        _tH         = _mm256_sub_ps(_tH, _t10); \
-        _mm256_storeu_ps(ptrA, _tA); \
-        _mm256_storeu_ps(ptrB, _tB); \
-        _mm256_storeu_ps(ptrC, _tC); \
-        _mm256_storeu_ps(ptrD, _tD); \
-        _mm256_storeu_ps(ptrE, _tE); \
-        _mm256_storeu_ps(ptrF, _tF); \
-        _mm256_storeu_ps(ptrG, _tG); \
-        _mm256_storeu_ps(ptrH, _tH); \
-        _tI         = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8)); \
-        _tJ         = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8)); \
-        _tK         = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8)); \
-        _tL         = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8)); \
-        _tI         = _mm256_unpacklo_ps(_tI, _tK); \
-        _tJ         = _mm256_unpacklo_ps(_tJ, _tL); \
-        _tI         = _mm256_unpacklo_ps(_tI, _tJ); \
-        _tI         = _mm256_sub_ps(_tI, _z3); \
-        _tJ         = _mm256_permute_ps(_tI, _MM_SHUFFLE(1, 1, 1, 1)); \
-        _tK         = _mm256_permute_ps(_tI, _MM_SHUFFLE(2, 2, 2, 2)); \
-        _tL         = _mm256_permute_ps(_tI, _MM_SHUFFLE(3, 3, 3, 3)); \
-        _mm_store_ss(ptrA+8, _mm256_castps256_ps128(_tI)); \
-        _mm_store_ss(ptrB+8, _mm256_castps256_ps128(_tJ)); \
-        _mm_store_ss(ptrC+8, _mm256_castps256_ps128(_tK)); \
-        _mm_store_ss(ptrD+8, _mm256_castps256_ps128(_tL)); \
-        _mm_store_ss(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
-        _mm_store_ss(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
-        _mm_store_ss(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
-        _mm_store_ss(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                            float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                            float * gmx_restrict ptrE, float * gmx_restrict ptrF,
@@ -1107,94 +918,9 @@ gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      _mm_store_ss(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
      _mm_store_ss(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
  }
-#endif
-
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, \
-                                                  _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
-        __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
-\
-        _tA         = _mm256_loadu_ps(ptrA); \
-        _tB         = _mm256_loadu_ps(ptrB); \
-        _tC         = _mm256_loadu_ps(ptrC); \
-        _tD         = _mm256_loadu_ps(ptrD); \
-        _tE         = _mm256_loadu_ps(ptrE); \
-        _tF         = _mm256_loadu_ps(ptrF); \
-        _tG         = _mm256_loadu_ps(ptrG); \
-        _tH         = _mm256_loadu_ps(ptrH); \
-        _t1         = _mm256_unpacklo_ps(_x1, _y1); \
-        _t2         = _mm256_unpackhi_ps(_x1, _y1); \
-        _t3         = _mm256_unpacklo_ps(_z1, _x2); \
-        _t4         = _mm256_unpackhi_ps(_z1, _x2); \
-        _t5         = _mm256_unpacklo_ps(_y2, _z2); \
-        _t6         = _mm256_unpackhi_ps(_y2, _z2); \
-        _t7         = _mm256_unpacklo_ps(_x3, _y3); \
-        _t8         = _mm256_unpackhi_ps(_x3, _y3); \
-        _t9         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t10        = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _t11        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t12        = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _t1         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t2         = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _t3         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t4         = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _t5         = gmx_mm256_unpack128lo_ps(_t9, _t1); \
-        _t6         = gmx_mm256_unpack128hi_ps(_t9, _t1); \
-        _t7         = gmx_mm256_unpack128lo_ps(_t10, _t2); \
-        _t8         = gmx_mm256_unpack128hi_ps(_t10, _t2); \
-        _t1         = gmx_mm256_unpack128lo_ps(_t11, _t3); \
-        _t2         = gmx_mm256_unpack128hi_ps(_t11, _t3); \
-        _t9         = gmx_mm256_unpack128lo_ps(_t12, _t4); \
-        _t10        = gmx_mm256_unpack128hi_ps(_t12, _t4); \
-        _tA         = _mm256_sub_ps(_tA, _t5); \
-        _tB         = _mm256_sub_ps(_tB, _t7); \
-        _tC         = _mm256_sub_ps(_tC, _t1); \
-        _tD         = _mm256_sub_ps(_tD, _t9); \
-        _tE         = _mm256_sub_ps(_tE, _t6); \
-        _tF         = _mm256_sub_ps(_tF, _t8); \
-        _tG         = _mm256_sub_ps(_tG, _t2); \
-        _tH         = _mm256_sub_ps(_tH, _t10); \
-        _mm256_storeu_ps(ptrA, _tA); \
-        _mm256_storeu_ps(ptrB, _tB); \
-        _mm256_storeu_ps(ptrC, _tC); \
-        _mm256_storeu_ps(ptrD, _tD); \
-        _mm256_storeu_ps(ptrE, _tE); \
-        _mm256_storeu_ps(ptrF, _tF); \
-        _mm256_storeu_ps(ptrG, _tG); \
-        _mm256_storeu_ps(ptrH, _tH); \
-        _tI         = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8)); \
-        _tJ         = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8)); \
-        _tK         = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8)); \
-        _tL         = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8)); \
-        _t1         = _mm256_unpacklo_ps(_z3, _x4); \
-        _t2         = _mm256_unpackhi_ps(_z3, _x4); \
-        _t3         = _mm256_unpacklo_ps(_y4, _z4); \
-        _t4         = _mm256_unpackhi_ps(_y4, _z4); \
-        _t5         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t6         = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _t7         = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
-        _t8         = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
-        _tI         = _mm256_sub_ps(_tI, _t5); \
-        _tJ         = _mm256_sub_ps(_tJ, _t6); \
-        _tK         = _mm256_sub_ps(_tK, _t7); \
-        _tL         = _mm256_sub_ps(_tL, _t8); \
-        _mm_storeu_ps(ptrA+8, _mm256_castps256_ps128(_tI)); \
-        _mm_storeu_ps(ptrB+8, _mm256_castps256_ps128(_tJ)); \
-        _mm_storeu_ps(ptrC+8, _mm256_castps256_ps128(_tK)); \
-        _mm_storeu_ps(ptrD+8, _mm256_castps256_ps128(_tL)); \
-        _mm_storeu_ps(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
-        _mm_storeu_ps(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
-        _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
-        _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                            float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                            float * gmx_restrict ptrE, float * gmx_restrict ptrF,
@@ -1293,10 +1019,9 @@ gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
      _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
      _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
  }
-#endif
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                           float * gmx_restrict fptr,
                                           float * gmx_restrict fshiftptr)
@@ -1324,48 +1049,8 @@ gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
      _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
  }
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
-                                                 fptr, fshiftptr) \
-    { \
-        __m256 _t1, _t2, _t3; \
-        __m128 _tA, _tB, _tC; \
-\
-        fix1 = _mm256_hadd_ps(fix1, fiy1); \
-        fiz1 = _mm256_hadd_ps(fiz1, fix2); \
-        fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
-        fix3 = _mm256_hadd_ps(fix3, fiy3); \
-        fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
-        fix1 = _mm256_hadd_ps(fix1, fiz1); \
-        fiy2 = _mm256_hadd_ps(fiy2, fix3); \
-        fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
-\
-        _t1  = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
-        _t2  = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
-        _t1  = _mm256_add_ps(_t1, _t2); \
-        _tA  = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
-        _t3  = _mm256_loadu_ps(fptr); \
-        _t3  = _mm256_add_ps(_t3, _t1); \
-        _mm256_storeu_ps(fptr, _t3); \
-        _tB  = _mm_load_ss(fptr+8); \
-        _tB  = _mm_add_ss(_tB, _tA); \
-        _mm_store_ss(fptr+8, _tB); \
-\
-        _tB  = _mm256_extractf128_ps(_t1, 0x1); \
-        _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
-        _tB  = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
-        _tC  = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
-        _tB  = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
-        _tA  = _mm_add_ps(_tB, _tC); \
-        _tA  = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
-        _tC  = _mm_loadu_ps(fshiftptr); \
-        _tC  = _mm_add_ps(_tC, _tA); \
-        _mm_storeu_ps(fshiftptr, _tC); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                           __m256 fix2, __m256 fiy2, __m256 fiz2,
                                           __m256 fix3, __m256 fiy3, __m256 fiz3,
@@ -1414,55 +1099,9 @@ gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
      tC   = _mm_add_ps(tC, tA);
      _mm_storeu_ps(fshiftptr, tC);
  }
-#endif
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
-                                                 fptr, fshiftptr) \
-    { \
-        __m256 _t1, _t2, _t3; \
-        __m128 _tA, _tB, _tC; \
-\
-        fix1 = _mm256_hadd_ps(fix1, fiy1); \
-        fiz1 = _mm256_hadd_ps(fiz1, fix2); \
-        fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
-        fix3 = _mm256_hadd_ps(fix3, fiy3); \
-        fiz3 = _mm256_hadd_ps(fiz3, fix4); \
-        fiy4 = _mm256_hadd_ps(fiy4, fiz4); \
-\
-        fix1 = _mm256_hadd_ps(fix1, fiz1); \
-        fiy2 = _mm256_hadd_ps(fiy2, fix3); \
-        fiz3 = _mm256_hadd_ps(fiz3, fiy4); \
-\
-        _t1  = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
-        _t2  = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
-        _t1  = _mm256_add_ps(_t1, _t2); \
-        _tA  = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
-        _t3  = _mm256_loadu_ps(fptr); \
-        _t3  = _mm256_add_ps(_t3, _t1); \
-        _mm256_storeu_ps(fptr, _t3); \
-        _tB  = _mm_loadu_ps(fptr+8); \
-        _tB  = _mm_add_ps(_tB, _tA); \
-        _mm_storeu_ps(fptr+8, _tB); \
-\
-        _tB  = _mm256_extractf128_ps(_t1, 0x1); \
-        _tC  = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
-        _tB  = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
-        _tC  = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
-        _tA  = _mm_permute_ps(_tA, _MM_SHUFFLE(0, 3, 2, 1)); \
-        _tB  = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
-        _tA  = _mm_add_ps(_tA, _tC); \
-        _tA  = _mm_add_ps(_tA, _tB); \
-        _tA  = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
-        _tC  = _mm_loadu_ps(fshiftptr); \
-        _tC  = _mm_add_ps(_tC, _tA); \
-        _mm_storeu_ps(fshiftptr, _tC); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                           __m256 fix2, __m256 fiy2, __m256 fiz2,
                                           __m256 fix3, __m256 fiy3, __m256 fiz3,
@@ -1516,11 +1155,9 @@ gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
      tC   = _mm_add_ps(tC, tA);
      _mm_storeu_ps(fshiftptr, tC);
  }
-#endif
-
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_update_1pot_ps(__m256 pot1, float * gmx_restrict ptrA)
  {
      __m128 t1;
@@ -1533,7 +1170,7 @@ gmx_mm256_update_1pot_ps(__m256 pot1, float * gmx_restrict ptrA)
      _mm_store_ss(ptrA, _mm_add_ss(_mm_load_ss(ptrA), t1));
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
                           __m256 pot2, float * gmx_restrict ptrB)
  {
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h

index d14ab9862b85e562a429f31e6ca666f000a69e66..35c2f1e9a915b196af25f720d40c74589e3982a9 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h
@@ -52,13 +52,13 @@
          row1           = _mm_unpackhi_pd(__gmx_t1, row1); \
  }
  
-static int
+static gmx_inline int gmx_simdcall
  gmx_mm_any_lt(__m128d a, __m128d b)
  {
      return _mm_movemask_pd(_mm_cmplt_pd(a, b));
  }
  
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
  {
      return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
@@ -66,21 +66,21 @@ gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
  
  
  /* Load a double value from 1-2 places, merge into xmm register */
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                               const double * gmx_restrict ptrB)
  {
      return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
  }
  
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
  {
      return _mm_load_sd(ptrA);
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                                double * gmx_restrict ptrB,
                                __m128d               xmm1)
@@ -92,7 +92,7 @@ gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
      _mm_store_sd(ptrB, t2);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  {
      _mm_store_sd(ptrA, xmm1);
@@ -100,7 +100,7 @@ gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  
  
  /* Similar to store, but increments value in memory */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                    double * gmx_restrict ptrB, __m128d xmm1)
  {
@@ -113,7 +113,7 @@ gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
      _mm_store_sd(ptrB, t1);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  {
      __m128d tmp;
@@ -124,7 +124,7 @@ gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                               const double * gmx_restrict p2,
                               __m128d * gmx_restrict      c6,
@@ -138,7 +138,7 @@ gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
      *c12 = _mm_unpackhi_pd(t1, t2);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                               __m128d * gmx_restrict      c6,
                               __m128d * gmx_restrict      c12)
@@ -149,7 +149,7 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                           const double * gmx_restrict xyz,
                                           __m128d * gmx_restrict      x1,
@@ -172,7 +172,7 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                           const double * gmx_restrict xyz,
                                           __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
@@ -210,7 +210,7 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                           const double * gmx_restrict xyz,
                                           __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
@@ -256,7 +256,7 @@ gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
  {
@@ -265,7 +265,7 @@ gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
      *z            = _mm_load_sd(p1+2);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -282,7 +282,7 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
      *z3            = _mm_load_sd(p1+8);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -304,7 +304,7 @@ gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                    const double * gmx_restrict ptrB,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
@@ -321,7 +321,7 @@ gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -354,7 +354,7 @@ gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -396,7 +396,7 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
  
  
  /* Routines to decrement rvec in memory, typically use for j particle force updates */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
                                           __m128d xy, __m128d z)
  {
@@ -413,7 +413,7 @@ gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1)
  {
@@ -432,34 +432,8 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
  }
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_load_sd(ptrA+8); \
-        _x1          = _mm_unpacklo_pd(_x1, _y1); \
-        _z1          = _mm_unpacklo_pd(_z1, _x2); \
-        _y2          = _mm_unpacklo_pd(_y2, _z2); \
-        _x3          = _mm_unpacklo_pd(_x3, _y3); \
-        _t1          = _mm_sub_pd(_t1, _x1); \
-        _t2          = _mm_sub_pd(_t2, _z1); \
-        _t3          = _mm_sub_pd(_t3, _y2); \
-        _t4          = _mm_sub_pd(_t4, _x3); \
-        _t5          = _mm_sub_sd(_t5, _z3); \
-        _mm_storeu_pd(ptrA, _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_store_sd(ptrA+8, _t5); \
-    }
-#else
  /* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -490,36 +464,8 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+6, t4);
      _mm_store_sd(ptrA+8, t5);
  }
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_loadu_pd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrA+10); \
-        _x1          = _mm_unpacklo_pd(_x1, _y1); \
-        _z1          = _mm_unpacklo_pd(_z1, _x2); \
-        _y2          = _mm_unpacklo_pd(_y2, _z2); \
-        _x3          = _mm_unpacklo_pd(_x3, _y3); \
-        _z3          = _mm_unpacklo_pd(_z3, _x4); \
-        _y4          = _mm_unpacklo_pd(_y4, _z4); \
-        _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1, _x1 )); \
-        _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2, _z1 )); \
-        _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3, _y2 )); \
-        _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4, _x3 )); \
-        _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5, _z3 )); \
-        _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -549,10 +495,9 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
      _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
  }
-#endif
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1)
  {
@@ -579,55 +524,9 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_store_sd(ptrB+2, t4);
  }
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
-        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_load_sd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrB); \
-        _t7          = _mm_loadu_pd(ptrB+2); \
-        _t8          = _mm_loadu_pd(ptrB+4); \
-        _t9          = _mm_loadu_pd(ptrB+6); \
-        _t10         = _mm_load_sd(ptrB+8); \
-        _tA          = _mm_unpacklo_pd(_x1, _y1); \
-        _tB          = _mm_unpackhi_pd(_x1, _y1); \
-        _tC          = _mm_unpacklo_pd(_z1, _x2); \
-        _tD          = _mm_unpackhi_pd(_z1, _x2); \
-        _tE          = _mm_unpacklo_pd(_y2, _z2); \
-        _tF          = _mm_unpackhi_pd(_y2, _z2); \
-        _tG          = _mm_unpacklo_pd(_x3, _y3); \
-        _tH          = _mm_unpackhi_pd(_x3, _y3); \
-        _tI          = _mm_unpackhi_pd(_z3, _z3); \
-        _t1          = _mm_sub_pd(_t1, _tA); \
-        _t2          = _mm_sub_pd(_t2, _tC); \
-        _t3          = _mm_sub_pd(_t3, _tE); \
-        _t4          = _mm_sub_pd(_t4, _tG); \
-        _t5          = _mm_sub_sd(_t5, _z3); \
-        _t6          = _mm_sub_pd(_t6, _tB); \
-        _t7          = _mm_sub_pd(_t7, _tD); \
-        _t8          = _mm_sub_pd(_t8, _tF); \
-        _t9          = _mm_sub_pd(_t9, _tH); \
-        _t10         = _mm_sub_sd(_t10, _tI); \
-        _mm_storeu_pd(ptrA, _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_store_sd(ptrA+8, _t5); \
-        _mm_storeu_pd(ptrB, _t6); \
-        _mm_storeu_pd(ptrB+2, _t7); \
-        _mm_storeu_pd(ptrB+4, _t8); \
-        _mm_storeu_pd(ptrB+6, _t9); \
-        _mm_store_sd(ptrB+8, _t10); \
-    }
-#else
+
  /* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -680,67 +579,8 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_storeu_pd(ptrB+6, t9);
      _mm_store_sd(ptrB+8, t10);
  }
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
-        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_loadu_pd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrA+10); \
-        _t7          = _mm_loadu_pd(ptrB); \
-        _t8          = _mm_loadu_pd(ptrB+2); \
-        _t9          = _mm_loadu_pd(ptrB+4); \
-        _t10         = _mm_loadu_pd(ptrB+6); \
-        _t11         = _mm_loadu_pd(ptrB+8); \
-        _t12         = _mm_loadu_pd(ptrB+10); \
-        _tA          = _mm_unpacklo_pd(_x1, _y1); \
-        _tB          = _mm_unpackhi_pd(_x1, _y1); \
-        _tC          = _mm_unpacklo_pd(_z1, _x2); \
-        _tD          = _mm_unpackhi_pd(_z1, _x2); \
-        _tE          = _mm_unpacklo_pd(_y2, _z2); \
-        _tF          = _mm_unpackhi_pd(_y2, _z2); \
-        _tG          = _mm_unpacklo_pd(_x3, _y3); \
-        _tH          = _mm_unpackhi_pd(_x3, _y3); \
-        _tI          = _mm_unpacklo_pd(_z3, _x4); \
-        _tJ          = _mm_unpackhi_pd(_z3, _x4); \
-        _tK          = _mm_unpacklo_pd(_y4, _z4); \
-        _tL          = _mm_unpackhi_pd(_y4, _z4); \
-        _t1          = _mm_sub_pd(_t1, _tA); \
-        _t2          = _mm_sub_pd(_t2, _tC); \
-        _t3          = _mm_sub_pd(_t3, _tE); \
-        _t4          = _mm_sub_pd(_t4, _tG); \
-        _t5          = _mm_sub_pd(_t5, _tI); \
-        _t6          = _mm_sub_pd(_t6, _tK); \
-        _t7          = _mm_sub_pd(_t7, _tB); \
-        _t8          = _mm_sub_pd(_t8, _tD); \
-        _t9          = _mm_sub_pd(_t9, _tF); \
-        _t10         = _mm_sub_pd(_t10, _tH); \
-        _t11         = _mm_sub_pd(_t11, _tJ); \
-        _t12         = _mm_sub_pd(_t12, _tL); \
-        _mm_storeu_pd(ptrA,  _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_storeu_pd(ptrA+8, _t5); \
-        _mm_storeu_pd(ptrA+10, _t6); \
-        _mm_storeu_pd(ptrB,  _t7); \
-        _mm_storeu_pd(ptrB+2, _t8); \
-        _mm_storeu_pd(ptrB+4, _t9); \
-        _mm_storeu_pd(ptrB+6, _t10); \
-        _mm_storeu_pd(ptrB+8, _t11); \
-        _mm_storeu_pd(ptrB+10, _t12); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -803,13 +643,9 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_storeu_pd(ptrB+8, t11);
      _mm_storeu_pd(ptrB+10, t12);
  }
-#endif
-
-
  
  
-
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        double * gmx_restrict fptr,
                                        double * gmx_restrict fshiftptr)
@@ -831,40 +667,7 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128d _t1, _t2; \
-        GMX_MM_TRANSPOSE2_PD(fix1, fiy1); \
-        GMX_MM_TRANSPOSE2_PD(fiz1, fix2); \
-        GMX_MM_TRANSPOSE2_PD(fiy2, fiz2); \
-        _t1  = fix3; \
-        fix3 = _mm_unpacklo_pd(fix3, fiy3); \
-        fiy3 = _mm_unpackhi_pd(_t1, fiy3); \
-        fix1 = _mm_add_pd(fix1, fiy1); \
-        fiz1 = _mm_add_pd(fiz1, fix2); \
-        fiy2 = _mm_add_pd(fiy2, fiz2); \
-        fix3 = _mm_add_pd(fix3, fiy3); \
-        fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3, fiz3)); \
-        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
-        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
-        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
-        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
-        _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
-        fix1  = _mm_add_pd(fix1, fix3); \
-        _t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
-        fix1  = _mm_add_pd(fix1, _t1); \
-        _t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
-        fiz1  = _mm_add_sd(fiz1, fiz3); \
-        fiz1  = _mm_add_sd(fiz1, _t2); \
-        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
-        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
                                        __m128d fix3, __m128d fiy3, __m128d fiz3,
@@ -905,47 +708,9 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128d _t1, _t2; \
-        GMX_MM_TRANSPOSE2_PD(fix1, fiy1); \
-        GMX_MM_TRANSPOSE2_PD(fiz1, fix2); \
-        GMX_MM_TRANSPOSE2_PD(fiy2, fiz2); \
-        GMX_MM_TRANSPOSE2_PD(fix3, fiy3); \
-        GMX_MM_TRANSPOSE2_PD(fiz3, fix4); \
-        GMX_MM_TRANSPOSE2_PD(fiy4, fiz4); \
-        fix1 = _mm_add_pd(fix1, fiy1); \
-        fiz1 = _mm_add_pd(fiz1, fix2); \
-        fiy2 = _mm_add_pd(fiy2, fiz2); \
-        fix3 = _mm_add_pd(fix3, fiy3); \
-        fiz3 = _mm_add_pd(fiz3, fix4); \
-        fiy4 = _mm_add_pd(fiy4, fiz4); \
-        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 )); \
-        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 )); \
-        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 )); \
-        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 )); \
-        _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 )); \
-        _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
-        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
-        fix1 = _mm_add_pd(fix1, _t1); \
-        _t2  = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
-        fix3 = _mm_add_pd(fix3, _t2); \
-        fix1 = _mm_add_pd(fix1, fix3); \
-        fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
-        fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
-        fiz1 = _mm_add_sd(fiz1, fiz3); \
-        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
-        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
                                        __m128d fix3, __m128d fiy3, __m128d fiz3,
@@ -990,17 +755,17 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
-#endif
  
  
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
  {
      pot1 = _mm_add_pd(pot1, _mm_unpackhi_pd(pot1, pot1));
      _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                        __m128d pot2, double * gmx_restrict ptrB)
  {
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h

index 0820cac76e054aa25f58eb6447a8a3bf9e305e03..820a37031a09824f89de94e2517f10227b1bce03 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
@@ -47,13 +47,13 @@
  /* Normal sum of four xmm registers */
  #define gmx_mm_sum4_ps(t0, t1, t2, t3)  _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
  
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
  {
      return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy) ), _mm_mul_ps(dz, dz) );
  }
  
-static int
+static gmx_inline int gmx_simdcall
  gmx_mm_any_lt(__m128 a, __m128 b)
  {
      return _mm_movemask_ps(_mm_cmplt_ps(a, b));
@@ -61,7 +61,7 @@ gmx_mm_any_lt(__m128 a, __m128 b)
  
  /* Load a single value from 1-4 places, merge into xmm register */
  
-static __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                               const float * gmx_restrict ptrB,
                               const float * gmx_restrict ptrC,
@@ -74,7 +74,7 @@ gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
      return _mm_unpacklo_ps(t1, t2);
  }
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                                float * gmx_restrict ptrB,
                                float * gmx_restrict ptrC,
@@ -93,7 +93,7 @@ gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
  }
  
  /* Similar to store, but increments value in memory */
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                    float * gmx_restrict ptrB,
                                    float * gmx_restrict ptrC,
@@ -107,7 +107,7 @@ gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                               const float * gmx_restrict p2,
                               const float * gmx_restrict p3,
@@ -134,7 +134,7 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
   */
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                           const float * gmx_restrict xyz,
                                           __m128 * gmx_restrict      x1,
@@ -156,7 +156,7 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                           const float * gmx_restrict xyz,
                                           __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
@@ -194,7 +194,7 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                           const float * gmx_restrict xyz,
                                           __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
@@ -236,7 +236,7 @@ gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                    const float * gmx_restrict ptrB,
                                    const float * gmx_restrict ptrC,
@@ -264,7 +264,7 @@ gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                    const float * gmx_restrict ptrB,
                                    const float * gmx_restrict ptrC,
@@ -302,7 +302,7 @@ gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                    const float * gmx_restrict ptrB,
                                    const float * gmx_restrict ptrC,
@@ -343,7 +343,7 @@ gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
  }
  
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
                                         float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC,
@@ -381,73 +381,7 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
  
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
-                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
-        __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
-        __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
-        _t13         = _mm_unpackhi_ps(_x1, _y1); \
-        _x1          = _mm_unpacklo_ps(_x1, _y1); \
-        _t14         = _mm_unpackhi_ps(_z1, _x2); \
-        _z1          = _mm_unpacklo_ps(_z1, _x2); \
-        _t15         = _mm_unpackhi_ps(_y2, _z2); \
-        _y2          = _mm_unpacklo_ps(_y2, _z2); \
-        _t16         = _mm_unpackhi_ps(_x3, _y3); \
-        _x3          = _mm_unpacklo_ps(_x3, _y3); \
-        _t17         = _mm_shuffle_ps(_z3, _z3, _MM_SHUFFLE(0, 0, 0, 1)); \
-        _t18         = _mm_movehl_ps(_z3, _z3); \
-        _t19         = _mm_shuffle_ps(_t18, _t18, _MM_SHUFFLE(0, 0, 0, 1)); \
-        _t20         = _mm_movelh_ps(_x1, _z1); \
-        _t21         = _mm_movehl_ps(_z1, _x1); \
-        _t22         = _mm_movelh_ps(_t13, _t14); \
-        _t14         = _mm_movehl_ps(_t14, _t13); \
-        _t23         = _mm_movelh_ps(_y2, _x3); \
-        _t24         = _mm_movehl_ps(_x3, _y2); \
-        _t25         = _mm_movelh_ps(_t15, _t16); \
-        _t16         = _mm_movehl_ps(_t16, _t15); \
-        _t1          = _mm_loadu_ps(ptrA); \
-        _t2          = _mm_loadu_ps(ptrA+4); \
-        _t3          = _mm_load_ss(ptrA+8); \
-        _t1          = _mm_sub_ps(_t1, _t20); \
-        _t2          = _mm_sub_ps(_t2, _t23); \
-        _t3          = _mm_sub_ss(_t3, _z3); \
-        _mm_storeu_ps(ptrA, _t1); \
-        _mm_storeu_ps(ptrA+4, _t2); \
-        _mm_store_ss(ptrA+8, _t3); \
-        _t4          = _mm_loadu_ps(ptrB); \
-        _t5          = _mm_loadu_ps(ptrB+4); \
-        _t6          = _mm_load_ss(ptrB+8); \
-        _t4          = _mm_sub_ps(_t4, _t21); \
-        _t5          = _mm_sub_ps(_t5, _t24); \
-        _t6          = _mm_sub_ss(_t6, _t17); \
-        _mm_storeu_ps(ptrB, _t4); \
-        _mm_storeu_ps(ptrB+4, _t5); \
-        _mm_store_ss(ptrB+8, _t6); \
-        _t7          = _mm_loadu_ps(ptrC); \
-        _t8          = _mm_loadu_ps(ptrC+4); \
-        _t9          = _mm_load_ss(ptrC+8); \
-        _t7          = _mm_sub_ps(_t7, _t22); \
-        _t8          = _mm_sub_ps(_t8, _t25); \
-        _t9          = _mm_sub_ss(_t9, _t18); \
-        _mm_storeu_ps(ptrC, _t7); \
-        _mm_storeu_ps(ptrC+4, _t8); \
-        _mm_store_ss(ptrC+8, _t9); \
-        _t10         = _mm_loadu_ps(ptrD); \
-        _t11         = _mm_loadu_ps(ptrD+4); \
-        _t12         = _mm_load_ss(ptrD+8); \
-        _t10         = _mm_sub_ps(_t10, _t14); \
-        _t11         = _mm_sub_ps(_t11, _t16); \
-        _t12         = _mm_sub_ss(_t12, _t19); \
-        _mm_storeu_ps(ptrD, _t10); \
-        _mm_storeu_ps(ptrD+4, _t11); \
-        _mm_store_ss(ptrD+8, _t12); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                         __m128 x1, __m128 y1, __m128 z1,
@@ -514,81 +448,8 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4, t11);
      _mm_store_ss(ptrD+8, t12);
  }
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
-                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
-        __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
-        __m128 _t23, _t24; \
-        _t13         = _mm_unpackhi_ps(_x1, _y1); \
-        _x1          = _mm_unpacklo_ps(_x1, _y1); \
-        _t14         = _mm_unpackhi_ps(_z1, _x2); \
-        _z1          = _mm_unpacklo_ps(_z1, _x2); \
-        _t15         = _mm_unpackhi_ps(_y2, _z2); \
-        _y2          = _mm_unpacklo_ps(_y2, _z2); \
-        _t16         = _mm_unpackhi_ps(_x3, _y3); \
-        _x3          = _mm_unpacklo_ps(_x3, _y3); \
-        _t17         = _mm_unpackhi_ps(_z3, _x4); \
-        _z3          = _mm_unpacklo_ps(_z3, _x4); \
-        _t18         = _mm_unpackhi_ps(_y4, _z4); \
-        _y4          = _mm_unpacklo_ps(_y4, _z4); \
-        _t19         = _mm_movelh_ps(_x1, _z1); \
-        _z1          = _mm_movehl_ps(_z1, _x1); \
-        _t20         = _mm_movelh_ps(_t13, _t14); \
-        _t14         = _mm_movehl_ps(_t14, _t13); \
-        _t21         = _mm_movelh_ps(_y2, _x3); \
-        _x3          = _mm_movehl_ps(_x3, _y2); \
-        _t22         = _mm_movelh_ps(_t15, _t16); \
-        _t16         = _mm_movehl_ps(_t16, _t15); \
-        _t23         = _mm_movelh_ps(_z3, _y4); \
-        _y4          = _mm_movehl_ps(_y4, _z3); \
-        _t24         = _mm_movelh_ps(_t17, _t18); \
-        _t18         = _mm_movehl_ps(_t18, _t17); \
-        _t1          = _mm_loadu_ps(ptrA); \
-        _t2          = _mm_loadu_ps(ptrA+4); \
-        _t3          = _mm_loadu_ps(ptrA+8); \
-        _t1          = _mm_sub_ps(_t1, _t19); \
-        _t2          = _mm_sub_ps(_t2, _t21); \
-        _t3          = _mm_sub_ps(_t3, _t23); \
-        _mm_storeu_ps(ptrA, _t1); \
-        _mm_storeu_ps(ptrA+4, _t2); \
-        _mm_storeu_ps(ptrA+8, _t3); \
-        _t4          = _mm_loadu_ps(ptrB); \
-        _t5          = _mm_loadu_ps(ptrB+4); \
-        _t6          = _mm_loadu_ps(ptrB+8); \
-        _t4          = _mm_sub_ps(_t4, _z1); \
-        _t5          = _mm_sub_ps(_t5, _x3); \
-        _t6          = _mm_sub_ps(_t6, _y4); \
-        _mm_storeu_ps(ptrB, _t4); \
-        _mm_storeu_ps(ptrB+4, _t5); \
-        _mm_storeu_ps(ptrB+8, _t6); \
-        _t7          = _mm_loadu_ps(ptrC); \
-        _t8          = _mm_loadu_ps(ptrC+4); \
-        _t9          = _mm_loadu_ps(ptrC+8); \
-        _t7          = _mm_sub_ps(_t7, _t20); \
-        _t8          = _mm_sub_ps(_t8, _t22); \
-        _t9          = _mm_sub_ps(_t9, _t24); \
-        _mm_storeu_ps(ptrC, _t7); \
-        _mm_storeu_ps(ptrC+4, _t8); \
-        _mm_storeu_ps(ptrC+8, _t9); \
-        _t10         = _mm_loadu_ps(ptrD); \
-        _t11         = _mm_loadu_ps(ptrD+4); \
-        _t12         = _mm_loadu_ps(ptrD+8); \
-        _t10         = _mm_sub_ps(_t10, _t14); \
-        _t11         = _mm_sub_ps(_t11, _t16); \
-        _t12         = _mm_sub_ps(_t12, _t18); \
-        _mm_storeu_ps(ptrD, _t10); \
-        _mm_storeu_ps(ptrD+4, _t11); \
-        _mm_storeu_ps(ptrD+8, _t12); \
-    }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                         __m128 x1, __m128 y1, __m128 z1,
@@ -660,10 +521,9 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4, t11);
      _mm_storeu_ps(ptrD+8, t12);
  }
-#endif
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        float * gmx_restrict fptr,
                                        float * gmx_restrict fshiftptr)
@@ -689,39 +549,7 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
  }
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128 _t1, _t2, _t3, _t4; \
-\
-        _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2); \
-        _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3); \
-        _t2   = _mm_movehl_ps(_mm_setzero_ps(), fiz3); \
-        _t1   = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(0, 0, 0, 1)); \
-        _t3   = _mm_shuffle_ps(_t2, _t2, _MM_SHUFFLE(0, 0, 0, 1)); \
-        fix1  = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2)); \
-        fiy2  = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3)); \
-        fiz3  = _mm_add_ss(_mm_add_ps(fiz3, _t1), _mm_add_ps(_t2, _t3)); \
-        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
-        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
-        _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
-        _t4 = _mm_load_ss(fshiftptr+2); \
-        _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
-        _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
-        _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
-        _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
-        _t3 = _mm_shuffle_ps(_t3, _t3, _MM_SHUFFLE(1, 2, 0, 0)); \
-        _t1 = _mm_add_ps(_t1, _t2); \
-        _t3 = _mm_add_ps(_t3, _t4); \
-        _t1 = _mm_add_ps(_t1, _t3); \
-        _mm_store_ss(fshiftptr+2, _t1); \
-        _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
                                        __m128 fix3, __m128 fiy3, __m128 fiz3,
@@ -760,40 +588,8 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_store_ss(fshiftptr+2, t1);
      _mm_storeh_pi((__m64 *)(fshiftptr), t1);
  }
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128 _t1, _t2, _t3, _t4, _t5; \
-        _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2); \
-        _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3); \
-        _MM_TRANSPOSE4_PS(fiz3, fix4, fiy4, fiz4); \
-        fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2)); \
-        fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3)); \
-        fiz3 = _mm_add_ps(_mm_add_ps(fiz3, fix4), _mm_add_ps(fiy4, fiz4)); \
-        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
-        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
-        _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
-        _t5 = _mm_load_ss(fshiftptr+2); \
-        _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
-        _t1 = _mm_shuffle_ps(fix1, fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
-        _t2 = _mm_shuffle_ps(fiy2, fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
-        _t3 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
-        _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
-        _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
-        _t1 = _mm_add_ps(_t1, _t2); \
-        _t3 = _mm_add_ps(_t3, _t4); \
-        _t1 = _mm_add_ps(_t1, _t3); \
-        _t5 = _mm_add_ps(_t5, _t1); \
-        _mm_store_ss(fshiftptr+2, _t5); \
-        _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
                                        __m128 fix3, __m128 fiy3, __m128 fiz3,
@@ -833,10 +629,10 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_store_ss(fshiftptr+2, t5);
      _mm_storeh_pi((__m64 *)(fshiftptr), t5);
  }
-#endif
  
  
-static void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
  {
      pot1 = _mm_add_ps(pot1, _mm_movehl_ps(_mm_setzero_ps(), pot1));
@@ -844,7 +640,7 @@ gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
      _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
  }
  
-static void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                        __m128 pot2, float * gmx_restrict ptrB)
  {
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h

index 9f759f310dde39d01f934c538dfe57834a63caf1..ccbb62750fad77879f722578b729f75610ec8935 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h
@@ -51,13 +51,13 @@
  /* Normal sum of four ymm registers */
  #define gmx_mm_sum4_pd(t0, t1, t2, t3)  _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
  
-static int
+static gmx_inline int gmx_simdcall
  gmx_mm_any_lt(__m128d a, __m128d b)
  {
      return _mm_movemask_pd(_mm_cmplt_pd(a, b));
  }
  
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
  {
      return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
@@ -65,21 +65,21 @@ gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
  
  
  /* Load a double value from 1-2 places, merge into xmm register */
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                               const double * gmx_restrict ptrB)
  {
      return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
  }
  
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
  {
      return _mm_load_sd(ptrA);
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                                double * gmx_restrict ptrB,
                                __m128d               xmm1)
@@ -91,7 +91,7 @@ gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
      _mm_store_sd(ptrB, t2);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  {
      _mm_store_sd(ptrA, xmm1);
@@ -99,7 +99,7 @@ gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  
  
  /* Similar to store, but increments value in memory */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                    double * gmx_restrict ptrB, __m128d xmm1)
  {
@@ -112,7 +112,7 @@ gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
      _mm_store_sd(ptrB, t1);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  {
      __m128d tmp;
@@ -123,7 +123,7 @@ gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                               const double * gmx_restrict p2,
                               __m128d * gmx_restrict      c6,
@@ -137,7 +137,7 @@ gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
      *c12 = _mm_unpackhi_pd(t1, t2);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                               __m128d * gmx_restrict      c6,
                               __m128d * gmx_restrict      c12)
@@ -148,7 +148,7 @@ gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                           const double * gmx_restrict xyz,
                                           __m128d * gmx_restrict      x1,
@@ -171,7 +171,7 @@ gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                           const double * gmx_restrict xyz,
                                           __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
@@ -209,7 +209,7 @@ gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                           const double * gmx_restrict xyz,
                                           __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
@@ -255,7 +255,7 @@ gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
  {
@@ -264,7 +264,7 @@ gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
      *z            = _mm_load_sd(p1+2);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -281,7 +281,7 @@ gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
      *z3            = _mm_load_sd(p1+8);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -303,7 +303,7 @@ gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                    const double * gmx_restrict ptrB,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
@@ -320,7 +320,7 @@ gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -353,7 +353,7 @@ gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                    __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                    __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
@@ -395,7 +395,7 @@ gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double
  
  
  /* Routines to decrement rvec in memory, typically use for j particle force updates */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
                                           __m128d xy, __m128d z)
  {
@@ -412,7 +412,7 @@ gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1)
  {
@@ -431,34 +431,7 @@ gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
  }
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_load_sd(ptrA+8); \
-        _x1          = _mm_unpacklo_pd(_x1, _y1); \
-        _z1          = _mm_unpacklo_pd(_z1, _x2); \
-        _y2          = _mm_unpacklo_pd(_y2, _z2); \
-        _x3          = _mm_unpacklo_pd(_x3, _y3); \
-        _t1          = _mm_sub_pd(_t1, _x1); \
-        _t2          = _mm_sub_pd(_t2, _z1); \
-        _t3          = _mm_sub_pd(_t3, _y2); \
-        _t4          = _mm_sub_pd(_t4, _x3); \
-        _t5          = _mm_sub_sd(_t5, _z3); \
-        _mm_storeu_pd(ptrA, _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_store_sd(ptrA+8, _t5); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -489,36 +462,9 @@ gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+6, t4);
      _mm_store_sd(ptrA+8, t5);
  }
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_loadu_pd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrA+10); \
-        _x1          = _mm_unpacklo_pd(_x1, _y1); \
-        _z1          = _mm_unpacklo_pd(_z1, _x2); \
-        _y2          = _mm_unpacklo_pd(_y2, _z2); \
-        _x3          = _mm_unpacklo_pd(_x3, _y3); \
-        _z3          = _mm_unpacklo_pd(_z3, _x4); \
-        _y4          = _mm_unpacklo_pd(_y4, _z4); \
-        _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1, _x1 )); \
-        _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2, _z1 )); \
-        _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3, _y2 )); \
-        _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4, _x3 )); \
-        _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5, _z3 )); \
-        _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -548,10 +494,9 @@ gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
      _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
      _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
  }
-#endif
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1)
  {
@@ -578,55 +523,8 @@ gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_store_sd(ptrB+2, t4);
  }
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
-        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_load_sd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrB); \
-        _t7          = _mm_loadu_pd(ptrB+2); \
-        _t8          = _mm_loadu_pd(ptrB+4); \
-        _t9          = _mm_loadu_pd(ptrB+6); \
-        _t10         = _mm_load_sd(ptrB+8); \
-        _tA          = _mm_unpacklo_pd(_x1, _y1); \
-        _tB          = _mm_unpackhi_pd(_x1, _y1); \
-        _tC          = _mm_unpacklo_pd(_z1, _x2); \
-        _tD          = _mm_unpackhi_pd(_z1, _x2); \
-        _tE          = _mm_unpacklo_pd(_y2, _z2); \
-        _tF          = _mm_unpackhi_pd(_y2, _z2); \
-        _tG          = _mm_unpacklo_pd(_x3, _y3); \
-        _tH          = _mm_unpackhi_pd(_x3, _y3); \
-        _tI          = _mm_unpackhi_pd(_z3, _z3); \
-        _t1          = _mm_sub_pd(_t1, _tA); \
-        _t2          = _mm_sub_pd(_t2, _tC); \
-        _t3          = _mm_sub_pd(_t3, _tE); \
-        _t4          = _mm_sub_pd(_t4, _tG); \
-        _t5          = _mm_sub_sd(_t5, _z3); \
-        _t6          = _mm_sub_pd(_t6, _tB); \
-        _t7          = _mm_sub_pd(_t7, _tD); \
-        _t8          = _mm_sub_pd(_t8, _tF); \
-        _t9          = _mm_sub_pd(_t9, _tH); \
-        _t10         = _mm_sub_sd(_t10, _tI); \
-        _mm_storeu_pd(ptrA, _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_store_sd(ptrA+8, _t5); \
-        _mm_storeu_pd(ptrB, _t6); \
-        _mm_storeu_pd(ptrB+2, _t7); \
-        _mm_storeu_pd(ptrB+4, _t8); \
-        _mm_storeu_pd(ptrB+6, _t9); \
-        _mm_store_sd(ptrB+8, _t10); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -679,67 +577,9 @@ gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_storeu_pd(ptrB+6, t9);
      _mm_store_sd(ptrB+8, t10);
  }
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
-        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
-        _t1          = _mm_loadu_pd(ptrA); \
-        _t2          = _mm_loadu_pd(ptrA+2); \
-        _t3          = _mm_loadu_pd(ptrA+4); \
-        _t4          = _mm_loadu_pd(ptrA+6); \
-        _t5          = _mm_loadu_pd(ptrA+8); \
-        _t6          = _mm_loadu_pd(ptrA+10); \
-        _t7          = _mm_loadu_pd(ptrB); \
-        _t8          = _mm_loadu_pd(ptrB+2); \
-        _t9          = _mm_loadu_pd(ptrB+4); \
-        _t10         = _mm_loadu_pd(ptrB+6); \
-        _t11         = _mm_loadu_pd(ptrB+8); \
-        _t12         = _mm_loadu_pd(ptrB+10); \
-        _tA          = _mm_unpacklo_pd(_x1, _y1); \
-        _tB          = _mm_unpackhi_pd(_x1, _y1); \
-        _tC          = _mm_unpacklo_pd(_z1, _x2); \
-        _tD          = _mm_unpackhi_pd(_z1, _x2); \
-        _tE          = _mm_unpacklo_pd(_y2, _z2); \
-        _tF          = _mm_unpackhi_pd(_y2, _z2); \
-        _tG          = _mm_unpacklo_pd(_x3, _y3); \
-        _tH          = _mm_unpackhi_pd(_x3, _y3); \
-        _tI          = _mm_unpacklo_pd(_z3, _x4); \
-        _tJ          = _mm_unpackhi_pd(_z3, _x4); \
-        _tK          = _mm_unpacklo_pd(_y4, _z4); \
-        _tL          = _mm_unpackhi_pd(_y4, _z4); \
-        _t1          = _mm_sub_pd(_t1, _tA); \
-        _t2          = _mm_sub_pd(_t2, _tC); \
-        _t3          = _mm_sub_pd(_t3, _tE); \
-        _t4          = _mm_sub_pd(_t4, _tG); \
-        _t5          = _mm_sub_pd(_t5, _tI); \
-        _t6          = _mm_sub_pd(_t6, _tK); \
-        _t7          = _mm_sub_pd(_t7, _tB); \
-        _t8          = _mm_sub_pd(_t8, _tD); \
-        _t9          = _mm_sub_pd(_t9, _tF); \
-        _t10         = _mm_sub_pd(_t10, _tH); \
-        _t11         = _mm_sub_pd(_t11, _tJ); \
-        _t12         = _mm_sub_pd(_t12, _tL); \
-        _mm_storeu_pd(ptrA,  _t1); \
-        _mm_storeu_pd(ptrA+2, _t2); \
-        _mm_storeu_pd(ptrA+4, _t3); \
-        _mm_storeu_pd(ptrA+6, _t4); \
-        _mm_storeu_pd(ptrA+8, _t5); \
-        _mm_storeu_pd(ptrA+10, _t6); \
-        _mm_storeu_pd(ptrB,  _t7); \
-        _mm_storeu_pd(ptrB+2, _t8); \
-        _mm_storeu_pd(ptrB+4, _t9); \
-        _mm_storeu_pd(ptrB+6, _t10); \
-        _mm_storeu_pd(ptrB+8, _t11); \
-        _mm_storeu_pd(ptrB+10, _t12); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                         __m128d x1, __m128d y1, __m128d z1,
                                         __m128d x2, __m128d y2, __m128d z2,
@@ -802,12 +642,9 @@ gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_
      _mm_storeu_pd(ptrB+8, t11);
      _mm_storeu_pd(ptrB+10, t12);
  }
-#endif
-
  
  
-
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        double * gmx_restrict fptr,
                                        double * gmx_restrict fshiftptr)
@@ -823,34 +660,8 @@ gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
  }
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128d _t1, _t2; \
-        fix1 = _mm_hadd_pd(fix1, fiy1); \
-        fiz1 = _mm_hadd_pd(fiz1, fix2); \
-        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
-        fix3 = _mm_hadd_pd(fix3, fiy3); \
-        fiz3 = _mm_hadd_pd(fiz3, fiz3); \
-        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
-        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
-        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
-        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
-        _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
-        fix1  = _mm_add_pd(fix1, fix3); \
-        _t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
-        fix1  = _mm_add_pd(fix1, _t1); \
-        _t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
-        fiz1  = _mm_add_sd(fiz1, fiz3); \
-        fiz1  = _mm_add_sd(fiz1, _t2); \
-        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
-        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
                                        __m128d fix3, __m128d fiy3, __m128d fiz3,
@@ -882,40 +693,9 @@ gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128d _t1, _t2; \
-        fix1 = _mm_hadd_pd(fix1, fiy1); \
-        fiz1 = _mm_hadd_pd(fiz1, fix2); \
-        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
-        fix3 = _mm_hadd_pd(fix3, fiy3); \
-        fiz3 = _mm_hadd_pd(fiz3, fix4); \
-        fiy4 = _mm_hadd_pd(fiy4, fiz4); \
-        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 )); \
-        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 )); \
-        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 )); \
-        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 )); \
-        _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 )); \
-        _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
-        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
-        fix1 = _mm_add_pd(fix1, _t1); \
-        _t2  = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
-        fix3 = _mm_add_pd(fix3, _t2); \
-        fix1 = _mm_add_pd(fix1, fix3); \
-        fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
-        fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
-        fiz1 = _mm_add_sd(fiz1, fiz3); \
-        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
-        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                        __m128d fix2, __m128d fiy2, __m128d fiz2,
                                        __m128d fix3, __m128d fiy3, __m128d fiz3,
@@ -952,16 +732,16 @@ gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
      _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
      _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
  }
-#endif
  
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
  {
      pot1 = _mm_hadd_pd(pot1, pot1);
      _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                        __m128d pot2, double * gmx_restrict ptrB)
  {
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h

index 8d8c3e671972aa2243af3fd4ee63810930b48cf0..45f2a7b5f250852048835f8d278d10d1326ecf0a 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h
@@ -46,13 +46,13 @@
  /* Normal sum of four xmm registers */
  #define gmx_mm_sum4_ps(t0, t1, t2, t3)  _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
  
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
  {
      return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy) ), _mm_mul_ps(dz, dz) );
  }
  
-static gmx_inline int
+static gmx_inline int gmx_simdcall
  gmx_mm_any_lt(__m128 a, __m128 b)
  {
      return _mm_movemask_ps(_mm_cmplt_ps(a, b));
@@ -60,7 +60,7 @@ gmx_mm_any_lt(__m128 a, __m128 b)
  
  /* Load a single value from 1-4 places, merge into xmm register */
  
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                               const float * gmx_restrict ptrB,
                               const float * gmx_restrict ptrC,
@@ -73,7 +73,7 @@ gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
      return _mm_unpacklo_ps(t1, t2);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                                float * gmx_restrict ptrB,
                                float * gmx_restrict ptrC,
@@ -92,7 +92,7 @@ gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
  }
  
  /* Similar to store, but increments value in memory */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                    float * gmx_restrict ptrB,
                                    float * gmx_restrict ptrC,
@@ -106,7 +106,7 @@ gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                               const float * gmx_restrict p2,
                               const float * gmx_restrict p3,
@@ -127,7 +127,7 @@ gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                           const float * gmx_restrict xyz,
                                           __m128 * gmx_restrict      x1,
@@ -149,7 +149,7 @@ gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                           const float * gmx_restrict xyz,
                                           __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
@@ -187,7 +187,7 @@ gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                           const float * gmx_restrict xyz,
                                           __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
@@ -229,7 +229,7 @@ gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                    const float * gmx_restrict ptrB,
                                    const float * gmx_restrict ptrC,
@@ -257,7 +257,7 @@ gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                    const float * gmx_restrict ptrB,
                                    const float * gmx_restrict ptrC,
@@ -295,7 +295,7 @@ gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
  }
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                    const float * gmx_restrict ptrB,
                                    const float * gmx_restrict ptrC,
@@ -337,7 +337,7 @@ gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
  
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * ptrA,
                                         float * ptrB,
                                         float * ptrC,
@@ -375,73 +375,7 @@ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * ptrA,
  
  
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
-                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
-    { \
-        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
-        __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
-        __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
-        _t13         = _mm_unpackhi_ps(_x1, _y1); \
-        _x1          = _mm_unpacklo_ps(_x1, _y1); \
-        _t14         = _mm_unpackhi_ps(_z1, _x2); \
-        _z1          = _mm_unpacklo_ps(_z1, _x2); \
-        _t15         = _mm_unpackhi_ps(_y2, _z2); \
-        _y2          = _mm_unpacklo_ps(_y2, _z2); \
-        _t16         = _mm_unpackhi_ps(_x3, _y3); \
-        _x3          = _mm_unpacklo_ps(_x3, _y3); \
-        _t17         = _mm_shuffle_ps(_z3, _z3, _MM_SHUFFLE(0, 0, 0, 1)); \
-        _t18         = _mm_movehl_ps(_z3, _z3); \
-        _t19         = _mm_shuffle_ps(_t18, _t18, _MM_SHUFFLE(0, 0, 0, 1)); \
-        _t20         = _mm_movelh_ps(_x1, _z1); \
-        _t21         = _mm_movehl_ps(_z1, _x1); \
-        _t22         = _mm_movelh_ps(_t13, _t14); \
-        _t14         = _mm_movehl_ps(_t14, _t13); \
-        _t23         = _mm_movelh_ps(_y2, _x3); \
-        _t24         = _mm_movehl_ps(_x3, _y2); \
-        _t25         = _mm_movelh_ps(_t15, _t16); \
-        _t16         = _mm_movehl_ps(_t16, _t15); \
-        _t1          = _mm_loadu_ps(ptrA); \
-        _t2          = _mm_loadu_ps(ptrA+4); \
-        _t3          = _mm_load_ss(ptrA+8); \
-        _t1          = _mm_sub_ps(_t1, _t20); \
-        _t2          = _mm_sub_ps(_t2, _t23); \
-        _t3          = _mm_sub_ss(_t3, _z3); \
-        _mm_storeu_ps(ptrA, _t1); \
-        _mm_storeu_ps(ptrA+4, _t2); \
-        _mm_store_ss(ptrA+8, _t3); \
-        _t4          = _mm_loadu_ps(ptrB); \
-        _t5          = _mm_loadu_ps(ptrB+4); \
-        _t6          = _mm_load_ss(ptrB+8); \
-        _t4          = _mm_sub_ps(_t4, _t21); \
-        _t5          = _mm_sub_ps(_t5, _t24); \
-        _t6          = _mm_sub_ss(_t6, _t17); \
-        _mm_storeu_ps(ptrB, _t4); \
-        _mm_storeu_ps(ptrB+4, _t5); \
-        _mm_store_ss(ptrB+8, _t6); \
-        _t7          = _mm_loadu_ps(ptrC); \
-        _t8          = _mm_loadu_ps(ptrC+4); \
-        _t9          = _mm_load_ss(ptrC+8); \
-        _t7          = _mm_sub_ps(_t7, _t22); \
-        _t8          = _mm_sub_ps(_t8, _t25); \
-        _t9          = _mm_sub_ss(_t9, _t18); \
-        _mm_storeu_ps(ptrC, _t7); \
-        _mm_storeu_ps(ptrC+4, _t8); \
-        _mm_store_ss(ptrC+8, _t9); \
-        _t10         = _mm_loadu_ps(ptrD); \
-        _t11         = _mm_loadu_ps(ptrD+4); \
-        _t12         = _mm_load_ss(ptrD+8); \
-        _t10         = _mm_sub_ps(_t10, _t14); \
-        _t11         = _mm_sub_ps(_t11, _t16); \
-        _t12         = _mm_sub_ss(_t12, _t19); \
-        _mm_storeu_ps(ptrD, _t10); \
-        _mm_storeu_ps(ptrD+4, _t11); \
-        _mm_store_ss(ptrD+8, _t12); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                         __m128 x1, __m128 y1, __m128 z1,
@@ -509,80 +443,9 @@ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4, t11);
      _mm_store_ss(ptrD+8, t12);
  }
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
-                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
-    { \
-        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
-        __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
-        __m128 _t23, _t24; \
-        _t13         = _mm_unpackhi_ps(_x1, _y1); \
-        _x1          = _mm_unpacklo_ps(_x1, _y1); \
-        _t14         = _mm_unpackhi_ps(_z1, _x2); \
-        _z1          = _mm_unpacklo_ps(_z1, _x2); \
-        _t15         = _mm_unpackhi_ps(_y2, _z2); \
-        _y2          = _mm_unpacklo_ps(_y2, _z2); \
-        _t16         = _mm_unpackhi_ps(_x3, _y3); \
-        _x3          = _mm_unpacklo_ps(_x3, _y3); \
-        _t17         = _mm_unpackhi_ps(_z3, _x4); \
-        _z3          = _mm_unpacklo_ps(_z3, _x4); \
-        _t18         = _mm_unpackhi_ps(_y4, _z4); \
-        _y4          = _mm_unpacklo_ps(_y4, _z4); \
-        _t19         = _mm_movelh_ps(_x1, _z1); \
-        _z1          = _mm_movehl_ps(_z1, _x1); \
-        _t20         = _mm_movelh_ps(_t13, _t14); \
-        _t14         = _mm_movehl_ps(_t14, _t13); \
-        _t21         = _mm_movelh_ps(_y2, _x3); \
-        _x3          = _mm_movehl_ps(_x3, _y2); \
-        _t22         = _mm_movelh_ps(_t15, _t16); \
-        _t16         = _mm_movehl_ps(_t16, _t15); \
-        _t23         = _mm_movelh_ps(_z3, _y4); \
-        _y4          = _mm_movehl_ps(_y4, _z3); \
-        _t24         = _mm_movelh_ps(_t17, _t18); \
-        _t18         = _mm_movehl_ps(_t18, _t17); \
-        _t1          = _mm_loadu_ps(ptrA); \
-        _t2          = _mm_loadu_ps(ptrA+4); \
-        _t3          = _mm_loadu_ps(ptrA+8); \
-        _t1          = _mm_sub_ps(_t1, _t19); \
-        _t2          = _mm_sub_ps(_t2, _t21); \
-        _t3          = _mm_sub_ps(_t3, _t23); \
-        _mm_storeu_ps(ptrA, _t1); \
-        _mm_storeu_ps(ptrA+4, _t2); \
-        _mm_storeu_ps(ptrA+8, _t3); \
-        _t4          = _mm_loadu_ps(ptrB); \
-        _t5          = _mm_loadu_ps(ptrB+4); \
-        _t6          = _mm_loadu_ps(ptrB+8); \
-        _t4          = _mm_sub_ps(_t4, _z1); \
-        _t5          = _mm_sub_ps(_t5, _x3); \
-        _t6          = _mm_sub_ps(_t6, _y4); \
-        _mm_storeu_ps(ptrB, _t4); \
-        _mm_storeu_ps(ptrB+4, _t5); \
-        _mm_storeu_ps(ptrB+8, _t6); \
-        _t7          = _mm_loadu_ps(ptrC); \
-        _t8          = _mm_loadu_ps(ptrC+4); \
-        _t9          = _mm_loadu_ps(ptrC+8); \
-        _t7          = _mm_sub_ps(_t7, _t20); \
-        _t8          = _mm_sub_ps(_t8, _t22); \
-        _t9          = _mm_sub_ps(_t9, _t24); \
-        _mm_storeu_ps(ptrC, _t7); \
-        _mm_storeu_ps(ptrC+4, _t8); \
-        _mm_storeu_ps(ptrC+8, _t9); \
-        _t10         = _mm_loadu_ps(ptrD); \
-        _t11         = _mm_loadu_ps(ptrD+4); \
-        _t12         = _mm_loadu_ps(ptrD+8); \
-        _t10         = _mm_sub_ps(_t10, _t14); \
-        _t11         = _mm_sub_ps(_t11, _t16); \
-        _t12         = _mm_sub_ps(_t12, _t18); \
-        _mm_storeu_ps(ptrD, _t10); \
-        _mm_storeu_ps(ptrD+4, _t11); \
-        _mm_storeu_ps(ptrD+8, _t12); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
  gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                         float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                         __m128 x1, __m128 y1, __m128 z1,
@@ -654,10 +517,9 @@ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_re
      _mm_storeu_ps(ptrD+4, t11);
      _mm_storeu_ps(ptrD+8, t12);
  }
-#endif
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        float * gmx_restrict fptr,
                                        float * gmx_restrict fshiftptr)
@@ -683,39 +545,8 @@ gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
  }
  
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128 _t1, _t2, _t3, _t4; \
-\
-        fix1 = _mm_hadd_ps(fix1, fiy1); \
-        fiz1 = _mm_hadd_ps(fiz1, fix2); \
-        fiy2 = _mm_hadd_ps(fiy2, fiz2); \
-        fix3 = _mm_hadd_ps(fix3, fiy3); \
-        fiz3 = _mm_hadd_ps(fiz3, fiz3); \
-        fix1 = _mm_hadd_ps(fix1, fiz1); \
-        fiy2 = _mm_hadd_ps(fiy2, fix3); \
-        fiz3 = _mm_hadd_ps(fiz3, fiz3); \
-        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
-        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
-        _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
-        _t4 = _mm_load_ss(fshiftptr+2); \
-        _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
-        _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
-        _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
-        _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
-        _t3 = _mm_shuffle_ps(_t3, _t3, _MM_SHUFFLE(1, 2, 0, 0)); \
-        _t1 = _mm_add_ps(_t1, _t2); \
-        _t3 = _mm_add_ps(_t3, _t4); \
-        _t1 = _mm_add_ps(_t1, _t3); \
-        _mm_store_ss(fshiftptr+2, _t1); \
-        _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
                                        __m128 fix3, __m128 fiy3, __m128 fiz3,
@@ -753,44 +584,9 @@ gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_store_ss(fshiftptr+2, t1);
      _mm_storeh_pi((__m64 *)(fshiftptr), t1);
  }
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
-                                              fptr, fshiftptr) \
-    { \
-        __m128 _t1, _t2, _t3, _t4, _t5; \
-\
-        fix1 = _mm_hadd_ps(fix1, fiy1); \
-        fiz1 = _mm_hadd_ps(fiz1, fix2); \
-        fiy2 = _mm_hadd_ps(fiy2, fiz2); \
-        fix3 = _mm_hadd_ps(fix3, fiy3); \
-        fiz3 = _mm_hadd_ps(fiz3, fix4); \
-        fiy4 = _mm_hadd_ps(fiy4, fiz4); \
-        fix1 = _mm_hadd_ps(fix1, fiz1); \
-        fiy2 = _mm_hadd_ps(fiy2, fix3); \
-        fiz3 = _mm_hadd_ps(fiz3, fiy4); \
-        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
-        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
-        _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
-        _t5 = _mm_load_ss(fshiftptr+2); \
-        _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
-        _t1 = _mm_shuffle_ps(fix1, fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
-        _t2 = _mm_shuffle_ps(fiy2, fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
-        _t3 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
-        _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
-        _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
-        _t1 = _mm_add_ps(_t1, _t2); \
-        _t3 = _mm_add_ps(_t3, _t4); \
-        _t1 = _mm_add_ps(_t1, _t3); \
-        _t5 = _mm_add_ps(_t5, _t1); \
-        _mm_store_ss(fshiftptr+2, _t5); \
-        _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
-    }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
  gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                        __m128 fix2, __m128 fiy2, __m128 fiz2,
                                        __m128 fix3, __m128 fiy3, __m128 fiz3,
@@ -832,10 +628,9 @@ gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
      _mm_store_ss(fshiftptr+2, t5);
      _mm_storeh_pi((__m64 *)(fshiftptr), t5);
  }
-#endif
  
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
  {
      pot1 = _mm_add_ps(pot1, _mm_movehl_ps(_mm_setzero_ps(), pot1));
@@ -843,7 +638,7 @@ gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
      _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                        __m128 pot2, float * gmx_restrict ptrB)
  {
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h

index b1d08c1ae6bed003a70c89e9433e34eb617d29d1..c90e4bb8030211fbdc8ca45a4cd15787029c0158 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h
@@ -49,7 +49,7 @@ typedef gmx_simd_int32_t gmx_exclfilter;
  static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
  
  /* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
                                 __m128 *out0, __m128 *out1)
  {
@@ -62,7 +62,7 @@ gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
  }
  
  /* Collect element 2 of the 4 inputs to out */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_shuffle_4_ps_fil2_to_1_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3)
  {
      __m128 _c01, _c23;
@@ -74,7 +74,7 @@ gmx_shuffle_4_ps_fil2_to_1_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3)
  }
  
  /* Sum the elements within each input register and store the sums in out */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_mm_transpose_sum4_pr(__m128 in0, __m128 in1,
                           __m128 in2, __m128 in3)
  {
@@ -115,7 +115,7 @@ load_lj_pair_params(const real *nbfp, const int *type, int aj,
   * prepare_table_load_buffer(), but it is only used with full-width
   * AVX_256. */
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *ti,
               __m128 *ctab0_S, __m128 *ctab1_S)
  {
@@ -138,7 +138,7 @@ load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *t
      gmx_shuffle_4_ps_fil01_to_2_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3], ctab0_S, ctab1_S);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *ti,
                 __m128 *ctab0_S, __m128 *ctab1_S, __m128 *ctabv_S)
  {
@@ -163,19 +163,19 @@ load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused
      *ctabv_S = gmx_shuffle_4_ps_fil2_to_1_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3]);
  }
  
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
  gmx_load1_exclfilter(int e)
  {
      return _mm_set1_epi32(e);
  }
  
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
  gmx_load_exclusion_filter(const unsigned *i)
  {
      return gmx_simd_load_i(i);
  }
  
-static gmx_inline gmx_simd_bool_t
+static gmx_inline gmx_simd_bool_t gmx_simdcall
  gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
  {
      return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h

index 9a9aba1f95d57b2626d8db63136ba3f4940af1a9..51dd883ed2c10a46fb5f7dce8c5c9d91060959fb 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h
@@ -63,7 +63,7 @@
  #define gmx_sub_hpr                  _mm_sub_ps
  
  /* Sum over 4 half SIMD registers */
-static __m128 gmx_sum4_hpr(__m256 x, __m256 y)
+static __m128 gmx_simdcall gmx_sum4_hpr(__m256 x, __m256 y)
  {
      __m256 sum;
  
@@ -80,7 +80,7 @@ gmx_loaddh_pr(gmx_simd_real_t *a, const real *b)
      *a  = _mm256_insertf128_ps(_mm256_castps128_ps256(tmp), tmp, 0x1);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
  {
      *b = _mm256_extractf128_ps(a, 0);
@@ -88,7 +88,7 @@ gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
  }
  
  /* Store half width SIMD registers a and b in full width register *c */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_real_t *c)
  {
      *c = _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 0x1);
@@ -97,7 +97,7 @@ gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_real_t *c)
  #endif /* GMX_NBNXN_SIMD_2XNN */
  
  /* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
                                 __m128 *out0, __m128 *out1)
  {
@@ -110,7 +110,7 @@ gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
  }
  
  /* Collect element 2 of the 4 inputs to out */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_shuffle_4_ps_fil2_to_1_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3)
  {
      __m128 _c01, _c23;
@@ -122,7 +122,7 @@ gmx_shuffle_4_ps_fil2_to_1_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3)
  }
  
  /* Sum the elements within each input register and return the sums */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_mm_transpose_sum4_pr(__m256 in0, __m256 in1,
                           __m256 in2, __m256 in3)
  {
@@ -135,7 +135,7 @@ gmx_mm_transpose_sum4_pr(__m256 in0, __m256 in1,
  }
  
  /* Sum the elements of halfs of each input register and return the sums */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_mm_transpose_sum4h_pr(__m256 in0, __m256 in2)
  {
      in0 = _mm256_hadd_ps(in0, _mm256_setzero_ps());
@@ -147,7 +147,7 @@ gmx_mm_transpose_sum4h_pr(__m256 in0, __m256 in2)
  }
  
  /* Put two 128-bit 4-float registers into one 256-bit 8-float register */
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_2_mm_to_m256(__m128 in0, __m128 in1)
  {
      return _mm256_insertf128_ps(_mm256_castps128_ps256(in0), in1, 1);
@@ -221,7 +221,7 @@ load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
   * prepare_table_load_buffer(), but it is only used with full-width
   * AVX_256. */
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
               __m256 *ctab0_S, __m256 *ctab1_S)
  {
@@ -243,7 +243,7 @@ load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
      *ctab1_S = gmx_2_mm_to_m256(ctabt_S[2], ctabt_S[3]);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
                 __m256 *ctab0_S, __m256 *ctab1_S, __m256 *ctabv_S)
  {
@@ -277,19 +277,19 @@ load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
  typedef gmx_simd_int32_t gmx_exclfilter;
  static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
  
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
  gmx_load1_exclfilter(int e)
  {
      return _mm256_set1_epi32(e);
  }
  
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
  gmx_load_exclusion_filter(const unsigned *i)
  {
      return gmx_simd_load_i(i);
  }
  
-static gmx_inline gmx_simd_bool_t
+static gmx_inline gmx_simd_bool_t gmx_simdcall
  gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
  {
      return _mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_andnot_si256(m0, m1), _mm256_setzero_si256()));
@@ -301,19 +301,19 @@ gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
  typedef gmx_simd_real_t gmx_exclfilter;
  static const int filter_stride = 1;
  
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
  gmx_load1_exclfilter(int e)
  {
      return _mm256_castsi256_ps(_mm256_set1_epi32(e));
  }
  
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
  gmx_load_exclusion_filter(const unsigned *i)
  {
      return gmx_simd_load_r((real *) (i));
  }
  
-static gmx_inline gmx_simd_bool_t
+static gmx_inline gmx_simd_bool_t gmx_simdcall
  gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
  {
      return _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c);
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h

index 01d8090765b2c4eb0d01772b05310f1b21c8b81f..698c5d389c6827a21184395ba5e1cd3fcd86a92d 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h
@@ -52,7 +52,7 @@
  
  #include "../nbnxn_kernel_simd_utils.h"
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_load_simd_2xnn_interactions(int                  excl,
                                  gmx_exclfilter       filter_S0,
                                  gmx_exclfilter       filter_S2,
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h

index 4f9626f4bafdc7274a04a88b255f4947f755db33..1940d1a3da59269551818bb711c35331bb4ceb3e 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h
@@ -56,7 +56,7 @@
  
  #include "../nbnxn_kernel_simd_utils.h"
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_load_simd_4xn_interactions(int gmx_unused             excl,
                                 gmx_exclfilter gmx_unused  filter_S0,
                                 gmx_exclfilter gmx_unused  filter_S1,
diff --git a/src/gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h b/src/gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h

index ec2e858c113f4bd630112087d83e09ff719045a9..63bce3c895f676d31b270d7ff96d2de699911ee7 100644 (file)
--- a/src/gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h
+++ b/src/gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h
@@ -238,13 +238,13 @@ gmx_simd_fneg_ibm_qpx(vector4double a)
  /****************************************************
   * IMPLEMENTATION HELPER FUNCTIONS                  *
   ****************************************************/
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
  gmx_simd_setzero_ibm_qpx(void)
  {
      return vec_splats(0.0);
  }
  
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
  gmx_simd_get_exponent_ibm_qpx(vector4double x)
  {
      const gmx_int64_t    expmask   = 0x7ff0000000000000LL;
@@ -262,7 +262,7 @@ gmx_simd_get_exponent_ibm_qpx(vector4double x)
      return vec_cfid(vec_ld(0, idata));
  }
  
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
  gmx_simd_get_mantissa_ibm_qpx(vector4double x)
  {
      const gmx_int64_t    exp_and_sign_mask = 0xfff0000000000000LL;
@@ -280,7 +280,7 @@ gmx_simd_get_mantissa_ibm_qpx(vector4double x)
      return vec_ld(0, idata);
  }
  
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
  gmx_simd_set_exponent_ibm_qpx(vector4double x)
  {
      const gmx_int64_t    expbase = 1023;
@@ -301,7 +301,7 @@ gmx_simd_set_exponent_ibm_qpx(vector4double x)
      return vec_ld(0, idata);
  }
  
-static __attribute__((always_inline)) double
+static __attribute__((always_inline)) double gmx_simdcall
  gmx_simd_reduce_ibm_qpx(vector4double x)
  {
      vector4double y = vec_sldw(x, x, 2);
@@ -313,7 +313,7 @@ gmx_simd_reduce_ibm_qpx(vector4double x)
      return vec_extract(y, 0);
  }
  
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
  gmx_simd_set1_int_ibm_qpx(int i)
  {
      int idata[4] __attribute__((aligned(32)));
@@ -325,7 +325,7 @@ gmx_simd_set1_int_ibm_qpx(int i)
  }
  
  /* This works in both single and double */
-static __attribute__((always_inline)) int
+static __attribute__((always_inline)) int gmx_simdcall
  gmx_simd_anytrue_bool_ibm_qpx(vector4double a)
  {
      vector4double b = vec_sldw(a, a, 2);
@@ -449,7 +449,7 @@ gmx_simd_anytrue_bool_ibm_qpx(vector4double a)
  #define gmx_simd4_blendv_d               gmx_simd_blendv_d
  #define gmx_simd4_reduce_d               gmx_simd_reduce_d
  
-static __attribute__((always_inline)) double
+static __attribute__((always_inline)) double gmx_simdcall
  gmx_simd4_dotproduct3_d_ibm_qpx(vector4double a, vector4double b)
  {
      vector4double dp_sh0 = vec_mul(a, b);
@@ -460,7 +460,7 @@ gmx_simd4_dotproduct3_d_ibm_qpx(vector4double a, vector4double b)
      return vec_extract(dp, 0);
  }
  
-static __attribute__((always_inline)) float
+static __attribute__((always_inline)) float gmx_simdcall
  gmx_simd4_dotproduct3_f_ibm_qpx(vector4double a, vector4double b)
  {
      return (float)gmx_simd4_dotproduct3_d_ibm_qpx(a, b);
diff --git a/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h b/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h

index 399f20a7bb58992c700e58ec9ead85dfe71e1199..8c62c3a43b8e070a9d81f68376ce218a1ee52f44 100644 (file)
--- a/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h
+++ b/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h
@@ -351,13 +351,13 @@
  #define mask_hih _mm512_int2mask(0xFF00)
  
  /* load store float */
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
  gmx_simd_loadu_f_mic(const float * m)
  {
      return _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m+16);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_storeu_f_mic(float * m, __m512 s)
  {
      _mm512_packstorelo_ps(m, s);
@@ -365,13 +365,13 @@ gmx_simd_storeu_f_mic(float * m, __m512 s)
  }
  
  /* load store fint32 */
-static gmx_inline __m512i
+static gmx_inline __m512i gmx_simdcall
  gmx_simd_loadu_fi_mic(const gmx_int32_t * m)
  {
      return _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m+16);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_storeu_fi_mic(gmx_int32_t * m, __m512i s)
  {
      _mm512_packstorelo_epi32(m, s);
@@ -379,13 +379,13 @@ gmx_simd_storeu_fi_mic(gmx_int32_t * m, __m512i s)
  }
  
  /* load store double */
-static gmx_inline __m512d
+static gmx_inline __m512d gmx_simdcall
  gmx_simd_loadu_d_mic(const double * m)
  {
      return _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m+8);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_storeu_d_mic(double * m, __m512d s)
  {
      _mm512_packstorelo_pd(m, s);
@@ -393,13 +393,13 @@ gmx_simd_storeu_d_mic(double * m, __m512d s)
  }
  
  /* load store dint32 */
-static gmx_inline __m512i
+static gmx_inline __m512i gmx_simdcall
  gmx_simd_loadu_di_mic(const gmx_int32_t * m)
  {
      return _mm512_mask_loadunpackhi_epi32(_mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), mask_loh, m), mask_loh, m+16);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_storeu_di_mic(gmx_int32_t * m, __m512i s)
  {
      _mm512_mask_packstorelo_epi32(m, mask_loh, s);
@@ -407,26 +407,26 @@ gmx_simd_storeu_di_mic(gmx_int32_t * m, __m512i s)
  }
  
  /* load store simd4 */
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
  gmx_simd4_loadu_f_mic(const float * m)
  {
      return _mm512_mask_loadunpackhi_ps(_mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), gmx_simd4_mask, m), gmx_simd4_mask, m+16);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd4_storeu_f_mic(float * m, __m512 s)
  {
      _mm512_mask_packstorelo_ps(m, gmx_simd4_mask, s);
      _mm512_mask_packstorehi_ps(m+16, gmx_simd4_mask, s);
  }
  
-static gmx_inline __m512d
+static gmx_inline __m512d gmx_simdcall
  gmx_simd4_loadu_d_mic(const double * m)
  {
      return _mm512_mask_loadunpackhi_pd(_mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), gmx_simd4_mask, m), gmx_simd4_mask, m+8);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd4_storeu_d_mic(double * m, __m512d s)
  {
      _mm512_mask_packstorelo_pd(m, gmx_simd4_mask, s);
@@ -434,7 +434,7 @@ gmx_simd4_storeu_d_mic(double * m, __m512d s)
  }
  
  /* extract */
-static gmx_inline gmx_int32_t
+static gmx_inline gmx_int32_t gmx_simdcall
  gmx_simd_extract_fi_mic(gmx_simd_fint32_t a, int index)
  {
      int r;
@@ -442,7 +442,7 @@ gmx_simd_extract_fi_mic(gmx_simd_fint32_t a, int index)
      return r;
  }
  
-static gmx_inline gmx_int32_t
+static gmx_inline gmx_int32_t gmx_simdcall
  gmx_simd_extract_di_mic(gmx_simd_dint32_t a, int index)
  {
      int r;
@@ -453,7 +453,7 @@ gmx_simd_extract_di_mic(gmx_simd_dint32_t a, int index)
  /* This is likely faster than the built in scale operation (lat 8, t-put 3)
   * since we only work on the integer part and use shifts. TODO: check. given that scale also only does integer
   */
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
  gmx_simd_set_exponent_f_mic(__m512 a)
  {
      __m512i       iexp         = gmx_simd_cvt_f2i(a);
@@ -467,7 +467,7 @@ gmx_simd_set_exponent_f_mic(__m512 a)
       */
  }
  
-static gmx_inline __m512d
+static gmx_inline __m512d gmx_simdcall
  gmx_simd_set_exponent_d_mic(__m512d a)
  {
      const __m512i expbias      = _mm512_set1_epi32(1023);
@@ -477,7 +477,7 @@ gmx_simd_set_exponent_d_mic(__m512d a)
      return _mm512_castsi512_pd(iexp);
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_cvt_f2dd_mic(__m512 f, __m512d * d0, __m512d * d1)
  {
      __m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f), _MM_PERM_CDCD);
@@ -486,7 +486,7 @@ gmx_simd_cvt_f2dd_mic(__m512 f, __m512d * d0, __m512d * d1)
      *d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
  }
  
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
  gmx_simd_cvt_dd2f_mic(__m512d d0, __m512d d1)
  {
      __m512 f0 = _mm512_cvtpd_pslo(d0);
@@ -494,13 +494,13 @@ gmx_simd_cvt_dd2f_mic(__m512d d0, __m512d d1)
      return _mm512_mask_permute4f128_ps(f0, mask_hih, f1, PERM_LOW2HIGH);
  }
  
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
  gmx_simd_exp2_f_mic(__m512 x)
  {
      return _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(x, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
  }
  
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
  gmx_simd_exp_f_mic(__m512 x)
  {
      /* only 59ulp accuracy so we need to do extra an iteration
@@ -511,7 +511,7 @@ gmx_simd_exp_f_mic(__m512 x)
      return _mm512_mask_fmadd_ps(r, m, t, r);
  }
  
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
  gmx_simd_log_f_mic(__m512 x)
  {
      return _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764), _mm512_log2ae23_ps(x));
diff --git a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256.h b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256.h

index 1b995e062100a33021f2cdfef5df49b53733aff7..a05e12ebbb1b26a3c4ad0782f7b07a3ba5f7a569 100644 (file)
--- a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256.h
+++ b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256.h
@@ -130,7 +130,7 @@
  /*********************************************************
   * SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
   *********************************************************/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_get_exponent_f_avx2_256(gmx_simd_float_t x)
  {
      const __m256  expmask      = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
@@ -142,7 +142,7 @@ gmx_simd_get_exponent_f_avx2_256(gmx_simd_float_t x)
      return _mm256_cvtepi32_ps(iexp);
  }
  
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_set_exponent_f_avx2_256(gmx_simd_float_t x)
  {
      const __m256i  expbias      = _mm256_set1_epi32(127);
@@ -155,7 +155,7 @@ gmx_simd_set_exponent_f_avx2_256(gmx_simd_float_t x)
  /*********************************************************
   * SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
   *********************************************************/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_get_exponent_d_avx2_256(gmx_simd_double_t x)
  {
      const __m256d  expmask      = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FF0000000000000LL));
@@ -172,7 +172,7 @@ gmx_simd_get_exponent_d_avx2_256(gmx_simd_double_t x)
      return _mm256_cvtepi32_pd(iexp128);
  }
  
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_set_exponent_d_avx2_256(gmx_simd_double_t x)
  {
      const __m256i  expbias      = _mm256_set1_epi64x(1023LL);
@@ -182,7 +182,7 @@ gmx_simd_set_exponent_d_avx2_256(gmx_simd_double_t x)
      return _mm256_castsi256_pd(iexp);
  }
  
-static gmx_inline gmx_simd_dibool_t
+static gmx_inline gmx_simd_dibool_t gmx_simdcall
  gmx_simd_cvt_db2dib_avx2_256(gmx_simd_dbool_t a)
  {
      __m128i ia = _mm256_castsi256_si128(_mm256_castpd_si256(a));
@@ -193,7 +193,7 @@ gmx_simd_cvt_db2dib_avx2_256(gmx_simd_dbool_t a)
      return ia;
  }
  
-static gmx_inline gmx_simd_dbool_t
+static gmx_inline gmx_simd_dbool_t gmx_simdcall
  gmx_simd_cvt_dib2db_avx2_256(gmx_simd_dibool_t ia)
  {
      __m128d lo = _mm_castsi128_pd(_mm_unpacklo_epi32(ia, ia));
diff --git a/src/gromacs/simd/impl_x86_avx_128_fma/impl_x86_avx_128_fma.h b/src/gromacs/simd/impl_x86_avx_128_fma/impl_x86_avx_128_fma.h

index 7d66d007fe99d645b79d74c2eb4bfa6708330c7c..823bfe81b996ef2aaefbfce38b1791c5b8ed6785 100644 (file)
--- a/src/gromacs/simd/impl_x86_avx_128_fma/impl_x86_avx_128_fma.h
+++ b/src/gromacs/simd/impl_x86_avx_128_fma/impl_x86_avx_128_fma.h
@@ -127,7 +127,7 @@
  #define gmx_simd4_cvt_f2d                _mm256_cvtps_pd
  #define gmx_simd4_cvt_d2f                _mm256_cvtpd_ps
  
-static gmx_inline double
+static gmx_inline double gmx_simdcall
  gmx_simd4_reduce_d_avx_128_fma(__m256d a)
  {
      double  f;
@@ -140,7 +140,7 @@ gmx_simd4_reduce_d_avx_128_fma(__m256d a)
      return f;
  }
  
-static gmx_inline double
+static gmx_inline double gmx_simdcall
  gmx_simd4_dotproduct3_d_avx_128_fma(__m256d a, __m256d b)
  {
      double  d;
diff --git a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h

index 1b08ec28ec5a8ddb32e8cf23b40a815fcfd3abd1..d787708b4c0f6e44cd68ed55b727f8190774a02b 100644 (file)
--- a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h
+++ b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h
@@ -343,7 +343,7 @@
  /*********************************************************
   * SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
   *********************************************************/
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_simd_get_exponent_f_avx_256(__m256 x)
  {
      const __m256  expmask      = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
@@ -363,7 +363,7 @@ gmx_simd_get_exponent_f_avx_256(__m256 x)
      return _mm256_cvtepi32_ps(iexp256);
  }
  
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_simd_get_mantissa_f_avx_256(__m256 x)
  {
      const __m256 mantmask   = _mm256_castsi256_ps(_mm256_set1_epi32(0x007FFFFF));
@@ -373,7 +373,7 @@ gmx_simd_get_mantissa_f_avx_256(__m256 x)
      return _mm256_or_ps(x, one);
  }
  
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_simd_set_exponent_f_avx_256(__m256 x)
  {
      const __m128i expbias      = _mm_set1_epi32(127);
@@ -390,7 +390,7 @@ gmx_simd_set_exponent_f_avx_256(__m256 x)
      return _mm256_castsi256_ps(iexp256);
  }
  
-static gmx_inline float
+static gmx_inline float gmx_simdcall
  gmx_simd_reduce_f_avx_256(__m256 a)
  {
      float  f;
@@ -408,7 +408,7 @@ gmx_simd_reduce_f_avx_256(__m256 a)
  /*********************************************************
   * SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
   *********************************************************/
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
  gmx_simd_get_exponent_d_avx_256(__m256d x)
  {
      const __m256d expmask      = _mm256_castsi256_pd( _mm256_set1_epi64x(0x7FF0000000000000LL));
@@ -428,7 +428,7 @@ gmx_simd_get_exponent_d_avx_256(__m256d x)
      return _mm256_cvtepi32_pd(iexp128a);
  }
  
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
  gmx_simd_get_mantissa_d_avx_256(__m256d x)
  {
      const __m256d mantmask   = _mm256_castsi256_pd(_mm256_set1_epi64x(0x000FFFFFFFFFFFFFLL));
@@ -438,7 +438,7 @@ gmx_simd_get_mantissa_d_avx_256(__m256d x)
      return _mm256_or_pd(x, one);
  }
  
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
  gmx_simd_set_exponent_d_avx_256(__m256d x)
  {
      const __m128i expbias      = _mm_set1_epi32(1023);
@@ -453,7 +453,7 @@ gmx_simd_set_exponent_d_avx_256(__m256d x)
      return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(iexp128a), iexp128b, 0x1));
  }
  
-static gmx_inline double
+static gmx_inline double gmx_simdcall
  gmx_simd_reduce_d_avx_256(__m256d a)
  {
      double  f;
@@ -466,7 +466,7 @@ gmx_simd_reduce_d_avx_256(__m256d a)
      return f;
  }
  
-static gmx_inline gmx_simd_dibool_t
+static gmx_inline gmx_simd_dibool_t gmx_simdcall
  gmx_simd_cvt_db2dib_avx_256(gmx_simd_dbool_t a)
  {
      __m128i a1 = _mm256_extractf128_si256(_mm256_castpd_si256(a), 0x1);
@@ -476,7 +476,7 @@ gmx_simd_cvt_db2dib_avx_256(gmx_simd_dbool_t a)
      return _mm_blend_epi16(a0, a1, 0xF0);
  }
  
-static gmx_inline gmx_simd_dbool_t
+static gmx_inline gmx_simd_dbool_t gmx_simdcall
  gmx_simd_cvt_dib2db_avx_256(gmx_simd_dibool_t a)
  {
      __m128i a1 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2));
@@ -484,14 +484,14 @@ gmx_simd_cvt_dib2db_avx_256(gmx_simd_dibool_t a)
      return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(a0), a1, 0x1));
  }
  
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_cvt_f2dd_avx_256(__m256 f, __m256d *d0, __m256d *d1)
  {
      *d0 = _mm256_cvtps_pd(_mm256_castps256_ps128(f));
      *d1 = _mm256_cvtps_pd(_mm256_extractf128_ps(f, 0x1));
  }
  
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
  gmx_simd_cvt_dd2f_avx_256(__m256d d0, __m256d d1)
  {
      __m128 f0 = _mm256_cvtpd_ps(d0);
@@ -500,7 +500,7 @@ gmx_simd_cvt_dd2f_avx_256(__m256d d0, __m256d d1)
  }
  
  /* SIMD4 reduce helper */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
  gmx_simd4_reduce_f_avx_256(__m128 a)
  {
      float f;
@@ -511,7 +511,7 @@ gmx_simd4_reduce_f_avx_256(__m128 a)
  }
  
  /* SIMD4 Dotproduct helper function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
  gmx_simd4_dotproduct3_f_avx_256(__m128 a, __m128 b)
  {
      float  f;
@@ -523,7 +523,7 @@ gmx_simd4_dotproduct3_f_avx_256(__m128 a, __m128 b)
      return f;
  }
  
-static gmx_inline double
+static gmx_inline double gmx_simdcall
  gmx_simd4_dotproduct3_d_avx_256(__m256d a, __m256d b)
  {
      double  d;
diff --git a/src/gromacs/simd/impl_x86_sse2/impl_x86_sse2.h b/src/gromacs/simd/impl_x86_sse2/impl_x86_sse2.h

index 20f68f14dec31330d14c776a0c2d38a826dc4186..5d6b195965b25441f3bc1ca030dbfe6d51dd4b98 100644 (file)
--- a/src/gromacs/simd/impl_x86_sse2/impl_x86_sse2.h
+++ b/src/gromacs/simd/impl_x86_sse2/impl_x86_sse2.h
@@ -249,7 +249,7 @@
  /****************************************************
   * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
   ****************************************************/
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_simd_get_exponent_f_sse2(__m128 x)
  {
      const __m128  expmask      = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
@@ -261,7 +261,7 @@ gmx_simd_get_exponent_f_sse2(__m128 x)
      return _mm_cvtepi32_ps(iexp);
  }
  
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_simd_get_mantissa_f_sse2(__m128 x)
  {
      const __m128 mantmask = _mm_castsi128_ps(_mm_set1_epi32(0x007FFFFF));
@@ -271,7 +271,7 @@ gmx_simd_get_mantissa_f_sse2(__m128 x)
      return _mm_or_ps(x, one);
  }
  
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
  gmx_simd_set_exponent_f_sse2(__m128 x)
  {
      const __m128i expbias      = _mm_set1_epi32(127);
@@ -281,7 +281,7 @@ gmx_simd_set_exponent_f_sse2(__m128 x)
      return _mm_castsi128_ps(iexp);
  }
  
-static gmx_inline __m128i
+static gmx_inline __m128i gmx_simdcall
  gmx_simd_mul_fi_sse2(__m128i a, __m128i b)
  {
      __m128i a1 = _mm_srli_si128(a, 4); /* - a[3] a[2] a[1] */
@@ -295,7 +295,7 @@ gmx_simd_mul_fi_sse2(__m128i a, __m128i b)
      return _mm_unpacklo_epi32(c, c1);
  }
  
-static gmx_inline float
+static gmx_inline float gmx_simdcall
  gmx_simd_reduce_f_sse2(__m128 a)
  {
      __m128 b;
@@ -309,7 +309,7 @@ gmx_simd_reduce_f_sse2(__m128 a)
  /****************************************************
   * DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
   ****************************************************/
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_simd_get_exponent_d_sse2(__m128d x)
  {
      /* Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds */
@@ -323,7 +323,7 @@ gmx_simd_get_exponent_d_sse2(__m128d x)
      return _mm_cvtepi32_pd(iexp);
  }
  
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_simd_get_mantissa_d_sse2(__m128d x)
  {
      /* Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds */
@@ -334,7 +334,7 @@ gmx_simd_get_mantissa_d_sse2(__m128d x)
      return _mm_or_pd(x, one);
  }
  
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
  gmx_simd_set_exponent_d_sse2(__m128d x)
  {
      const __m128i  expbias      = _mm_set1_epi32(1023);
@@ -347,7 +347,7 @@ gmx_simd_set_exponent_d_sse2(__m128d x)
      return _mm_castsi128_pd(iexp);
  }
  
-static gmx_inline __m128i
+static gmx_inline __m128i gmx_simdcall
  gmx_simd_mul_di_sse2(__m128i a, __m128i b)
  {
      __m128i c;
@@ -359,7 +359,7 @@ gmx_simd_mul_di_sse2(__m128i a, __m128i b)
      return _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 1, 2, 0)); /* 0 0 a[1]*b[1] a[0]*b[0] */
  }
  
-static gmx_inline double
+static gmx_inline double gmx_simdcall
  gmx_simd_reduce_d_sse2(__m128d a)
  {
      __m128d b;
@@ -436,7 +436,7 @@ gmx_simd_check_and_reset_overflow(void)
  #define gmx_simd4_reduce_f               gmx_simd_reduce_f
  
  /* SIMD4 Dotproduct helper function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
  gmx_simd4_dotproduct3_f_sse2(__m128 a, __m128 b)
  {
      float  f;
diff --git a/src/gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h b/src/gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h

index ecb46b2447e9133c3999b4665587562adf95d810..946ae1e866a88d35c421166d6823da576360a5cb 100644 (file)
--- a/src/gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h
+++ b/src/gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h
@@ -96,7 +96,7 @@
  #define gmx_simd4_dotproduct3_f   gmx_simd4_dotproduct3_f_sse4_1
  
  /* SIMD reduction function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
  gmx_simd_reduce_f_sse4_1(__m128 a)
  {
      float  f;
@@ -108,7 +108,7 @@ gmx_simd_reduce_f_sse4_1(__m128 a)
  }
  
  /* SIMD4 Dotproduct helper function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
  gmx_simd4_dotproduct3_f_sse4_1(__m128 a, __m128 b)
  {
      float f;
@@ -116,7 +116,7 @@ gmx_simd4_dotproduct3_f_sse4_1(__m128 a, __m128 b)
      return f;
  }
  
-static gmx_inline double
+static gmx_inline double gmx_simdcall
  gmx_simd_reduce_d_sse4_1(__m128d a)
  {
      double  f;
diff --git a/src/gromacs/simd/simd_math.h b/src/gromacs/simd/simd_math.h

index 4c03a250bf84fff2cb5f666ad7d9304aa75a9643..1f6dc08361ade3a6ee3c77a1cba1c839b10c9a2a 100644 (file)
--- a/src/gromacs/simd/simd_math.h
+++ b/src/gromacs/simd/simd_math.h
@@ -101,7 +101,7 @@
   * \param d term 4 (multiple values)
   * \return sum of terms 1-4 (multiple values)
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_sum4_f(gmx_simd_float_t a, gmx_simd_float_t b,
                  gmx_simd_float_t c, gmx_simd_float_t d)
  {
@@ -120,7 +120,7 @@ gmx_simd_sum4_f(gmx_simd_float_t a, gmx_simd_float_t b,
   * with the exception that negative zero is not considered to be negative
   * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
  {
  #ifdef GMX_SIMD_HAVE_LOGICAL
@@ -139,7 +139,7 @@ gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
   *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
   *  \return   An improved approximation with roughly twice as many bits of accuracy.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
  {
  #    ifdef GMX_SIMD_HAVE_FMA
@@ -156,7 +156,7 @@ gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
   *  \param x Argument that must be >0. This routine does not check arguments.
   *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_invsqrt_f(gmx_simd_float_t x)
  {
      gmx_simd_float_t lu = gmx_simd_rsqrt_f(x);
@@ -184,7 +184,7 @@ gmx_simd_invsqrt_f(gmx_simd_float_t x)
   *  In particular for double precision we can sometimes calculate square root
   *  pairs slightly faster by using single precision until the very last step.
   */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0,    gmx_simd_float_t x1,
                          gmx_simd_float_t *out0, gmx_simd_float_t *out1)
  {
@@ -201,7 +201,7 @@ gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0,    gmx_simd_float_t x1,
   *  \param x  The reference (starting) value x for which we want 1/x.
   *  \return   An improved approximation with roughly twice as many bits of accuracy.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
  {
      return gmx_simd_mul_f(lu, gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f)));
@@ -214,7 +214,7 @@ gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
   *  \param x Argument that must be nonzero. This routine does not check arguments.
   *  \return 1/x. Result is undefined if your argument was invalid.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_inv_f(gmx_simd_float_t x)
  {
      gmx_simd_float_t lu = gmx_simd_rcp_f(x);
@@ -238,7 +238,7 @@ gmx_simd_inv_f(gmx_simd_float_t x)
   *  \return sqrt(x). If x=0, the result will correctly be set to 0.
   *          The result is undefined if the input value is negative.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_sqrt_f(gmx_simd_float_t x)
  {
      gmx_simd_fbool_t  mask;
@@ -257,7 +257,7 @@ gmx_simd_sqrt_f(gmx_simd_float_t x)
   * \result The natural logarithm of x. Undefined if argument is invalid.
   */
  #ifndef gmx_simd_log_f
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_log_f(gmx_simd_float_t x)
  {
      const gmx_simd_float_t  half       = gmx_simd_set1_f(0.5f);
@@ -301,7 +301,7 @@ gmx_simd_log_f(gmx_simd_float_t x)
   * \param x Argument.
   * \result 2^x. Undefined if input argument caused overflow.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_exp2_f(gmx_simd_float_t x)
  {
      /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
@@ -348,7 +348,7 @@ gmx_simd_exp2_f(gmx_simd_float_t x)
   * \result exp(x). Undefined if input argument caused overflow,
   * which can happen if abs(x) \> 7e13.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_exp_f(gmx_simd_float_t x)
  {
      const gmx_simd_float_t  argscale     = gmx_simd_set1_f(1.44269504088896341f);
@@ -398,7 +398,7 @@ gmx_simd_exp_f(gmx_simd_float_t x)
   * This routine achieves very close to full precision, but we do not care about
   * the last bit or the subnormal result range.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_erf_f(gmx_simd_float_t x)
  {
      /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
@@ -521,7 +521,7 @@ gmx_simd_erf_f(gmx_simd_float_t x)
   * (think results that are in the ballpark of 10^-30 for single precision,
   * or 10^-200 for double) since that is not relevant for MD.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_erfc_f(gmx_simd_float_t x)
  {
      /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
@@ -699,7 +699,7 @@ gmx_simd_erfc_f(gmx_simd_float_t x)
   * magnitudes of the argument we inherently begin to lose accuracy due to the
   * argument reduction, despite using extended precision arithmetics internally.
   */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval)
  {
      /* Constants to subtract Pi/4*x from y while minimizing precision loss */
@@ -820,7 +820,7 @@ gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t
   * \attention Do NOT call both sin & cos if you need both results, since each of them
   * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_sin_f(gmx_simd_float_t x)
  {
      gmx_simd_float_t s, c;
@@ -838,7 +838,7 @@ gmx_simd_sin_f(gmx_simd_float_t x)
   * \attention Do NOT call both sin & cos if you need both results, since each of them
   * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_cos_f(gmx_simd_float_t x)
  {
      gmx_simd_float_t s, c;
@@ -853,7 +853,7 @@ gmx_simd_cos_f(gmx_simd_float_t x)
   * \param x The argument to evaluate tan for
   * \result Tan(x)
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_tan_f(gmx_simd_float_t x)
  {
      const gmx_simd_float_t  argred0         = gmx_simd_set1_f(1.5703125);
@@ -929,7 +929,7 @@ gmx_simd_tan_f(gmx_simd_float_t x)
   * \param x The argument to evaluate asin for
   * \result Asin(x)
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_asin_f(gmx_simd_float_t x)
  {
      const gmx_simd_float_t limitlow   = gmx_simd_set1_f(1e-4f);
@@ -981,7 +981,7 @@ gmx_simd_asin_f(gmx_simd_float_t x)
   * \param x The argument to evaluate acos for
   * \result Acos(x)
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_acos_f(gmx_simd_float_t x)
  {
      const gmx_simd_float_t one       = gmx_simd_set1_f(1.0f);
@@ -1018,7 +1018,7 @@ gmx_simd_acos_f(gmx_simd_float_t x)
   * \param x The argument to evaluate atan for
   * \result Atan(x), same argument/value range as standard math library.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_atan_f(gmx_simd_float_t x)
  {
      const gmx_simd_float_t halfpi    = gmx_simd_set1_f(M_PI/2);
@@ -1071,7 +1071,7 @@ gmx_simd_atan_f(gmx_simd_float_t x)
   * of any concern in Gromacs, and in particular it will not affect calculations
   * of angles from vectors.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_atan2_f(gmx_simd_float_t y, gmx_simd_float_t x)
  {
      const gmx_simd_float_t pi          = gmx_simd_set1_f(M_PI);
@@ -1179,7 +1179,7 @@ gmx_simd_atan2_f(gmx_simd_float_t y, gmx_simd_float_t x)
   * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
   *
   */
-static gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_pmecorrF_f(gmx_simd_float_t z2)
  {
      const gmx_simd_float_t  FN6      = gmx_simd_set1_f(-1.7357322914161492954e-8f);
@@ -1259,7 +1259,7 @@ gmx_simd_pmecorrF_f(gmx_simd_float_t z2)
   * when added to \f$1/r\f$ the error will be insignificant.
   * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
   */
-static gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_pmecorrV_f(gmx_simd_float_t z2)
  {
      const gmx_simd_float_t  VN6      = gmx_simd_set1_f(1.9296833005951166339e-8f);
@@ -1315,7 +1315,7 @@ gmx_simd_pmecorrV_f(gmx_simd_float_t z2)
   *
   * \copydetails gmx_simd_sum4_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_sum4_d(gmx_simd_double_t a, gmx_simd_double_t b,
                  gmx_simd_double_t c, gmx_simd_double_t d)
  {
@@ -1334,7 +1334,7 @@ gmx_simd_sum4_d(gmx_simd_double_t a, gmx_simd_double_t b,
   * with the exception that negative zero is not considered to be negative
   * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
  {
  #ifdef GMX_SIMD_HAVE_LOGICAL
@@ -1348,7 +1348,7 @@ gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
   *
   * \copydetails gmx_simd_rsqrt_iter_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
  {
  #ifdef GMX_SIMD_HAVE_FMA
@@ -1363,7 +1363,7 @@ gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_invsqrt_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_invsqrt_d(gmx_simd_double_t x)
  {
      gmx_simd_double_t lu = gmx_simd_rsqrt_d(x);
@@ -1386,7 +1386,7 @@ gmx_simd_invsqrt_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_invsqrt_pair_f
   */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0,    gmx_simd_double_t x1,
                          gmx_simd_double_t *out0, gmx_simd_double_t *out1)
  {
@@ -1426,7 +1426,7 @@ gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0,    gmx_simd_double_t x1,
   *
   * \copydetails gmx_simd_rcp_iter_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
  {
      return gmx_simd_mul_d(lu, gmx_simd_fnmadd_d(lu, x, gmx_simd_set1_d(2.0)));
@@ -1436,7 +1436,7 @@ gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_inv_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_inv_d(gmx_simd_double_t x)
  {
      gmx_simd_double_t lu = gmx_simd_rcp_d(x);
@@ -1459,7 +1459,7 @@ gmx_simd_inv_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_sqrt_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_sqrt_d(gmx_simd_double_t x)
  {
      gmx_simd_dbool_t   mask;
@@ -1474,7 +1474,7 @@ gmx_simd_sqrt_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_log_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_log_d(gmx_simd_double_t x)
  {
      const gmx_simd_double_t  half       = gmx_simd_set1_d(0.5);
@@ -1519,7 +1519,7 @@ gmx_simd_log_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_exp2_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_exp2_d(gmx_simd_double_t x)
  {
      const gmx_simd_double_t  arglimit      = gmx_simd_set1_d(1022.0);
@@ -1565,7 +1565,7 @@ gmx_simd_exp2_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_exp_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_exp_d(gmx_simd_double_t x)
  {
      const gmx_simd_double_t  argscale      = gmx_simd_set1_d(1.44269504088896340735992468100);
@@ -1618,7 +1618,7 @@ gmx_simd_exp_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_erf_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_erf_d(gmx_simd_double_t x)
  {
      /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
@@ -1803,7 +1803,7 @@ gmx_simd_erf_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_erfc_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_erfc_d(gmx_simd_double_t x)
  {
      /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
@@ -1988,7 +1988,7 @@ gmx_simd_erfc_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_sincos_f
   */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
  {
      /* Constants to subtract Pi/4*x from y while minimizing precision loss */
@@ -2115,7 +2115,7 @@ gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_doubl
   *
   * \copydetails gmx_simd_sin_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_sin_d(gmx_simd_double_t x)
  {
      gmx_simd_double_t s, c;
@@ -2127,7 +2127,7 @@ gmx_simd_sin_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_cos_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_cos_d(gmx_simd_double_t x)
  {
      gmx_simd_double_t s, c;
@@ -2139,7 +2139,7 @@ gmx_simd_cos_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_tan_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_tan_d(gmx_simd_double_t x)
  {
      const gmx_simd_double_t  argred0         = gmx_simd_set1_d(2*0.78539816290140151978);
@@ -2230,7 +2230,7 @@ gmx_simd_tan_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_asin_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_asin_d(gmx_simd_double_t x)
  {
      /* Same algorithm as cephes library */
@@ -2357,7 +2357,7 @@ gmx_simd_asin_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_acos_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_acos_d(gmx_simd_double_t x)
  {
      const gmx_simd_double_t one        = gmx_simd_set1_d(1.0);
@@ -2390,7 +2390,7 @@ gmx_simd_acos_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_atan_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_atan_d(gmx_simd_double_t x)
  {
      /* Same algorithm as cephes library */
@@ -2475,7 +2475,7 @@ gmx_simd_atan_d(gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_atan2_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_atan2_d(gmx_simd_double_t y, gmx_simd_double_t x)
  {
      const gmx_simd_double_t pi          = gmx_simd_set1_d(M_PI);
@@ -2507,7 +2507,7 @@ gmx_simd_atan2_d(gmx_simd_double_t y, gmx_simd_double_t x)
   *
   * \copydetails gmx_simd_pmecorrF_f
   */
-static gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_pmecorrF_d(gmx_simd_double_t z2)
  {
      const gmx_simd_double_t  FN10     = gmx_simd_set1_d(-8.0072854618360083154e-14);
@@ -2564,7 +2564,7 @@ gmx_simd_pmecorrF_d(gmx_simd_double_t z2)
   *
   * \copydetails gmx_simd_pmecorrV_f
   */
-static gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_pmecorrV_d(gmx_simd_double_t z2)
  {
      const gmx_simd_double_t  VN9      = gmx_simd_set1_d(-9.3723776169321855475e-13);
@@ -2633,7 +2633,7 @@ gmx_simd_pmecorrV_d(gmx_simd_double_t z2)
   *
   * \copydetails gmx_simd_sum4_f
   */
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
  gmx_simd4_sum4_f(gmx_simd4_float_t a, gmx_simd4_float_t b,
                   gmx_simd4_float_t c, gmx_simd4_float_t d)
  {
@@ -2644,7 +2644,7 @@ gmx_simd4_sum4_f(gmx_simd4_float_t a, gmx_simd4_float_t b,
   *
   * \copydetails gmx_simd_rsqrt_iter_f
   */
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
  gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu, gmx_simd4_float_t x)
  {
  #    ifdef GMX_SIMD_HAVE_FMA
@@ -2658,7 +2658,7 @@ gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu, gmx_simd4_float_t x)
   *
   * \copydetails gmx_simd_invsqrt_f
   */
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
  gmx_simd4_invsqrt_f(gmx_simd4_float_t x)
  {
      gmx_simd4_float_t lu = gmx_simd4_rsqrt_f(x);
@@ -2688,7 +2688,7 @@ gmx_simd4_invsqrt_f(gmx_simd4_float_t x)
   *
   * \copydetails gmx_simd_sum4_f
   */
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
  gmx_simd4_sum4_d(gmx_simd4_double_t a, gmx_simd4_double_t b,
                   gmx_simd4_double_t c, gmx_simd4_double_t d)
  {
@@ -2699,7 +2699,7 @@ gmx_simd4_sum4_d(gmx_simd4_double_t a, gmx_simd4_double_t b,
   *
   * \copydetails gmx_simd_rsqrt_iter_f
   */
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
  gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu, gmx_simd4_double_t x)
  {
  #ifdef GMX_SIMD_HAVE_FMA
@@ -2713,7 +2713,7 @@ gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu, gmx_simd4_double_t x)
   *
   * \copydetails gmx_simd_invsqrt_f
   */
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
  gmx_simd4_invsqrt_d(gmx_simd4_double_t x)
  {
      gmx_simd4_double_t lu = gmx_simd4_rsqrt_d(x);
diff --git a/src/gromacs/simd/tests/simd4_math.cpp b/src/gromacs/simd/tests/simd4_math.cpp

index 5d08f16869b7fd815e764734c1cb263d3aaacc18..70784dbc78e7cc7b46527375ae33aaef6cc43484 100644 (file)
--- a/src/gromacs/simd/tests/simd4_math.cpp
+++ b/src/gromacs/simd/tests/simd4_math.cpp
@@ -60,7 +60,7 @@ class Simd4MathTest : public Simd4Test
      public:
          ::testing::AssertionResult
                               compareSimd4MathFunction(const char * refFuncExpr, const char *simd4FuncExpr,
-                                                      real refFunc(real x),     gmx_simd4_real_t simd4Func(gmx_simd4_real_t x));
+                                                      real refFunc(real x),     gmx_simd4_real_t gmx_simdcall simd4Func(gmx_simd4_real_t x));
  };
  
  /*! \brief Test approximate equality of SIMD4 vs reference version of a function.
@@ -85,7 +85,7 @@ class Simd4MathTest : public Simd4Test
   */
  ::testing::AssertionResult
  Simd4MathTest::compareSimd4MathFunction(const char * refFuncExpr, const char *simd4FuncExpr,
-                                        real refFunc(real x),     gmx_simd4_real_t simd4Func(gmx_simd4_real_t x))
+                                        real refFunc(real x),     gmx_simd4_real_t gmx_simdcall simd4Func(gmx_simd4_real_t x))
  {
      std::vector<real>            vx(GMX_SIMD4_WIDTH);
      std::vector<real>            vref(GMX_SIMD4_WIDTH);
diff --git a/src/gromacs/simd/tests/simd_math.cpp b/src/gromacs/simd/tests/simd_math.cpp

index b892070c45a7a6b7c6a1f1d4bb1c9ed004435012..2fab43ca7edbfc8dbd51cc3865194eb822b20da1 100644 (file)
--- a/src/gromacs/simd/tests/simd_math.cpp
+++ b/src/gromacs/simd/tests/simd_math.cpp
@@ -60,7 +60,7 @@ class SimdMathTest : public SimdTest
      public:
          ::testing::AssertionResult
                              compareSimdMathFunction(const char * refFuncExpr, const char *simdFuncExpr,
-                                                    real refFunc(real x),     gmx_simd_real_t simdFunc(gmx_simd_real_t x));
+                                                    real refFunc(real x),     gmx_simd_real_t gmx_simdcall simdFunc(gmx_simd_real_t x));
  };
  
  /*! \brief Test approximate equality of SIMD vs reference version of a function.
@@ -84,7 +84,7 @@ class SimdMathTest : public SimdTest
   */
  ::testing::AssertionResult
  SimdMathTest::compareSimdMathFunction(const char * refFuncExpr, const char *simdFuncExpr,
-                                      real refFunc(real x),     gmx_simd_real_t simdFunc(gmx_simd_real_t x))
+                                      real refFunc(real x),     gmx_simd_real_t gmx_simdcall simdFunc(gmx_simd_real_t x))
  {
      std::vector<real>            vx(GMX_SIMD_REAL_WIDTH);
      std::vector<real>            vref(GMX_SIMD_REAL_WIDTH);
@@ -208,7 +208,7 @@ TEST_F(SimdMathTest, gmxSimdInvsqrtR)
  }
  
  /*! \brief Function wrapper to return first result when testing \ref gmx_simd_invsqrt_pair_r */
-gmx_simd_real_t
+gmx_simd_real_t gmx_simdcall
  tst_invsqrt_pair0(gmx_simd_real_t x)
  {
      gmx_simd_real_t r0, r1;
@@ -217,7 +217,7 @@ tst_invsqrt_pair0(gmx_simd_real_t x)
  }
  
  /*! \brief Function wrapper to return second result when testing \ref gmx_simd_invsqrt_pair_r */
-gmx_simd_real_t
+gmx_simd_real_t gmx_simdcall
  tst_invsqrt_pair1(gmx_simd_real_t x)
  {
      gmx_simd_real_t r0, r1;
diff --git a/src/gromacs/simd/vector_operations.h b/src/gromacs/simd/vector_operations.h

index f13957286fc95fc3a5e0420eda2a94ce8a647430..d2ccfde0e2a7ac3f29c9d3d748629dd04e17364e 100644 (file)
--- a/src/gromacs/simd/vector_operations.h
+++ b/src/gromacs/simd/vector_operations.h
@@ -68,7 +68,7 @@
   *
   * \note The SIMD part is that we calculate many scalar products in one call.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_iprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
                   gmx_simd_float_t bx, gmx_simd_float_t by, gmx_simd_float_t bz)
  {
@@ -94,7 +94,7 @@ gmx_simd_iprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
   * \note This corresponds to the scalar product of the vector with itself, but
   * the compiler might be able to optimize it better with identical vectors.
   */
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
  gmx_simd_norm2_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az)
  {
      gmx_simd_float_t ret;
@@ -132,7 +132,7 @@ gmx_simd_norm2_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az)
   * The arguments x/y/z denotes the different components, and each element
   * corresponds to a separate vector.
   */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_cprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
                   gmx_simd_float_t bx, gmx_simd_float_t by, gmx_simd_float_t bz,
                   gmx_simd_float_t *cx, gmx_simd_float_t *cy, gmx_simd_float_t *cz)
@@ -153,7 +153,7 @@ gmx_simd_cprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
   *
   * \copydetails gmx_simd_iprod_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_iprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az,
                   gmx_simd_double_t bx, gmx_simd_double_t by, gmx_simd_double_t bz)
  {
@@ -170,7 +170,7 @@ gmx_simd_iprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t a
   *
   * \copydetails gmx_simd_norm2_f
   */
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
  gmx_simd_norm2_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az)
  {
      gmx_simd_double_t ret;
@@ -192,7 +192,7 @@ gmx_simd_norm2_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t a
   *
   * \copydetails gmx_simd_cprod_f
   */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
  gmx_simd_cprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az,
                   gmx_simd_double_t bx, gmx_simd_double_t by, gmx_simd_double_t bz,
                   gmx_simd_double_t *cx, gmx_simd_double_t *cy, gmx_simd_double_t *cz)
@@ -214,7 +214,7 @@ gmx_simd_cprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t a
   *
   * \copydetails gmx_simd_norm2_f
   */
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
  gmx_simd4_norm2_f(gmx_simd4_float_t ax, gmx_simd4_float_t ay, gmx_simd4_float_t az)
  {
      gmx_simd4_float_t ret;
@@ -239,7 +239,7 @@ gmx_simd4_norm2_f(gmx_simd4_float_t ax, gmx_simd4_float_t ay, gmx_simd4_float_t
   *
   * \copydetails gmx_simd_norm2_f
   */
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
  gmx_simd4_norm2_d(gmx_simd4_double_t ax, gmx_simd4_double_t ay, gmx_simd4_double_t az)
  {
      gmx_simd4_double_t ret;
author	Erik Lindahl <erik@kth.se>
	Mon, 28 Jul 2014 18:33:42 +0000 (20:33 +0200)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Wed, 20 Aug 2014 16:20:57 +0000 (18:20 +0200)
cmake/gmxTestSimd.cmake		patch \| blob \| history
src/config.h.cmakein		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h		patch \| blob \| history
src/gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h		patch \| blob \| history
src/gromacs/simd/impl_intel_mic/impl_intel_mic.h		patch \| blob \| history
src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256.h		patch \| blob \| history
src/gromacs/simd/impl_x86_avx_128_fma/impl_x86_avx_128_fma.h		patch \| blob \| history
src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h		patch \| blob \| history
src/gromacs/simd/impl_x86_sse2/impl_x86_sse2.h		patch \| blob \| history
src/gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h		patch \| blob \| history
src/gromacs/simd/simd_math.h		patch \| blob \| history
src/gromacs/simd/tests/simd4_math.cpp		patch \| blob \| history
src/gromacs/simd/tests/simd_math.cpp		patch \| blob \| history
src/gromacs/simd/vector_operations.h		patch \| blob \| history