message(STATUS "${SIMD_STATUS_MESSAGE}")
endif()
+# By default, 32-bit Windows cannot pass SIMD (SSE/AVX) arguments in registers
+# at all, and even 64-bit ABIs (on all platforms) only use registers for a
+# handful of SIMD arguments. The __vectorcall (MSVC, since MSVC 2013) and
+# __regcall (ICC) calling conventions enable register passing, which is
+# critical for making 32-bit SIMD work and also improves performance for
+# 64-bit SIMD.
+# Check whether the compiler supports one of these modifiers, and if so cache
+# GMX_SIMD_CALLING_CONVENTION as that string; the gmx_simdcall macro is later
+# defined from it. If no such modifier is available, set it to an empty string.
+if(NOT DEFINED GMX_SIMD_CALLING_CONVENTION)
+    foreach(callconv __vectorcall __regcall "")
+        set(callconv_compile_var "_callconv_${callconv}")
+        check_c_source_compiles("int ${callconv} f(int i) {return i;} int main(void) {return f(0);}" ${callconv_compile_var})
+        if(${callconv_compile_var})
+            set(GMX_SIMD_CALLING_CONVENTION "${callconv}" CACHE INTERNAL "Calling convention for SIMD routines" FORCE)
+            break()
+        endif()
+    endforeach()
+endif()
+
endmacro()
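The effect is that every SIMD helper routine below gains the gmx_simdcall
modifier in its signature, which is also what makes the 32-bit MSVC macro
work-arounds removable further down. A minimal, self-contained sketch of the
pattern (the function name example_add3_pd and the fallback #define are
illustrative assumptions, not part of this change):

    /* Hypothetical sketch -- not part of the patch. */
    #include <emmintrin.h>

    #ifndef gmx_simdcall
    #define gmx_simdcall /* empty when no modifier was detected */
    #endif

    static __m128d gmx_simdcall
    example_add3_pd(__m128d a, __m128d b, __m128d c)
    {
        /* With __vectorcall or __regcall all three __m128d arguments can
         * arrive in xmm registers even on 32-bit Windows; with the default
         * convention they would be passed through memory there. */
        return _mm_add_pd(_mm_add_pd(a, b), c);
    }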
/* String for SIMD instruction choice (for writing to log files and stdout) */
#define GMX_SIMD_STRING "@GMX_SIMD@"
+/* Calling-convention modifier (if any) for routines taking SIMD arguments */
+#define gmx_simdcall @GMX_SIMD_CALLING_CONVENTION@
+
/* Integer byte order is big endian. */
#cmakedefine GMX_INTEGER_BIG_ENDIAN
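For reference, after CMake configuration the gmx_simdcall define above
resolves to one of the following (a sketch following the detection logic
earlier; the empty form is used when no modifier is supported):

    #define gmx_simdcall __vectorcall   /* MSVC >= 2013 */
    #define gmx_simdcall __regcall      /* ICC */
    #define gmx_simdcall                /* no supported modifier */

so code tagged with gmx_simdcall compiles unchanged on every compiler.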
row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
}
-static int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
/* Load a double value from 1-2 places, merge into xmm register */
-static __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB)
{
return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}
-static __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
return _mm_load_sd(ptrA);
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB,
__m128d xmm1)
_mm_store_sd(ptrB, t2);
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
_mm_store_sd(ptrA, xmm1);
/* Similar to store, but increments value in memory */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB, __m128d xmm1)
{
_mm_store_sd(ptrB, t1);
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
__m128d tmp;
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
const double * gmx_restrict p2,
__m128d * gmx_restrict c6,
*c12 = _mm_unpackhi_pd(t1, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict c6,
__m128d * gmx_restrict c12)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
*z = _mm_load_sd(p1+2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
*z3 = _mm_load_sd(p1+8);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
*z1 = _mm_unpacklo_pd(t3, t4);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
/* Routines to decrement rvec in memory, typically used for j particle force updates */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1)
{
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _t1 = _mm_sub_pd(_t1, _x1); \
- _t2 = _mm_sub_pd(_t2, _z1); \
- _t3 = _mm_sub_pd(_t3, _y2); \
- _t4 = _mm_sub_pd(_t4, _x3); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+6, t4);
_mm_store_sd(ptrA+8, t5);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _z3 = _mm_unpacklo_pd(_z3, _x4); \
- _y4 = _mm_unpacklo_pd(_y4, _z4); \
- _mm_storeu_pd(ptrA, _mm_sub_pd( _t1, _x1 )); \
- _mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2, _z1 )); \
- _mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3, _y2 )); \
- _mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4, _x3 )); \
- _mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5, _z3 )); \
- _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
- }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5, z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
-#endif
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1)
{
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrB); \
- _t7 = _mm_loadu_pd(ptrB+2); \
- _t8 = _mm_loadu_pd(ptrB+4); \
- _t9 = _mm_loadu_pd(ptrB+6); \
- _t10 = _mm_load_sd(ptrB+8); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpackhi_pd(_z3, _z3); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _t6 = _mm_sub_pd(_t6, _tB); \
- _t7 = _mm_sub_pd(_t7, _tD); \
- _t8 = _mm_sub_pd(_t8, _tF); \
- _t9 = _mm_sub_pd(_t9, _tH); \
- _t10 = _mm_sub_sd(_t10, _tI); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrB, _t6); \
- _mm_storeu_pd(ptrB+2, _t7); \
- _mm_storeu_pd(ptrB+4, _t8); \
- _mm_storeu_pd(ptrB+6, _t9); \
- _mm_store_sd(ptrB+8, _t10); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+6, t9);
_mm_store_sd(ptrB+8, t10);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _t7 = _mm_loadu_pd(ptrB); \
- _t8 = _mm_loadu_pd(ptrB+2); \
- _t9 = _mm_loadu_pd(ptrB+4); \
- _t10 = _mm_loadu_pd(ptrB+6); \
- _t11 = _mm_loadu_pd(ptrB+8); \
- _t12 = _mm_loadu_pd(ptrB+10); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpacklo_pd(_z3, _x4); \
- _tJ = _mm_unpackhi_pd(_z3, _x4); \
- _tK = _mm_unpacklo_pd(_y4, _z4); \
- _tL = _mm_unpackhi_pd(_y4, _z4); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_pd(_t5, _tI); \
- _t6 = _mm_sub_pd(_t6, _tK); \
- _t7 = _mm_sub_pd(_t7, _tB); \
- _t8 = _mm_sub_pd(_t8, _tD); \
- _t9 = _mm_sub_pd(_t9, _tF); \
- _t10 = _mm_sub_pd(_t10, _tH); \
- _t11 = _mm_sub_pd(_t11, _tJ); \
- _t12 = _mm_sub_pd(_t12, _tL); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_storeu_pd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrA+10, _t6); \
- _mm_storeu_pd(ptrB, _t7); \
- _mm_storeu_pd(ptrB+2, _t8); \
- _mm_storeu_pd(ptrB+4, _t9); \
- _mm_storeu_pd(ptrB+6, _t10); \
- _mm_storeu_pd(ptrB+8, _t11); \
- _mm_storeu_pd(ptrB+10, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
double * gmx_restrict fptr,
double * gmx_restrict fshiftptr)
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- fix1 = _mm_hadd_pd(fix1, fiy1); \
- fiz1 = _mm_hadd_pd(fiz1, fix2); \
- fiy2 = _mm_hadd_pd(fiy2, fiz2); \
- fix3 = _mm_hadd_pd(fix3, fiy3); \
- fiz3 = _mm_hadd_pd(fiz3, fiz3); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
- fix1 = _mm_add_pd(fix1, fix3); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- fiz1 = _mm_add_sd(fiz1, _t2); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- fix1 = _mm_hadd_pd(fix1, fiy1); \
- fiz1 = _mm_hadd_pd(fiz1, fix2); \
- fiy2 = _mm_hadd_pd(fiy2, fiz2); \
- fix3 = _mm_hadd_pd(fix3, fiy3); \
- fiz3 = _mm_hadd_pd(fiz3, fix4); \
- fiy4 = _mm_hadd_pd(fiy4, fiz4); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); \
- _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
- fix3 = _mm_add_pd(fix3, _t2); \
- fix1 = _mm_add_pd(fix1, fix3); \
- fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
- fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
pot1 = _mm_hadd_pd(pot1, pot1);
_mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
__m128d pot2, double * gmx_restrict ptrB)
{
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3) _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
-static gmx_inline int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128 a, __m128 b)
{
return _mm_movemask_ps(_mm_cmplt_ps(a, b));
}
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
return _mm_macc_ps(dx, dx, _mm_macc_ps(dy, dy, _mm_mul_ps(dz, dz)));
/* Load a single value from 1-4 places, merge into xmm register */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
const float * gmx_restrict p2,
const float * gmx_restrict p3,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1)
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
- __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_permute_ps(_z3, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t18 = _mm_movehl_ps(_z3, _z3); \
- _t19 = _mm_permute_ps(_t18, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t20 = _mm_movelh_ps(_x1, _z1); \
- _t21 = _mm_movehl_ps(_z1, _x1); \
- _t22 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t23 = _mm_movelh_ps(_y2, _x3); \
- _t24 = _mm_movehl_ps(_x3, _y2); \
- _t25 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_load_ss(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t20); \
- _t2 = _mm_sub_ps(_t2, _t23); \
- _t3 = _mm_sub_ss(_t3, _z3); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_store_ss(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_load_ss(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _t21); \
- _t5 = _mm_sub_ps(_t5, _t24); \
- _t6 = _mm_sub_ss(_t6, _t17); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_store_ss(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_load_ss(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t22); \
- _t8 = _mm_sub_ps(_t8, _t25); \
- _t9 = _mm_sub_ss(_t9, _t18); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_store_ss(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_load_ss(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ss(_t12, _t19); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_store_ss(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_store_ss(ptrD+8, t12);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
- __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
- __m128 _t23, _t24; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_unpackhi_ps(_z3, _x4); \
- _z3 = _mm_unpacklo_ps(_z3, _x4); \
- _t18 = _mm_unpackhi_ps(_y4, _z4); \
- _y4 = _mm_unpacklo_ps(_y4, _z4); \
- _t19 = _mm_movelh_ps(_x1, _z1); \
- _z1 = _mm_movehl_ps(_z1, _x1); \
- _t20 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t21 = _mm_movelh_ps(_y2, _x3); \
- _x3 = _mm_movehl_ps(_x3, _y2); \
- _t22 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t23 = _mm_movelh_ps(_z3, _y4); \
- _y4 = _mm_movehl_ps(_y4, _z3); \
- _t24 = _mm_movelh_ps(_t17, _t18); \
- _t18 = _mm_movehl_ps(_t18, _t17); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_loadu_ps(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t19); \
- _t2 = _mm_sub_ps(_t2, _t21); \
- _t3 = _mm_sub_ps(_t3, _t23); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_storeu_ps(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_loadu_ps(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _z1); \
- _t5 = _mm_sub_ps(_t5, _x3); \
- _t6 = _mm_sub_ps(_t6, _y4); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_storeu_ps(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_loadu_ps(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t20); \
- _t8 = _mm_sub_ps(_t8, _t22); \
- _t9 = _mm_sub_ps(_t9, _t24); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_storeu_ps(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_loadu_ps(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ps(_t12, _t18); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_storeu_ps(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_storeu_ps(ptrD+8, t12);
}
-#endif
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
_mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4; \
-\
- fix1 = _mm_hadd_ps(fix1, fiy1); \
- fiz1 = _mm_hadd_ps(fiz1, fix2); \
- fiy2 = _mm_hadd_ps(fiy2, fiz2); \
- fix3 = _mm_hadd_ps(fix3, fiy3); \
- fiz3 = _mm_hadd_ps(fiz3, fiz3); \
- fix1 = _mm_hadd_ps(fix1, fiz1); \
- fiy2 = _mm_hadd_ps(fiy2, fix3); \
- fiz3 = _mm_hadd_ps(fiz3, fiz3); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
- _t4 = _mm_load_ss(fshiftptr+2); \
- _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
- _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
- _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
- _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
- _t3 = _mm_permute_ps(_t3, _MM_SHUFFLE(1, 2, 0, 0)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _mm_store_ss(fshiftptr+2, _t1); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
_mm_store_ss(fshiftptr+2, t1);
_mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5; \
-\
- fix1 = _mm_hadd_ps(fix1, fiy1); \
- fiz1 = _mm_hadd_ps(fiz1, fix2); \
- fiy2 = _mm_hadd_ps(fiy2, fiz2); \
- fix3 = _mm_hadd_ps(fix3, fiy3); \
- fiz3 = _mm_hadd_ps(fiz3, fix4); \
- fiy4 = _mm_hadd_ps(fiy4, fiz4); \
- fix1 = _mm_hadd_ps(fix1, fiz1); \
- fiy2 = _mm_hadd_ps(fiy2, fix3); \
- fiz3 = _mm_hadd_ps(fiz3, fiy4); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
- _t5 = _mm_load_ss(fshiftptr+2); \
- _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
- _t1 = _mm_permute_ps(fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
- _t2 = _mm_permute_ps(fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
- _t3 = _mm_permute_ps(fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
- _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
- _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _t5 = _mm_add_ps(_t5, _t1); \
- _mm_store_ss(fshiftptr+2, _t5); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
pot1 = _mm_hadd_ps(pot1, pot1);
_mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
__m128 pot2, float * gmx_restrict ptrB)
{
#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
{
return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
}
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
{
return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
}
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_set_m128d(__m128d hi, __m128d lo)
{
return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_set_m128(__m128 hi, __m128 lo)
{
return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
}
-static int
+static gmx_inline int gmx_simdcall
gmx_mm256_any_lt(__m256d a, __m256d b)
{
return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
}
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
{
return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
/* Load a single value from 1-4 places, merge into xmm register */
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
{
return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
}
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB)
{
}
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
{
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
_mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
__m256d t2;
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
__m128d t1;
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
__m128d t1, t2;
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
{
*c6 = _mm256_castpd128_pd256(_mm_load_sd(p1));
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
{
__m128d t1, t2, t3;
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
const double * gmx_restrict p3, const double * gmx_restrict p4,
__m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m256d * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
{
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
__m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
*z3 = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
__m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
double * gmx_restrict ptrC, double * gmx_restrict ptrD,
__m256d x1, __m256d y1, __m256d z1)
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128d _tA, _tB, _tC, _tD, _tE; \
- _t1 = _mm256_loadu_pd(ptrA); \
- _t2 = _mm256_loadu_pd(ptrB); \
- _t3 = _mm256_loadu_pd(ptrC); \
- _t4 = _mm256_loadu_pd(ptrD); \
- _t5 = _mm256_loadu_pd(ptrA+4); \
- _t6 = _mm256_loadu_pd(ptrB+4); \
- _t7 = _mm256_loadu_pd(ptrC+4); \
- _t8 = _mm256_loadu_pd(ptrD+4); \
- _tA = _mm_load_sd(ptrA+8); \
- _tB = _mm_load_sd(ptrB+8); \
- _tC = _mm_load_sd(ptrC+8); \
- _tD = _mm_load_sd(ptrD+8); \
- _t9 = _mm256_unpacklo_pd(_x1, _y1); \
- _x1 = _mm256_unpackhi_pd(_x1, _y1); \
- _y1 = _mm256_unpacklo_pd(_z1, _x2); \
- _z1 = _mm256_unpackhi_pd(_z1, _x2); \
- _x2 = _mm256_unpacklo_pd(_y2, _z2); \
- _y2 = _mm256_unpackhi_pd(_y2, _z2); \
- _z2 = _mm256_unpacklo_pd(_x3, _y3); \
- _x3 = _mm256_unpackhi_pd(_x3, _y3); \
- _t10 = gmx_mm256_unpack128lo_pd(_t9, _y1); \
- _y3 = gmx_mm256_unpack128hi_pd(_t9, _y1); \
- _t9 = gmx_mm256_unpack128lo_pd(_x1, _z1); \
- _y1 = gmx_mm256_unpack128hi_pd(_x1, _z1); \
- _x1 = gmx_mm256_unpack128lo_pd(_x2, _z2); \
- _z1 = gmx_mm256_unpack128hi_pd(_x2, _z2); \
- _x2 = gmx_mm256_unpack128lo_pd(_y2, _x3); \
- _z2 = gmx_mm256_unpack128hi_pd(_y2, _x3); \
- _t1 = _mm256_sub_pd(_t1, _t10); \
- _t2 = _mm256_sub_pd(_t2, _t9); \
- _t3 = _mm256_sub_pd(_t3, _y3); \
- _t4 = _mm256_sub_pd(_t4, _y1); \
- _t5 = _mm256_sub_pd(_t5, _x1); \
- _t6 = _mm256_sub_pd(_t6, _x2); \
- _t7 = _mm256_sub_pd(_t7, _z1); \
- _t8 = _mm256_sub_pd(_t8, _z2); \
- _tA = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3)); \
- _tB = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3), _GMX_MM_PERMUTE128D(1, 1))); \
- _tE = _mm256_extractf128_pd(_z3, 0x1); \
- _tC = _mm_sub_sd(_tC, _tE); \
- _tD = _mm_sub_sd(_tD, _mm_permute_pd(_tE, _GMX_MM_PERMUTE128D(1, 1))); \
- _mm256_storeu_pd(ptrA, _t1); \
- _mm256_storeu_pd(ptrB, _t2); \
- _mm256_storeu_pd(ptrC, _t3); \
- _mm256_storeu_pd(ptrD, _t4); \
- _mm256_storeu_pd(ptrA+4, _t5); \
- _mm256_storeu_pd(ptrB+4, _t6); \
- _mm256_storeu_pd(ptrC+4, _t7); \
- _mm256_storeu_pd(ptrD+4, _t8); \
- _mm_store_sd(ptrA+8, _tA); \
- _mm_store_sd(ptrB+8, _tB); \
- _mm_store_sd(ptrC+8, _tC); \
- _mm_store_sd(ptrD+8, _tD); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
double * gmx_restrict ptrC, double * gmx_restrict ptrD,
__m256d x1, __m256d y1, __m256d z1,
_mm_store_sd(ptrC+8, tC);
_mm_store_sd(ptrD+8, tD);
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12, _t13, _t14; \
- __m128d _tA, _tB, _tC, _tD, _tE; \
- _t1 = _mm256_loadu_pd(ptrA); \
- _t2 = _mm256_loadu_pd(ptrB); \
- _t3 = _mm256_loadu_pd(ptrC); \
- _t4 = _mm256_loadu_pd(ptrD); \
- _t5 = _mm256_loadu_pd(ptrA+4); \
- _t6 = _mm256_loadu_pd(ptrB+4); \
- _t7 = _mm256_loadu_pd(ptrC+4); \
- _t8 = _mm256_loadu_pd(ptrD+4); \
- _t9 = _mm256_loadu_pd(ptrA+8); \
- _t10 = _mm256_loadu_pd(ptrB+8); \
- _t11 = _mm256_loadu_pd(ptrC+8); \
- _t12 = _mm256_loadu_pd(ptrD+8); \
- _t13 = _mm256_unpacklo_pd(_x1, _y1); \
- _x1 = _mm256_unpackhi_pd(_x1, _y1); \
- _y1 = _mm256_unpacklo_pd(_z1, _x2); \
- _z1 = _mm256_unpackhi_pd(_z1, _x2); \
- _x2 = _mm256_unpacklo_pd(_y2, _z2); \
- _y2 = _mm256_unpackhi_pd(_y2, _z2); \
- _z2 = _mm256_unpacklo_pd(_x3, _y3); \
- _x3 = _mm256_unpackhi_pd(_x3, _y3); \
- _y3 = _mm256_unpacklo_pd(_z3, _x4); \
- _z3 = _mm256_unpackhi_pd(_z3, _x4); \
- _x4 = _mm256_unpacklo_pd(_y4, _z4); \
- _y4 = _mm256_unpackhi_pd(_y4, _z4); \
- _z4 = gmx_mm256_unpack128lo_pd(_t13, _y1); \
- _t13 = gmx_mm256_unpack128hi_pd(_t13, _y1); \
- _y1 = gmx_mm256_unpack128lo_pd(_x1, _z1); \
- _x1 = gmx_mm256_unpack128hi_pd(_x1, _z1); \
- _z1 = gmx_mm256_unpack128lo_pd(_x2, _z2); \
- _x2 = gmx_mm256_unpack128hi_pd(_x2, _z2); \
- _z2 = gmx_mm256_unpack128lo_pd(_y2, _x3); \
- _y2 = gmx_mm256_unpack128hi_pd(_y2, _x3); \
- _x3 = gmx_mm256_unpack128lo_pd(_y3, _x4); \
- _y3 = gmx_mm256_unpack128hi_pd(_y3, _x4); \
- _x4 = gmx_mm256_unpack128lo_pd(_z3, _y4); \
- _z3 = gmx_mm256_unpack128hi_pd(_z3, _y4); \
- _t1 = _mm256_sub_pd(_t1, _z4); \
- _t2 = _mm256_sub_pd(_t2, _y1); \
- _t3 = _mm256_sub_pd(_t3, _t13); \
- _t4 = _mm256_sub_pd(_t4, _x1); \
- _t5 = _mm256_sub_pd(_t5, _z1); \
- _t6 = _mm256_sub_pd(_t6, _z2); \
- _t7 = _mm256_sub_pd(_t7, _x2); \
- _t8 = _mm256_sub_pd(_t8, _y2); \
- _t9 = _mm256_sub_pd(_t9, _x3); \
- _t10 = _mm256_sub_pd(_t10, _x4); \
- _t11 = _mm256_sub_pd(_t11, _y3); \
- _t12 = _mm256_sub_pd(_t12, _z3); \
- _mm256_storeu_pd(ptrA, _t1); \
- _mm256_storeu_pd(ptrB, _t2); \
- _mm256_storeu_pd(ptrC, _t3); \
- _mm256_storeu_pd(ptrD, _t4); \
- _mm256_storeu_pd(ptrA+4, _t5); \
- _mm256_storeu_pd(ptrB+4, _t6); \
- _mm256_storeu_pd(ptrC+4, _t7); \
- _mm256_storeu_pd(ptrD+4, _t8); \
- _mm256_storeu_pd(ptrA+8, _t9); \
- _mm256_storeu_pd(ptrB+8, _t10); \
- _mm256_storeu_pd(ptrC+8, _t11); \
- _mm256_storeu_pd(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
double * gmx_restrict ptrC, double * gmx_restrict ptrD,
__m256d x1, __m256d y1, __m256d z1,
_mm256_storeu_pd(ptrC+8, t11);
_mm256_storeu_pd(ptrD+8, t12);
}
-#endif
-
-
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
double * gmx_restrict fptr,
double * gmx_restrict fshiftptr)
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m256d _t1, _t2, _t3, _t4; \
- __m128d _tz3, _tA, _tB, _tC, _tD; \
- fix1 = _mm256_hadd_pd(fix1, fiy1); \
- fiz1 = _mm256_hadd_pd(fiz1, fix2); \
- fiy2 = _mm256_hadd_pd(fiy2, fiz2); \
- fix3 = _mm256_hadd_pd(fix3, fiy3); \
- fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); \
- _t1 = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
- _t2 = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
- _t1 = _mm256_add_pd(_t1, _t2); \
- _t3 = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
- _t4 = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
- _t3 = _mm256_add_pd(_t3, _t4); \
- _tz3 = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); \
- _t2 = _mm256_loadu_pd(fptr); \
- _t4 = _mm256_loadu_pd(fptr+4); \
- _tA = _mm_load_sd(fptr+8); \
- _t2 = _mm256_add_pd(_t2, _t1); \
- _t4 = _mm256_add_pd(_t4, _t3); \
- _tA = _mm_add_sd(_tA, _tz3); \
- _mm256_storeu_pd(fptr, _t2); \
- _mm256_storeu_pd(fptr+4, _t4); \
- _mm_store_sd(fptr+8, _tA); \
- _tB = _mm256_extractf128_pd(_t1, 0x1); \
- _tC = _mm256_extractf128_pd(_t3, 0x1); \
- _tz3 = _mm_add_sd(_tz3, _tB); \
- _tD = _mm_permute_pd(_mm256_castpd256_pd128(_t3), _GMX_MM_PERMUTE128D(1, 1)); \
- _tz3 = _mm_add_sd(_tz3, _tD); \
- _tC = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t1)); \
- _tD = _mm_shuffle_pd(_tB, _mm256_castpd256_pd128(_t3), _MM_SHUFFLE2(0, 1)); \
- _tC = _mm_add_pd(_tC, _tD); \
- _tA = _mm_loadu_pd(fshiftptr); \
- _tB = _mm_load_sd(fshiftptr+2); \
- _tA = _mm_add_pd(_tA, _tC); \
- _tB = _mm_add_sd(_tB, _tz3); \
- _mm_storeu_pd(fshiftptr, _tA); \
- _mm_store_sd(fshiftptr+2, _tB); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
__m256d fix2, __m256d fiy2, __m256d fiz2,
__m256d fix3, __m256d fiy3, __m256d fiz3,
_mm_storeu_pd(fshiftptr, tA);
_mm_store_sd(fshiftptr+2, tB);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m256d _t1, _t2, _t3, _t4, _t5, _t6; \
- __m128d _tA, _tB, _tC, _tD; \
- fix1 = _mm256_hadd_pd(fix1, fiy1); \
- fiz1 = _mm256_hadd_pd(fiz1, fix2); \
- fiy2 = _mm256_hadd_pd(fiy2, fiz2); \
- fix3 = _mm256_hadd_pd(fix3, fiy3); \
- fiz3 = _mm256_hadd_pd(fiz3, fix4); \
- fiy4 = _mm256_hadd_pd(fiy4, fiz4); \
- _t1 = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
- _t2 = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
- _t1 = _mm256_add_pd(_t1, _t2); \
- _t3 = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
- _t4 = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
- _t3 = _mm256_add_pd(_t3, _t4); \
- _t5 = gmx_mm256_unpack128lo_pd(fiz3, fiy4); \
- _t6 = gmx_mm256_unpack128hi_pd(fiz3, fiy4); \
- _t5 = _mm256_add_pd(_t5, _t6); \
- _t2 = _mm256_loadu_pd(fptr); \
- _t4 = _mm256_loadu_pd(fptr+4); \
- _t6 = _mm256_loadu_pd(fptr+8); \
- _t2 = _mm256_add_pd(_t2, _t1); \
- _t4 = _mm256_add_pd(_t4, _t3); \
- _t6 = _mm256_add_pd(_t6, _t5); \
- _mm256_storeu_pd(fptr, _t2); \
- _mm256_storeu_pd(fptr+4, _t4); \
- _mm256_storeu_pd(fptr+8, _t6); \
- _tA = _mm256_extractf128_pd(_t1, 0x1); \
- _tB = _mm256_extractf128_pd(_t3, 0x1); \
- _tC = _mm256_extractf128_pd(_t5, 0x1); \
- _tB = _mm_add_pd(_tB, _mm256_castpd256_pd128(_t1)); \
- _tA = _mm_add_pd(_tA, _mm256_castpd256_pd128(_t5)); \
- _tC = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t3)); \
- _tD = _mm_shuffle_pd(_tA, _tC, _MM_SHUFFLE2(0, 1)); \
- _tB = _mm_add_pd(_tB, _tD); \
- _tC = _mm_permute_pd(_tC, _GMX_MM_PERMUTE128D(1, 1)); \
- _tC = _mm_add_sd(_tC, _tA); \
- _tA = _mm_loadu_pd(fshiftptr); \
- _tD = _mm_load_sd(fshiftptr+2); \
- _tA = _mm_add_pd(_tA, _tB); \
- _tD = _mm_add_sd(_tD, _tC); \
- _mm_storeu_pd(fshiftptr, _tA); \
- _mm_store_sd(fshiftptr+2, _tD); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
__m256d fix2, __m256d fiy2, __m256d fiz2,
__m256d fix3, __m256d fiy3, __m256d fiz3,
_mm_storeu_pd(fshiftptr, tA);
_mm_store_sd(fshiftptr+2, tD);
}
-#endif
-
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
{
__m128d t1;
_mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
__m256d pot2, double * gmx_restrict ptrB)
{
#define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_unpack128lo_ps(__m256 xmm1, __m256 xmm2)
{
return _mm256_permute2f128_ps(xmm1, xmm2, 0x20);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_unpack128hi_ps(__m256 xmm1, __m256 xmm2)
{
return _mm256_permute2f128_ps(xmm1, xmm2, 0x31);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_set_m128(__m128 hi, __m128 lo)
{
return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz)
{
return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy) ), _mm256_mul_ps(dz, dz) );
#define gmx_mm256_sum4_ps(t0, t1, t2, t3) _mm256_add_ps(_mm256_add_ps(t0, t1), _mm256_add_ps(t2, t3))
-static gmx_inline int
+static gmx_inline int gmx_simdcall
gmx_mm256_any_lt(__m256 a, __m256 b)
{
return _mm256_movemask_ps(_mm256_cmp_ps(a, b, _CMP_LT_OQ));
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_load_4real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD)
{
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_load_8real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_store_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD, __m256 xmm1)
{
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_store_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
float * gmx_restrict ptrE, float * gmx_restrict ptrF,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_increment_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m256 xmm1)
_mm_store_ss(ptrD, t4);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_increment_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
float * gmx_restrict ptrE, float * gmx_restrict ptrF,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
const float * gmx_restrict p3, const float * gmx_restrict p4,
__m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
*c12 = _mm256_castps128_ps256(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
const float * gmx_restrict p3, const float * gmx_restrict p4,
const float * gmx_restrict p5, const float * gmx_restrict p6,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m256 * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m256 x1, __m256 y1, __m256 z1)
gmx_mm_maskstore_ps(ptrD, mask, t8);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- x1, y1, z1, x2, y2, z2, x3, y3, z3) \
- { \
- __m256 _t1, _t2, _t3, _t4, _t5, _t6; \
- __m128 _tA, _tB, _tC, _tD; \
-\
- _t1 = _mm256_loadu_ps(ptrA); \
- _t2 = _mm256_loadu_ps(ptrB); \
- _t3 = _mm256_loadu_ps(ptrC); \
- _t4 = _mm256_loadu_ps(ptrD); \
- _tA = _mm_load_ss(ptrA+8); \
- _tB = _mm_load_ss(ptrB+8); \
- _tC = _mm_load_ss(ptrC+8); \
- _tD = _mm_load_ss(ptrD+8); \
- _t5 = _mm256_unpacklo_ps(x1, y1); \
- x1 = _mm256_unpackhi_ps(x1, y1); \
- y1 = _mm256_unpacklo_ps(z1, x2); \
- z1 = _mm256_unpackhi_ps(z1, x2); \
- x2 = _mm256_unpacklo_ps(y2, z2); \
- y2 = _mm256_unpackhi_ps(y2, z2); \
- _t6 = _mm256_unpacklo_ps(x3, y3); \
- x3 = _mm256_unpackhi_ps(x3, y3); \
- _t5 = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
- x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
- y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1); \
- z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
- z2 = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t5 = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
- y1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
- x1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t1 = _mm256_sub_ps(_t1, z2); \
- _t2 = _mm256_sub_ps(_t2, _t5); \
- _t3 = _mm256_sub_ps(_t3, y1); \
- _t4 = _mm256_sub_ps(_t4, x1); \
- _tA = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3)); \
- _tB = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(1, 1, 1, 1))); \
- _tC = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(2, 2, 2, 2))); \
- _tD = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(3, 3, 3, 3))); \
- _mm256_storeu_ps(ptrA, _t1); \
- _mm256_storeu_ps(ptrB, _t2); \
- _mm256_storeu_ps(ptrC, _t3); \
- _mm256_storeu_ps(ptrD, _t4); \
- _mm_store_ss(ptrA+8, _tA); \
- _mm_store_ss(ptrB+8, _tB); \
- _mm_store_ss(ptrC+8, _tC); \
- _mm_store_ss(ptrD+8, _tD); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m256 x1, __m256 y1, __m256 z1,
_mm_store_ss(ptrC+8, tC);
_mm_store_ss(ptrD+8, tD);
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- x1, y1, z1, x2, y2, z2, x3, y3, z3, x4, y4, z4) \
- { \
- __m256 _t1, _t2, _t3, _t4, _t5; \
- __m128 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH; \
-\
- _t1 = _mm256_loadu_ps(ptrA); \
- _t2 = _mm256_loadu_ps(ptrB); \
- _t3 = _mm256_loadu_ps(ptrC); \
- _t4 = _mm256_loadu_ps(ptrD); \
- _tA = _mm_loadu_ps(ptrA+8); \
- _tB = _mm_loadu_ps(ptrB+8); \
- _tC = _mm_loadu_ps(ptrC+8); \
- _tD = _mm_loadu_ps(ptrD+8); \
- _t5 = _mm256_unpacklo_ps(x1, y1); \
- x1 = _mm256_unpackhi_ps(x1, y1); \
- y1 = _mm256_unpacklo_ps(z1, x2); \
- z1 = _mm256_unpackhi_ps(z1, x2); \
- x2 = _mm256_unpacklo_ps(y2, z2); \
- y2 = _mm256_unpackhi_ps(y2, z2); \
- z2 = _mm256_unpacklo_ps(x3, y3); \
- x3 = _mm256_unpackhi_ps(x3, y3); \
- y3 = _mm256_unpacklo_ps(z3, x4); \
- z3 = _mm256_unpackhi_ps(z3, x4); \
- x4 = _mm256_unpacklo_ps(y4, z4); \
- y4 = _mm256_unpackhi_ps(y4, z4); \
- x2 = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
- x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
- y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1); \
- z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
- z2 = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t5 = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
- y1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
- x1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
- _tE = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(1, 0, 1, 0)); \
- _tF = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(3, 2, 3, 2)); \
- _tG = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(1, 0, 1, 0)); \
- _tH = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(3, 2, 3, 2)); \
- _t1 = _mm256_sub_ps(_t1, z2); \
- _t2 = _mm256_sub_ps(_t2, _t5); \
- _t3 = _mm256_sub_ps(_t3, y1); \
- _t4 = _mm256_sub_ps(_t4, x1); \
- _tA = _mm_sub_ps(_tA, _tE); \
- _tB = _mm_sub_ps(_tB, _tF); \
- _tC = _mm_sub_ps(_tC, _tG); \
- _tD = _mm_sub_ps(_tD, _tH); \
- _mm256_storeu_ps(ptrA, _t1); \
- _mm256_storeu_ps(ptrB, _t2); \
- _mm256_storeu_ps(ptrC, _t3); \
- _mm256_storeu_ps(ptrD, _t4); \
- _mm_storeu_ps(ptrA+8, _tA); \
- _mm_storeu_ps(ptrB+8, _tB); \
- _mm_storeu_ps(ptrC+8, _tC); \
- _mm_storeu_ps(ptrD+8, _tD); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m256 x1, __m256 y1, __m256 z1,
_mm_storeu_ps(ptrC+8, tC);
_mm_storeu_ps(ptrD+8, tD);
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
float * gmx_restrict ptrE, float * gmx_restrict ptrF,
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
-\
- _tA = _mm256_loadu_ps(ptrA); \
- _tB = _mm256_loadu_ps(ptrB); \
- _tC = _mm256_loadu_ps(ptrC); \
- _tD = _mm256_loadu_ps(ptrD); \
- _tE = _mm256_loadu_ps(ptrE); \
- _tF = _mm256_loadu_ps(ptrF); \
- _tG = _mm256_loadu_ps(ptrG); \
- _tH = _mm256_loadu_ps(ptrH); \
- _t1 = _mm256_unpacklo_ps(_x1, _y1); \
- _t2 = _mm256_unpackhi_ps(_x1, _y1); \
- _t3 = _mm256_unpacklo_ps(_z1, _x2); \
- _t4 = _mm256_unpackhi_ps(_z1, _x2); \
- _t5 = _mm256_unpacklo_ps(_y2, _z2); \
- _t6 = _mm256_unpackhi_ps(_y2, _z2); \
- _t7 = _mm256_unpacklo_ps(_x3, _y3); \
- _t8 = _mm256_unpackhi_ps(_x3, _y3); \
- _t9 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t10 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t11 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t12 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t1 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t2 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t3 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t4 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t5 = gmx_mm256_unpack128lo_ps(_t9, _t1); \
- _t6 = gmx_mm256_unpack128hi_ps(_t9, _t1); \
- _t7 = gmx_mm256_unpack128lo_ps(_t10, _t2); \
- _t8 = gmx_mm256_unpack128hi_ps(_t10, _t2); \
- _t1 = gmx_mm256_unpack128lo_ps(_t11, _t3); \
- _t2 = gmx_mm256_unpack128hi_ps(_t11, _t3); \
- _t9 = gmx_mm256_unpack128lo_ps(_t12, _t4); \
- _t10 = gmx_mm256_unpack128hi_ps(_t12, _t4); \
- _tA = _mm256_sub_ps(_tA, _t5); \
- _tB = _mm256_sub_ps(_tB, _t7); \
- _tC = _mm256_sub_ps(_tC, _t1); \
- _tD = _mm256_sub_ps(_tD, _t9); \
- _tE = _mm256_sub_ps(_tE, _t6); \
- _tF = _mm256_sub_ps(_tF, _t8); \
- _tG = _mm256_sub_ps(_tG, _t2); \
- _tH = _mm256_sub_ps(_tH, _t10); \
- _mm256_storeu_ps(ptrA, _tA); \
- _mm256_storeu_ps(ptrB, _tB); \
- _mm256_storeu_ps(ptrC, _tC); \
- _mm256_storeu_ps(ptrD, _tD); \
- _mm256_storeu_ps(ptrE, _tE); \
- _mm256_storeu_ps(ptrF, _tF); \
- _mm256_storeu_ps(ptrG, _tG); \
- _mm256_storeu_ps(ptrH, _tH); \
- _tI = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8)); \
- _tJ = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8)); \
- _tK = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8)); \
- _tL = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8)); \
- _tI = _mm256_unpacklo_ps(_tI, _tK); \
- _tJ = _mm256_unpacklo_ps(_tJ, _tL); \
- _tI = _mm256_unpacklo_ps(_tI, _tJ); \
- _tI = _mm256_sub_ps(_tI, _z3); \
- _tJ = _mm256_permute_ps(_tI, _MM_SHUFFLE(1, 1, 1, 1)); \
- _tK = _mm256_permute_ps(_tI, _MM_SHUFFLE(2, 2, 2, 2)); \
- _tL = _mm256_permute_ps(_tI, _MM_SHUFFLE(3, 3, 3, 3)); \
- _mm_store_ss(ptrA+8, _mm256_castps256_ps128(_tI)); \
- _mm_store_ss(ptrB+8, _mm256_castps256_ps128(_tJ)); \
- _mm_store_ss(ptrC+8, _mm256_castps256_ps128(_tK)); \
- _mm_store_ss(ptrD+8, _mm256_castps256_ps128(_tL)); \
- _mm_store_ss(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
- _mm_store_ss(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
- _mm_store_ss(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
- _mm_store_ss(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
float * gmx_restrict ptrE, float * gmx_restrict ptrF,
_mm_store_ss(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
_mm_store_ss(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
-\
- _tA = _mm256_loadu_ps(ptrA); \
- _tB = _mm256_loadu_ps(ptrB); \
- _tC = _mm256_loadu_ps(ptrC); \
- _tD = _mm256_loadu_ps(ptrD); \
- _tE = _mm256_loadu_ps(ptrE); \
- _tF = _mm256_loadu_ps(ptrF); \
- _tG = _mm256_loadu_ps(ptrG); \
- _tH = _mm256_loadu_ps(ptrH); \
- _t1 = _mm256_unpacklo_ps(_x1, _y1); \
- _t2 = _mm256_unpackhi_ps(_x1, _y1); \
- _t3 = _mm256_unpacklo_ps(_z1, _x2); \
- _t4 = _mm256_unpackhi_ps(_z1, _x2); \
- _t5 = _mm256_unpacklo_ps(_y2, _z2); \
- _t6 = _mm256_unpackhi_ps(_y2, _z2); \
- _t7 = _mm256_unpacklo_ps(_x3, _y3); \
- _t8 = _mm256_unpackhi_ps(_x3, _y3); \
- _t9 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t10 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t11 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t12 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t1 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t2 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t3 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t4 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t5 = gmx_mm256_unpack128lo_ps(_t9, _t1); \
- _t6 = gmx_mm256_unpack128hi_ps(_t9, _t1); \
- _t7 = gmx_mm256_unpack128lo_ps(_t10, _t2); \
- _t8 = gmx_mm256_unpack128hi_ps(_t10, _t2); \
- _t1 = gmx_mm256_unpack128lo_ps(_t11, _t3); \
- _t2 = gmx_mm256_unpack128hi_ps(_t11, _t3); \
- _t9 = gmx_mm256_unpack128lo_ps(_t12, _t4); \
- _t10 = gmx_mm256_unpack128hi_ps(_t12, _t4); \
- _tA = _mm256_sub_ps(_tA, _t5); \
- _tB = _mm256_sub_ps(_tB, _t7); \
- _tC = _mm256_sub_ps(_tC, _t1); \
- _tD = _mm256_sub_ps(_tD, _t9); \
- _tE = _mm256_sub_ps(_tE, _t6); \
- _tF = _mm256_sub_ps(_tF, _t8); \
- _tG = _mm256_sub_ps(_tG, _t2); \
- _tH = _mm256_sub_ps(_tH, _t10); \
- _mm256_storeu_ps(ptrA, _tA); \
- _mm256_storeu_ps(ptrB, _tB); \
- _mm256_storeu_ps(ptrC, _tC); \
- _mm256_storeu_ps(ptrD, _tD); \
- _mm256_storeu_ps(ptrE, _tE); \
- _mm256_storeu_ps(ptrF, _tF); \
- _mm256_storeu_ps(ptrG, _tG); \
- _mm256_storeu_ps(ptrH, _tH); \
- _tI = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8)); \
- _tJ = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8)); \
- _tK = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8)); \
- _tL = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8)); \
- _t1 = _mm256_unpacklo_ps(_z3, _x4); \
- _t2 = _mm256_unpackhi_ps(_z3, _x4); \
- _t3 = _mm256_unpacklo_ps(_y4, _z4); \
- _t4 = _mm256_unpackhi_ps(_y4, _z4); \
- _t5 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t6 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t7 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t8 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
- _tI = _mm256_sub_ps(_tI, _t5); \
- _tJ = _mm256_sub_ps(_tJ, _t6); \
- _tK = _mm256_sub_ps(_tK, _t7); \
- _tL = _mm256_sub_ps(_tL, _t8); \
- _mm_storeu_ps(ptrA+8, _mm256_castps256_ps128(_tI)); \
- _mm_storeu_ps(ptrB+8, _mm256_castps256_ps128(_tJ)); \
- _mm_storeu_ps(ptrC+8, _mm256_castps256_ps128(_tK)); \
- _mm_storeu_ps(ptrD+8, _mm256_castps256_ps128(_tL)); \
- _mm_storeu_ps(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
- _mm_storeu_ps(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
- _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
- _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
float * gmx_restrict ptrE, float * gmx_restrict ptrF,
_mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
_mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
_mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m256 _t1, _t2, _t3; \
- __m128 _tA, _tB, _tC; \
-\
- fix1 = _mm256_hadd_ps(fix1, fiy1); \
- fiz1 = _mm256_hadd_ps(fiz1, fix2); \
- fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
- fix3 = _mm256_hadd_ps(fix3, fiy3); \
- fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
- fix1 = _mm256_hadd_ps(fix1, fiz1); \
- fiy2 = _mm256_hadd_ps(fiy2, fix3); \
- fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
-\
- _t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
- _t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
- _t1 = _mm256_add_ps(_t1, _t2); \
- _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
- _t3 = _mm256_loadu_ps(fptr); \
- _t3 = _mm256_add_ps(_t3, _t1); \
- _mm256_storeu_ps(fptr, _t3); \
- _tB = _mm_load_ss(fptr+8); \
- _tB = _mm_add_ss(_tB, _tA); \
- _mm_store_ss(fptr+8, _tB); \
-\
- _tB = _mm256_extractf128_ps(_t1, 0x1); \
- _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
- _tB = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
- _tC = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
- _tB = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
- _tA = _mm_add_ps(_tB, _tC); \
- _tA = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
- _tC = _mm_loadu_ps(fshiftptr); \
- _tC = _mm_add_ps(_tC, _tA); \
- _mm_storeu_ps(fshiftptr, _tC); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
__m256 fix2, __m256 fiy2, __m256 fiz2,
__m256 fix3, __m256 fiy3, __m256 fiz3,
tC = _mm_add_ps(tC, tA);
_mm_storeu_ps(fshiftptr, tC);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m256 _t1, _t2, _t3; \
- __m128 _tA, _tB, _tC; \
-\
- fix1 = _mm256_hadd_ps(fix1, fiy1); \
- fiz1 = _mm256_hadd_ps(fiz1, fix2); \
- fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
- fix3 = _mm256_hadd_ps(fix3, fiy3); \
- fiz3 = _mm256_hadd_ps(fiz3, fix4); \
- fiy4 = _mm256_hadd_ps(fiy4, fiz4); \
-\
- fix1 = _mm256_hadd_ps(fix1, fiz1); \
- fiy2 = _mm256_hadd_ps(fiy2, fix3); \
- fiz3 = _mm256_hadd_ps(fiz3, fiy4); \
-\
- _t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
- _t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
- _t1 = _mm256_add_ps(_t1, _t2); \
- _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
- _t3 = _mm256_loadu_ps(fptr); \
- _t3 = _mm256_add_ps(_t3, _t1); \
- _mm256_storeu_ps(fptr, _t3); \
- _tB = _mm_loadu_ps(fptr+8); \
- _tB = _mm_add_ps(_tB, _tA); \
- _mm_storeu_ps(fptr+8, _tB); \
-\
- _tB = _mm256_extractf128_ps(_t1, 0x1); \
- _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
- _tB = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
- _tC = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
- _tA = _mm_permute_ps(_tA, _MM_SHUFFLE(0, 3, 2, 1)); \
- _tB = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
- _tA = _mm_add_ps(_tA, _tC); \
- _tA = _mm_add_ps(_tA, _tB); \
- _tA = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
- _tC = _mm_loadu_ps(fshiftptr); \
- _tC = _mm_add_ps(_tC, _tA); \
- _mm_storeu_ps(fshiftptr, _tC); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
__m256 fix2, __m256 fiy2, __m256 fiz2,
__m256 fix3, __m256 fiy3, __m256 fiz3,
tC = _mm_add_ps(tC, tA);
_mm_storeu_ps(fshiftptr, tC);
}
-#endif
-
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_1pot_ps(__m256 pot1, float * gmx_restrict ptrA)
{
__m128 t1;
_mm_store_ss(ptrA, _mm_add_ss(_mm_load_ss(ptrA), t1));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
__m256 pot2, float * gmx_restrict ptrB)
{
row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
}
-static int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
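A usage sketch for the two helpers above; the displacements and cutoff are
made-up placeholders:

/* Prune pairs beyond a cutoff: gmx_mm_calc_rsq_pd() yields the squared
 * distances for two pairs at once, and gmx_mm_any_lt() is nonzero when
 * at least one lane lies below cutoff^2. */
__m128d dx    = _mm_set_pd(0.3, 2.0);   /* assumed displacements */
__m128d dy    = _mm_setzero_pd();
__m128d dz    = _mm_setzero_pd();
__m128d rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
__m128d rcut2 = _mm_set1_pd(1.2 * 1.2); /* assumed squared cutoff */
if (gmx_mm_any_lt(rsq, rcut2))
{
    /* at least one pair in range: evaluate the interaction */
}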
/* Load a double value from 1-2 places, merge into xmm register */
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB)
{
return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}
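A short usage sketch of the merge with made-up values:

/* After the swizzle, _mm_unpacklo_pd() leaves *ptrA in the low lane and
 * *ptrB in the high lane, so one __m128d carries a per-pair parameter
 * for two interaction pairs. */
double qA = 0.5, qB = -0.5;   /* assumed per-pair charges */
__m128d q = gmx_mm_load_2real_swizzle_pd(&qA, &qB);
/* q = [qA, qB] */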
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
return _mm_load_sd(ptrA);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB,
__m128d xmm1)
_mm_store_sd(ptrB, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
_mm_store_sd(ptrA, xmm1);
/* Similar to store, but increments value in memory */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB, __m128d xmm1)
{
_mm_store_sd(ptrB, t1);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
__m128d tmp;
}
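The diff elides the body here, so a usage sketch with the behavior stated as
an assumption:

/* Usage sketch: gmx_mm_increment_1real_pd() is assumed to add the low
 * lane of its argument to *ptrA in place, per its name and the store
 * analogue above. */
double fsum = 1.0;
gmx_mm_increment_1real_pd(&fsum, _mm_set_sd(0.25));   /* fsum: 1.0 -> 1.25 */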
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
const double * gmx_restrict p2,
__m128d * gmx_restrict c6,
*c12 = _mm_unpackhi_pd(t1, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict c6,
__m128d * gmx_restrict c12)
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
*z = _mm_load_sd(p1+2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
*z3 = _mm_load_sd(p1+8);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
/* Routines to decrement rvec in memory, typically used for j particle force updates */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
__m128d xy, __m128d z)
{
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1)
{
}
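The comment above names the purpose of these routines; the following
self-contained sketch (a hypothetical helper, not part of the patch) mirrors
the same swizzle-and-decrement pattern for a single x/y/z triplet.

#include <emmintrin.h>

/* Hypothetical helper: subtract the low lanes of x1/y1/z1 from the rvec
 * stored at p, using the same pack (x,y into one register) and
 * read-modify-write pattern as the decrement routines in this hunk. */
static void
decrement_rvec_sketch(double *p, __m128d x1, __m128d y1, __m128d z1)
{
    __m128d dxy = _mm_unpacklo_pd(x1, y1);       /* [dx, dy]            */
    __m128d txy = _mm_loadu_pd(p);               /* current x, y        */
    __m128d tz  = _mm_load_sd(p + 2);            /* current z           */
    _mm_storeu_pd(p, _mm_sub_pd(txy, dxy));      /* x -= dx, y -= dy    */
    _mm_store_sd(p + 2, _mm_sub_sd(tz, z1));     /* z -= dz (low lane)  */
}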
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _t1 = _mm_sub_pd(_t1, _x1); \
- _t2 = _mm_sub_pd(_t2, _z1); \
- _t3 = _mm_sub_pd(_t3, _y2); \
- _t4 = _mm_sub_pd(_t4, _x3); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+6, t4);
_mm_store_sd(ptrA+8, t5);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _z3 = _mm_unpacklo_pd(_z3, _x4); \
- _y4 = _mm_unpacklo_pd(_y4, _z4); \
- _mm_storeu_pd(ptrA, _mm_sub_pd( _t1, _x1 )); \
- _mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2, _z1 )); \
- _mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3, _y2 )); \
- _mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4, _x3 )); \
- _mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5, _z3 )); \
- _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5, z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1)
{
_mm_store_sd(ptrB+2, t4);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrB); \
- _t7 = _mm_loadu_pd(ptrB+2); \
- _t8 = _mm_loadu_pd(ptrB+4); \
- _t9 = _mm_loadu_pd(ptrB+6); \
- _t10 = _mm_load_sd(ptrB+8); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpackhi_pd(_z3, _z3); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _t6 = _mm_sub_pd(_t6, _tB); \
- _t7 = _mm_sub_pd(_t7, _tD); \
- _t8 = _mm_sub_pd(_t8, _tF); \
- _t9 = _mm_sub_pd(_t9, _tH); \
- _t10 = _mm_sub_sd(_t10, _tI); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrB, _t6); \
- _mm_storeu_pd(ptrB+2, _t7); \
- _mm_storeu_pd(ptrB+4, _t8); \
- _mm_storeu_pd(ptrB+6, _t9); \
- _mm_store_sd(ptrB+8, _t10); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+6, t9);
_mm_store_sd(ptrB+8, t10);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _t7 = _mm_loadu_pd(ptrB); \
- _t8 = _mm_loadu_pd(ptrB+2); \
- _t9 = _mm_loadu_pd(ptrB+4); \
- _t10 = _mm_loadu_pd(ptrB+6); \
- _t11 = _mm_loadu_pd(ptrB+8); \
- _t12 = _mm_loadu_pd(ptrB+10); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpacklo_pd(_z3, _x4); \
- _tJ = _mm_unpackhi_pd(_z3, _x4); \
- _tK = _mm_unpacklo_pd(_y4, _z4); \
- _tL = _mm_unpackhi_pd(_y4, _z4); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_pd(_t5, _tI); \
- _t6 = _mm_sub_pd(_t6, _tK); \
- _t7 = _mm_sub_pd(_t7, _tB); \
- _t8 = _mm_sub_pd(_t8, _tD); \
- _t9 = _mm_sub_pd(_t9, _tF); \
- _t10 = _mm_sub_pd(_t10, _tH); \
- _t11 = _mm_sub_pd(_t11, _tJ); \
- _t12 = _mm_sub_pd(_t12, _tL); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_storeu_pd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrA+10, _t6); \
- _mm_storeu_pd(ptrB, _t7); \
- _mm_storeu_pd(ptrB+2, _t8); \
- _mm_storeu_pd(ptrB+4, _t9); \
- _mm_storeu_pd(ptrB+6, _t10); \
- _mm_storeu_pd(ptrB+8, _t11); \
- _mm_storeu_pd(ptrB+10, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+8, t11);
_mm_storeu_pd(ptrB+10, t12);
}
-#endif
-
-
-
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
double * gmx_restrict fptr,
double * gmx_restrict fshiftptr)
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- GMX_MM_TRANSPOSE2_PD(fix1, fiy1); \
- GMX_MM_TRANSPOSE2_PD(fiz1, fix2); \
- GMX_MM_TRANSPOSE2_PD(fiy2, fiz2); \
- _t1 = fix3; \
- fix3 = _mm_unpacklo_pd(fix3, fiy3); \
- fiy3 = _mm_unpackhi_pd(_t1, fiy3); \
- fix1 = _mm_add_pd(fix1, fiy1); \
- fiz1 = _mm_add_pd(fiz1, fix2); \
- fiy2 = _mm_add_pd(fiy2, fiz2); \
- fix3 = _mm_add_pd(fix3, fiy3); \
- fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3, fiz3)); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
- fix1 = _mm_add_pd(fix1, fix3); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- fiz1 = _mm_add_sd(fiz1, _t2); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- GMX_MM_TRANSPOSE2_PD(fix1, fiy1); \
- GMX_MM_TRANSPOSE2_PD(fiz1, fix2); \
- GMX_MM_TRANSPOSE2_PD(fiy2, fiz2); \
- GMX_MM_TRANSPOSE2_PD(fix3, fiy3); \
- GMX_MM_TRANSPOSE2_PD(fiz3, fix4); \
- GMX_MM_TRANSPOSE2_PD(fiy4, fiz4); \
- fix1 = _mm_add_pd(fix1, fiy1); \
- fiz1 = _mm_add_pd(fiz1, fix2); \
- fiy2 = _mm_add_pd(fiy2, fiz2); \
- fix3 = _mm_add_pd(fix3, fiy3); \
- fiz3 = _mm_add_pd(fiz3, fix4); \
- fiy4 = _mm_add_pd(fiy4, fiz4); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); \
- _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
- fix3 = _mm_add_pd(fix3, _t2); \
- fix1 = _mm_add_pd(fix1, fix3); \
- fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
- fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
pot1 = _mm_add_pd(pot1, _mm_unpackhi_pd(pot1, pot1));
_mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
__m128d pot2, double * gmx_restrict ptrB)
{
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3) _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
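A trivial usage sketch of the macro with made-up values; pairing the adds as
(t0+t1)+(t2+t3) keeps the dependency chain two additions deep instead of
three.

__m128 f0 = _mm_set1_ps(1.0f), f1 = _mm_set1_ps(2.0f);
__m128 f2 = _mm_set1_ps(3.0f), f3 = _mm_set1_ps(4.0f);
__m128 ft = gmx_mm_sum4_ps(f0, f1, f2, f3);   /* every lane holds 10.0f */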
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy) ), _mm_mul_ps(dz, dz) );
}
-static int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128 a, __m128 b)
{
return _mm_movemask_ps(_mm_cmplt_ps(a, b));
/* Load a single value from 1-4 places, merge into xmm register */
-static __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
return _mm_unpacklo_ps(t1, t2);
}
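A usage sketch with made-up inputs; the lane order shown is an assumption,
since the unpack sequence is only partially visible in this hunk.

float qA = 1.0f, qB = 2.0f, qC = 3.0f, qD = 4.0f;   /* assumed inputs */
__m128 q = gmx_mm_load_4real_swizzle_ps(&qA, &qB, &qC, &qD);
/* q = [qA, qB, qC, qD]: one parameter gathered for four particles */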
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
/* Similar to store, but increments value in memory */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
const float * gmx_restrict p2,
const float * gmx_restrict p3,
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
- __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_shuffle_ps(_z3, _z3, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t18 = _mm_movehl_ps(_z3, _z3); \
- _t19 = _mm_shuffle_ps(_t18, _t18, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t20 = _mm_movelh_ps(_x1, _z1); \
- _t21 = _mm_movehl_ps(_z1, _x1); \
- _t22 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t23 = _mm_movelh_ps(_y2, _x3); \
- _t24 = _mm_movehl_ps(_x3, _y2); \
- _t25 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_load_ss(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t20); \
- _t2 = _mm_sub_ps(_t2, _t23); \
- _t3 = _mm_sub_ss(_t3, _z3); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_store_ss(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_load_ss(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _t21); \
- _t5 = _mm_sub_ps(_t5, _t24); \
- _t6 = _mm_sub_ss(_t6, _t17); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_store_ss(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_load_ss(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t22); \
- _t8 = _mm_sub_ps(_t8, _t25); \
- _t9 = _mm_sub_ss(_t9, _t18); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_store_ss(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_load_ss(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ss(_t12, _t19); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_store_ss(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_store_ss(ptrD+8, t12);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
- __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
- __m128 _t23, _t24; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_unpackhi_ps(_z3, _x4); \
- _z3 = _mm_unpacklo_ps(_z3, _x4); \
- _t18 = _mm_unpackhi_ps(_y4, _z4); \
- _y4 = _mm_unpacklo_ps(_y4, _z4); \
- _t19 = _mm_movelh_ps(_x1, _z1); \
- _z1 = _mm_movehl_ps(_z1, _x1); \
- _t20 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t21 = _mm_movelh_ps(_y2, _x3); \
- _x3 = _mm_movehl_ps(_x3, _y2); \
- _t22 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t23 = _mm_movelh_ps(_z3, _y4); \
- _y4 = _mm_movehl_ps(_y4, _z3); \
- _t24 = _mm_movelh_ps(_t17, _t18); \
- _t18 = _mm_movehl_ps(_t18, _t17); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_loadu_ps(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t19); \
- _t2 = _mm_sub_ps(_t2, _t21); \
- _t3 = _mm_sub_ps(_t3, _t23); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_storeu_ps(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_loadu_ps(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _z1); \
- _t5 = _mm_sub_ps(_t5, _x3); \
- _t6 = _mm_sub_ps(_t6, _y4); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_storeu_ps(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_loadu_ps(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t20); \
- _t8 = _mm_sub_ps(_t8, _t22); \
- _t9 = _mm_sub_ps(_t9, _t24); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_storeu_ps(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_loadu_ps(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ps(_t12, _t18); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_storeu_ps(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_storeu_ps(ptrD+8, t12);
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
_mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4; \
-\
- _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2); \
- _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3); \
- _t2 = _mm_movehl_ps(_mm_setzero_ps(), fiz3); \
- _t1 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t3 = _mm_shuffle_ps(_t2, _t2, _MM_SHUFFLE(0, 0, 0, 1)); \
- fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2)); \
- fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3)); \
- fiz3 = _mm_add_ss(_mm_add_ps(fiz3, _t1), _mm_add_ps(_t2, _t3)); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
- _t4 = _mm_load_ss(fshiftptr+2); \
- _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
- _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
- _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
- _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
- _t3 = _mm_shuffle_ps(_t3, _t3, _MM_SHUFFLE(1, 2, 0, 0)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _mm_store_ss(fshiftptr+2, _t1); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
_mm_store_ss(fshiftptr+2, t1);
_mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5; \
- _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2); \
- _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3); \
- _MM_TRANSPOSE4_PS(fiz3, fix4, fiy4, fiz4); \
- fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2)); \
- fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3)); \
- fiz3 = _mm_add_ps(_mm_add_ps(fiz3, fix4), _mm_add_ps(fiy4, fiz4)); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
- _t5 = _mm_load_ss(fshiftptr+2); \
- _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
- _t1 = _mm_shuffle_ps(fix1, fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
- _t2 = _mm_shuffle_ps(fiy2, fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
- _t3 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
- _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
- _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _t5 = _mm_add_ps(_t5, _t1); \
- _mm_store_ss(fshiftptr+2, _t5); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
_mm_store_ss(fshiftptr+2, t5);
_mm_storeh_pi((__m64 *)(fshiftptr), t5);
}
-#endif
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
pot1 = _mm_add_ps(pot1, _mm_movehl_ps(_mm_setzero_ps(), pot1));
_mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
__m128 pot2, float * gmx_restrict ptrB)
{
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3) _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
-static int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
/* Load a double value from 1-2 places, merge into xmm register */
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB)
{
return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
return _mm_load_sd(ptrA);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB,
__m128d xmm1)
_mm_store_sd(ptrB, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
_mm_store_sd(ptrA, xmm1);
/* Similar to store, but increments value in memory */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB, __m128d xmm1)
{
_mm_store_sd(ptrB, t1);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
__m128d tmp;
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
const double * gmx_restrict p2,
__m128d * gmx_restrict c6,
*c12 = _mm_unpackhi_pd(t1, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict c6,
__m128d * gmx_restrict c12)
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
*z = _mm_load_sd(p1+2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
*z3 = _mm_load_sd(p1+8);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
/* Routines to decrement rvec in memory, typically used for j particle force updates */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
__m128d xy, __m128d z)
{
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1)
{
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _t1 = _mm_sub_pd(_t1, _x1); \
- _t2 = _mm_sub_pd(_t2, _z1); \
- _t3 = _mm_sub_pd(_t3, _y2); \
- _t4 = _mm_sub_pd(_t4, _x3); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+6, t4);
_mm_store_sd(ptrA+8, t5);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _z3 = _mm_unpacklo_pd(_z3, _x4); \
- _y4 = _mm_unpacklo_pd(_y4, _z4); \
- _mm_storeu_pd(ptrA, _mm_sub_pd( _t1, _x1 )); \
- _mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2, _z1 )); \
- _mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3, _y2 )); \
- _mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4, _x3 )); \
- _mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5, _z3 )); \
- _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5, z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1)
{
_mm_store_sd(ptrB+2, t4);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrB); \
- _t7 = _mm_loadu_pd(ptrB+2); \
- _t8 = _mm_loadu_pd(ptrB+4); \
- _t9 = _mm_loadu_pd(ptrB+6); \
- _t10 = _mm_load_sd(ptrB+8); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpackhi_pd(_z3, _z3); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _t6 = _mm_sub_pd(_t6, _tB); \
- _t7 = _mm_sub_pd(_t7, _tD); \
- _t8 = _mm_sub_pd(_t8, _tF); \
- _t9 = _mm_sub_pd(_t9, _tH); \
- _t10 = _mm_sub_sd(_t10, _tI); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrB, _t6); \
- _mm_storeu_pd(ptrB+2, _t7); \
- _mm_storeu_pd(ptrB+4, _t8); \
- _mm_storeu_pd(ptrB+6, _t9); \
- _mm_store_sd(ptrB+8, _t10); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+6, t9);
_mm_store_sd(ptrB+8, t10);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _t7 = _mm_loadu_pd(ptrB); \
- _t8 = _mm_loadu_pd(ptrB+2); \
- _t9 = _mm_loadu_pd(ptrB+4); \
- _t10 = _mm_loadu_pd(ptrB+6); \
- _t11 = _mm_loadu_pd(ptrB+8); \
- _t12 = _mm_loadu_pd(ptrB+10); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpacklo_pd(_z3, _x4); \
- _tJ = _mm_unpackhi_pd(_z3, _x4); \
- _tK = _mm_unpacklo_pd(_y4, _z4); \
- _tL = _mm_unpackhi_pd(_y4, _z4); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_pd(_t5, _tI); \
- _t6 = _mm_sub_pd(_t6, _tK); \
- _t7 = _mm_sub_pd(_t7, _tB); \
- _t8 = _mm_sub_pd(_t8, _tD); \
- _t9 = _mm_sub_pd(_t9, _tF); \
- _t10 = _mm_sub_pd(_t10, _tH); \
- _t11 = _mm_sub_pd(_t11, _tJ); \
- _t12 = _mm_sub_pd(_t12, _tL); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_storeu_pd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrA+10, _t6); \
- _mm_storeu_pd(ptrB, _t7); \
- _mm_storeu_pd(ptrB+2, _t8); \
- _mm_storeu_pd(ptrB+4, _t9); \
- _mm_storeu_pd(ptrB+6, _t10); \
- _mm_storeu_pd(ptrB+8, _t11); \
- _mm_storeu_pd(ptrB+10, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+8, t11);
_mm_storeu_pd(ptrB+10, t12);
}
-#endif
-
-
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
double * gmx_restrict fptr,
double * gmx_restrict fshiftptr)
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- fix1 = _mm_hadd_pd(fix1, fiy1); \
- fiz1 = _mm_hadd_pd(fiz1, fix2); \
- fiy2 = _mm_hadd_pd(fiy2, fiz2); \
- fix3 = _mm_hadd_pd(fix3, fiy3); \
- fiz3 = _mm_hadd_pd(fiz3, fiz3); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
- fix1 = _mm_add_pd(fix1, fix3); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- fiz1 = _mm_add_sd(fiz1, _t2); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- fix1 = _mm_hadd_pd(fix1, fiy1); \
- fiz1 = _mm_hadd_pd(fiz1, fix2); \
- fiy2 = _mm_hadd_pd(fiy2, fiz2); \
- fix3 = _mm_hadd_pd(fix3, fiy3); \
- fiz3 = _mm_hadd_pd(fiz3, fix4); \
- fiy4 = _mm_hadd_pd(fiy4, fiz4); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); \
- _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
- fix3 = _mm_add_pd(fix3, _t2); \
- fix1 = _mm_add_pd(fix1, fix3); \
- fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
- fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
pot1 = _mm_hadd_pd(pot1, pot1);
_mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
__m128d pot2, double * gmx_restrict ptrB)
{
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3) _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy) ), _mm_mul_ps(dz, dz) );
}
-static gmx_inline int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128 a, __m128 b)
{
return _mm_movemask_ps(_mm_cmplt_ps(a, b));
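/* Hypothetical usage sketch, not from this patch, assuming the definitions
 * in this header: gmx_mm_any_lt returns a nonzero bitmask when any SIMD
 * element of a is below b, so a kernel can skip a whole block of
 * interactions when no squared distance is within the cutoff: */
static int
example_any_within_cutoff(__m128 rsq, float rcut2)
{
    return gmx_mm_any_lt(rsq, _mm_set1_ps(rcut2));
}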
/* Load a single value from 1-4 places, merge into xmm register */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
return _mm_unpacklo_ps(t1, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
/* Similar to store, but increments value in memory */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
const float * gmx_restrict p2,
const float * gmx_restrict p3,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * ptrA,
float * ptrB,
float * ptrC,
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
- __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_shuffle_ps(_z3, _z3, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t18 = _mm_movehl_ps(_z3, _z3); \
- _t19 = _mm_shuffle_ps(_t18, _t18, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t20 = _mm_movelh_ps(_x1, _z1); \
- _t21 = _mm_movehl_ps(_z1, _x1); \
- _t22 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t23 = _mm_movelh_ps(_y2, _x3); \
- _t24 = _mm_movehl_ps(_x3, _y2); \
- _t25 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_load_ss(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t20); \
- _t2 = _mm_sub_ps(_t2, _t23); \
- _t3 = _mm_sub_ss(_t3, _z3); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_store_ss(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_load_ss(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _t21); \
- _t5 = _mm_sub_ps(_t5, _t24); \
- _t6 = _mm_sub_ss(_t6, _t17); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_store_ss(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_load_ss(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t22); \
- _t8 = _mm_sub_ps(_t8, _t25); \
- _t9 = _mm_sub_ss(_t9, _t18); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_store_ss(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_load_ss(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ss(_t12, _t19); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_store_ss(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_store_ss(ptrD+8, t12);
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
- __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
- __m128 _t23, _t24; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_unpackhi_ps(_z3, _x4); \
- _z3 = _mm_unpacklo_ps(_z3, _x4); \
- _t18 = _mm_unpackhi_ps(_y4, _z4); \
- _y4 = _mm_unpacklo_ps(_y4, _z4); \
- _t19 = _mm_movelh_ps(_x1, _z1); \
- _z1 = _mm_movehl_ps(_z1, _x1); \
- _t20 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t21 = _mm_movelh_ps(_y2, _x3); \
- _x3 = _mm_movehl_ps(_x3, _y2); \
- _t22 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t23 = _mm_movelh_ps(_z3, _y4); \
- _y4 = _mm_movehl_ps(_y4, _z3); \
- _t24 = _mm_movelh_ps(_t17, _t18); \
- _t18 = _mm_movehl_ps(_t18, _t17); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_loadu_ps(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t19); \
- _t2 = _mm_sub_ps(_t2, _t21); \
- _t3 = _mm_sub_ps(_t3, _t23); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_storeu_ps(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_loadu_ps(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _z1); \
- _t5 = _mm_sub_ps(_t5, _x3); \
- _t6 = _mm_sub_ps(_t6, _y4); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_storeu_ps(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_loadu_ps(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t20); \
- _t8 = _mm_sub_ps(_t8, _t22); \
- _t9 = _mm_sub_ps(_t9, _t24); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_storeu_ps(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_loadu_ps(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ps(_t12, _t18); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_storeu_ps(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_storeu_ps(ptrD+8, t12);
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
_mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4; \
-\
- fix1 = _mm_hadd_ps(fix1, fiy1); \
- fiz1 = _mm_hadd_ps(fiz1, fix2); \
- fiy2 = _mm_hadd_ps(fiy2, fiz2); \
- fix3 = _mm_hadd_ps(fix3, fiy3); \
- fiz3 = _mm_hadd_ps(fiz3, fiz3); \
- fix1 = _mm_hadd_ps(fix1, fiz1); \
- fiy2 = _mm_hadd_ps(fiy2, fix3); \
- fiz3 = _mm_hadd_ps(fiz3, fiz3); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
- _t4 = _mm_load_ss(fshiftptr+2); \
- _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
- _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
- _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
- _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
- _t3 = _mm_shuffle_ps(_t3, _t3, _MM_SHUFFLE(1, 2, 0, 0)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _mm_store_ss(fshiftptr+2, _t1); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
_mm_store_ss(fshiftptr+2, t1);
_mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5; \
-\
- fix1 = _mm_hadd_ps(fix1, fiy1); \
- fiz1 = _mm_hadd_ps(fiz1, fix2); \
- fiy2 = _mm_hadd_ps(fiy2, fiz2); \
- fix3 = _mm_hadd_ps(fix3, fiy3); \
- fiz3 = _mm_hadd_ps(fiz3, fix4); \
- fiy4 = _mm_hadd_ps(fiy4, fiz4); \
- fix1 = _mm_hadd_ps(fix1, fiz1); \
- fiy2 = _mm_hadd_ps(fiy2, fix3); \
- fiz3 = _mm_hadd_ps(fiz3, fiy4); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
- _t5 = _mm_load_ss(fshiftptr+2); \
- _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
- _t1 = _mm_shuffle_ps(fix1, fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
- _t2 = _mm_shuffle_ps(fiy2, fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
- _t3 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
- _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
- _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _t5 = _mm_add_ps(_t5, _t1); \
- _mm_store_ss(fshiftptr+2, _t5); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
_mm_store_ss(fshiftptr+2, t5);
_mm_storeh_pi((__m64 *)(fshiftptr), t5);
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
pot1 = _mm_add_ps(pot1, _mm_movehl_ps(_mm_setzero_ps(), pot1));
_mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}
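/* Scalar reference for the reduction above (illustration only; the name
 * example_update_1pot_ref is hypothetical): sum the four lanes of the
 * potential and accumulate the result into memory. */
static void
example_update_1pot_ref(const float pot[4], float *ptrA)
{
    *ptrA += pot[0] + pot[1] + pot[2] + pot[3];
}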
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
__m128 pot2, float * gmx_restrict ptrB)
{
static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
__m128 *out0, __m128 *out1)
{
}
/* Collect element 2 of the 4 inputs to out */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_shuffle_4_ps_fil2_to_1_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3)
{
__m128 _c01, _c23;
}
/* Sum the elements within each input register and store the sums in out */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_transpose_sum4_pr(__m128 in0, __m128 in1,
__m128 in2, __m128 in3)
{
* prepare_table_load_buffer(), but it is only used with full-width
* AVX_256. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *ti,
__m128 *ctab0_S, __m128 *ctab1_S)
{
gmx_shuffle_4_ps_fil01_to_2_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3], ctab0_S, ctab1_S);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *ti,
__m128 *ctab0_S, __m128 *ctab1_S, __m128 *ctabv_S)
{
*ctabv_S = gmx_shuffle_4_ps_fil2_to_1_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3]);
}
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load1_exclfilter(int e)
{
return _mm_set1_epi32(e);
}
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load_exclusion_filter(const unsigned *i)
{
return gmx_simd_load_i(i);
}
-static gmx_inline gmx_simd_bool_t
+static gmx_inline gmx_simd_bool_t gmx_simdcall
gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
{
return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
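/* Scalar analogue of gmx_checkbitmask_pb above (illustration only): an
 * element evaluates to true when every bit set in the filter m1 is also
 * set in the mask m0, i.e. (~m0 & m1) == 0 per element. */
static int
example_checkbitmask_ref(unsigned m0, unsigned m1)
{
    return ((~m0) & m1) == 0;
}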
#define gmx_sub_hpr _mm_sub_ps
/* Sum over 4 half SIMD registers */
-static __m128 gmx_sum4_hpr(__m256 x, __m256 y)
+static __m128 gmx_simdcall gmx_sum4_hpr(__m256 x, __m256 y)
{
__m256 sum;
*a = _mm256_insertf128_ps(_mm256_castps128_ps256(tmp), tmp, 0x1);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
{
*b = _mm256_extractf128_ps(a, 0);
}
/* Store half width SIMD registers a and b in full width register *c */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_real_t *c)
{
*c = _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 0x1);
#endif /* GMX_NBNXN_SIMD_2XNN */
/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
__m128 *out0, __m128 *out1)
{
}
/* Collect element 2 of the 4 inputs to out */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_shuffle_4_ps_fil2_to_1_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3)
{
__m128 _c01, _c23;
}
/* Sum the elements within each input register and return the sums */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_transpose_sum4_pr(__m256 in0, __m256 in1,
__m256 in2, __m256 in3)
{
}
/* Sum the elements of halfs of each input register and return the sums */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_transpose_sum4h_pr(__m256 in0, __m256 in2)
{
in0 = _mm256_hadd_ps(in0, _mm256_setzero_ps());
}
/* Put two 128-bit 4-float registers into one 256-bit 8-float register */
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_2_mm_to_m256(__m128 in0, __m128 in1)
{
return _mm256_insertf128_ps(_mm256_castps128_ps256(in0), in1, 1);
* prepare_table_load_buffer(), but it is only used with full-width
* AVX_256. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
__m256 *ctab0_S, __m256 *ctab1_S)
{
*ctab1_S = gmx_2_mm_to_m256(ctabt_S[2], ctabt_S[3]);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
__m256 *ctab0_S, __m256 *ctab1_S, __m256 *ctabv_S)
{
typedef gmx_simd_int32_t gmx_exclfilter;
static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load1_exclfilter(int e)
{
return _mm256_set1_epi32(e);
}
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load_exclusion_filter(const unsigned *i)
{
return gmx_simd_load_i(i);
}
-static gmx_inline gmx_simd_bool_t
+static gmx_inline gmx_simd_bool_t gmx_simdcall
gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
{
return _mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_andnot_si256(m0, m1), _mm256_setzero_si256()));
typedef gmx_simd_real_t gmx_exclfilter;
static const int filter_stride = 1;
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load1_exclfilter(int e)
{
return _mm256_castsi256_ps(_mm256_set1_epi32(e));
}
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load_exclusion_filter(const unsigned *i)
{
return gmx_simd_load_r((real *) (i));
}
-static gmx_inline gmx_simd_bool_t
+static gmx_inline gmx_simd_bool_t gmx_simdcall
gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
{
return _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c);
#include "../nbnxn_kernel_simd_utils.h"
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_load_simd_2xnn_interactions(int excl,
gmx_exclfilter filter_S0,
gmx_exclfilter filter_S2,
#include "../nbnxn_kernel_simd_utils.h"
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_load_simd_4xn_interactions(int gmx_unused excl,
gmx_exclfilter gmx_unused filter_S0,
gmx_exclfilter gmx_unused filter_S1,
/****************************************************
* IMPLEMENTATION HELPER FUNCTIONS *
****************************************************/
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
gmx_simd_setzero_ibm_qpx(void)
{
return vec_splats(0.0);
}
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
gmx_simd_get_exponent_ibm_qpx(vector4double x)
{
const gmx_int64_t expmask = 0x7ff0000000000000LL;
return vec_cfid(vec_ld(0, idata));
}
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
gmx_simd_get_mantissa_ibm_qpx(vector4double x)
{
const gmx_int64_t exp_and_sign_mask = 0xfff0000000000000LL;
return vec_ld(0, idata);
}
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
gmx_simd_set_exponent_ibm_qpx(vector4double x)
{
const gmx_int64_t expbase = 1023;
return vec_ld(0, idata);
}
-static __attribute__((always_inline)) double
+static __attribute__((always_inline)) double gmx_simdcall
gmx_simd_reduce_ibm_qpx(vector4double x)
{
vector4double y = vec_sldw(x, x, 2);
return vec_extract(y, 0);
}
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
gmx_simd_set1_int_ibm_qpx(int i)
{
int idata[4] __attribute__((aligned(32)));
}
/* This works in both single and double */
-static __attribute__((always_inline)) int
+static __attribute__((always_inline)) int gmx_simdcall
gmx_simd_anytrue_bool_ibm_qpx(vector4double a)
{
vector4double b = vec_sldw(a, a, 2);
#define gmx_simd4_blendv_d gmx_simd_blendv_d
#define gmx_simd4_reduce_d gmx_simd_reduce_d
-static __attribute__((always_inline)) double
+static __attribute__((always_inline)) double gmx_simdcall
gmx_simd4_dotproduct3_d_ibm_qpx(vector4double a, vector4double b)
{
vector4double dp_sh0 = vec_mul(a, b);
return vec_extract(dp, 0);
}
-static __attribute__((always_inline)) float
+static __attribute__((always_inline)) float gmx_simdcall
gmx_simd4_dotproduct3_f_ibm_qpx(vector4double a, vector4double b)
{
return (float)gmx_simd4_dotproduct3_d_ibm_qpx(a, b);
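/* Scalar reference for the SIMD4 3-component dot product above
 * (illustration only): only the first three elements contribute, matching
 * its use on 3D vectors. */
static double
example_dotproduct3_ref(const double a[4], const double b[4])
{
    return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
}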
#define mask_hih _mm512_int2mask(0xFF00)
/* load store float */
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_loadu_f_mic(const float * m)
{
return _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m+16);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_storeu_f_mic(float * m, __m512 s)
{
_mm512_packstorelo_ps(m, s);
}
/* load store fint32 */
-static gmx_inline __m512i
+static gmx_inline __m512i gmx_simdcall
gmx_simd_loadu_fi_mic(const gmx_int32_t * m)
{
return _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m+16);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_storeu_fi_mic(gmx_int32_t * m, __m512i s)
{
_mm512_packstorelo_epi32(m, s);
}
/* load store double */
-static gmx_inline __m512d
+static gmx_inline __m512d gmx_simdcall
gmx_simd_loadu_d_mic(const double * m)
{
return _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m+8);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_storeu_d_mic(double * m, __m512d s)
{
_mm512_packstorelo_pd(m, s);
}
/* load store dint32 */
-static gmx_inline __m512i
+static gmx_inline __m512i gmx_simdcall
gmx_simd_loadu_di_mic(const gmx_int32_t * m)
{
return _mm512_mask_loadunpackhi_epi32(_mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), mask_loh, m), mask_loh, m+16);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_storeu_di_mic(gmx_int32_t * m, __m512i s)
{
_mm512_mask_packstorelo_epi32(m, mask_loh, s);
}
/* load store simd4 */
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd4_loadu_f_mic(const float * m)
{
return _mm512_mask_loadunpackhi_ps(_mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), gmx_simd4_mask, m), gmx_simd4_mask, m+16);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd4_storeu_f_mic(float * m, __m512 s)
{
_mm512_mask_packstorelo_ps(m, gmx_simd4_mask, s);
_mm512_mask_packstorehi_ps(m+16, gmx_simd4_mask, s);
}
-static gmx_inline __m512d
+static gmx_inline __m512d gmx_simdcall
gmx_simd4_loadu_d_mic(const double * m)
{
return _mm512_mask_loadunpackhi_pd(_mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), gmx_simd4_mask, m), gmx_simd4_mask, m+8);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd4_storeu_d_mic(double * m, __m512d s)
{
_mm512_mask_packstorelo_pd(m, gmx_simd4_mask, s);
}
/* extract */
-static gmx_inline gmx_int32_t
+static gmx_inline gmx_int32_t gmx_simdcall
gmx_simd_extract_fi_mic(gmx_simd_fint32_t a, int index)
{
int r;
return r;
}
-static gmx_inline gmx_int32_t
+static gmx_inline gmx_int32_t gmx_simdcall
gmx_simd_extract_di_mic(gmx_simd_dint32_t a, int index)
{
int r;
/* This is likely faster than the built-in scale operation (latency 8,
 * throughput 3) since we only work on the integer part and use shifts.
 * TODO: check this, given that scale also only operates on the integer part.
 */
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_set_exponent_f_mic(__m512 a)
{
__m512i iexp = gmx_simd_cvt_f2i(a);
*/
}
-static gmx_inline __m512d
+static gmx_inline __m512d gmx_simdcall
gmx_simd_set_exponent_d_mic(__m512d a)
{
const __m512i expbias = _mm512_set1_epi32(1023);
return _mm512_castsi512_pd(iexp);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_cvt_f2dd_mic(__m512 f, __m512d * d0, __m512d * d1)
{
__m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f), _MM_PERM_CDCD);
*d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
}
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_cvt_dd2f_mic(__m512d d0, __m512d d1)
{
__m512 f0 = _mm512_cvtpd_pslo(d0);
return _mm512_mask_permute4f128_ps(f0, mask_hih, f1, PERM_LOW2HIGH);
}
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_exp2_f_mic(__m512 x)
{
return _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(x, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
}
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_exp_f_mic(__m512 x)
{
/* only 59 ulp accuracy, so we need to do an extra iteration
return _mm512_mask_fmadd_ps(r, m, t, r);
}
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_log_f_mic(__m512 x)
{
return _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764), _mm512_log2ae23_ps(x));
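/* Scalar identity behind the MIC implementation above (illustration only):
 * the natural logarithm is obtained from the hardware base-2 logarithm as
 * ln(x) = ln(2)*log2(x), with ln(2) ~= 0.6931471805599453. */
#include <math.h>
static float
example_log_via_log2(float x)
{
    return 0.69314718055994530942f*log2f(x);
}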
/*********************************************************
* SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
*********************************************************/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_get_exponent_f_avx2_256(gmx_simd_float_t x)
{
const __m256 expmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
return _mm256_cvtepi32_ps(iexp);
}
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_set_exponent_f_avx2_256(gmx_simd_float_t x)
{
const __m256i expbias = _mm256_set1_epi32(127);
/*********************************************************
* SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
*********************************************************/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_get_exponent_d_avx2_256(gmx_simd_double_t x)
{
const __m256d expmask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FF0000000000000LL));
return _mm256_cvtepi32_pd(iexp128);
}
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_set_exponent_d_avx2_256(gmx_simd_double_t x)
{
const __m256i expbias = _mm256_set1_epi64x(1023LL);
return _mm256_castsi256_pd(iexp);
}
-static gmx_inline gmx_simd_dibool_t
+static gmx_inline gmx_simd_dibool_t gmx_simdcall
gmx_simd_cvt_db2dib_avx2_256(gmx_simd_dbool_t a)
{
__m128i ia = _mm256_castsi256_si128(_mm256_castpd_si256(a));
return ia;
}
-static gmx_inline gmx_simd_dbool_t
+static gmx_inline gmx_simd_dbool_t gmx_simdcall
gmx_simd_cvt_dib2db_avx2_256(gmx_simd_dibool_t ia)
{
__m128d lo = _mm_castsi128_pd(_mm_unpacklo_epi32(ia, ia));
#define gmx_simd4_cvt_f2d _mm256_cvtps_pd
#define gmx_simd4_cvt_d2f _mm256_cvtpd_ps
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd4_reduce_d_avx_128_fma(__m256d a)
{
double f;
return f;
}
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd4_dotproduct3_d_avx_128_fma(__m256d a, __m256d b)
{
double d;
/*********************************************************
* SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
*********************************************************/
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_simd_get_exponent_f_avx_256(__m256 x)
{
const __m256 expmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
return _mm256_cvtepi32_ps(iexp256);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_simd_get_mantissa_f_avx_256(__m256 x)
{
const __m256 mantmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x007FFFFF));
return _mm256_or_ps(x, one);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_simd_set_exponent_f_avx_256(__m256 x)
{
const __m128i expbias = _mm_set1_epi32(127);
return _mm256_castsi256_ps(iexp256);
}
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd_reduce_f_avx_256(__m256 a)
{
float f;
/*********************************************************
* SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
*********************************************************/
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_simd_get_exponent_d_avx_256(__m256d x)
{
const __m256d expmask = _mm256_castsi256_pd( _mm256_set1_epi64x(0x7FF0000000000000LL));
return _mm256_cvtepi32_pd(iexp128a);
}
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_simd_get_mantissa_d_avx_256(__m256d x)
{
const __m256d mantmask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x000FFFFFFFFFFFFFLL));
return _mm256_or_pd(x, one);
}
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_simd_set_exponent_d_avx_256(__m256d x)
{
const __m128i expbias = _mm_set1_epi32(1023);
return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(iexp128a), iexp128b, 0x1));
}
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd_reduce_d_avx_256(__m256d a)
{
double f;
return f;
}
-static gmx_inline gmx_simd_dibool_t
+static gmx_inline gmx_simd_dibool_t gmx_simdcall
gmx_simd_cvt_db2dib_avx_256(gmx_simd_dbool_t a)
{
__m128i a1 = _mm256_extractf128_si256(_mm256_castpd_si256(a), 0x1);
return _mm_blend_epi16(a0, a1, 0xF0);
}
-static gmx_inline gmx_simd_dbool_t
+static gmx_inline gmx_simd_dbool_t gmx_simdcall
gmx_simd_cvt_dib2db_avx_256(gmx_simd_dibool_t a)
{
__m128i a1 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2));
return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(a0), a1, 0x1));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_cvt_f2dd_avx_256(__m256 f, __m256d *d0, __m256d *d1)
{
*d0 = _mm256_cvtps_pd(_mm256_castps256_ps128(f));
*d1 = _mm256_cvtps_pd(_mm256_extractf128_ps(f, 0x1));
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_simd_cvt_dd2f_avx_256(__m256d d0, __m256d d1)
{
__m128 f0 = _mm256_cvtpd_ps(d0);
}
/* SIMD4 reduce helper */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd4_reduce_f_avx_256(__m128 a)
{
float f;
}
/* SIMD4 Dotproduct helper function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd4_dotproduct3_f_avx_256(__m128 a, __m128 b)
{
float f;
return f;
}
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd4_dotproduct3_d_avx_256(__m256d a, __m256d b)
{
double d;
/****************************************************
* SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
****************************************************/
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_simd_get_exponent_f_sse2(__m128 x)
{
const __m128 expmask = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
return _mm_cvtepi32_ps(iexp);
}
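/* Scalar reference for the exponent extraction above (illustration only):
 * mask out the 8 IEEE-754 exponent bits, shift away the 23 mantissa bits,
 * and remove the bias of 127. Valid for normalized inputs. */
static int
example_get_exponent_ref(float x)
{
    union { float f; unsigned u; } conv;
    conv.f = x;
    return (int)((conv.u >> 23) & 0xFFu) - 127;
}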
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_simd_get_mantissa_f_sse2(__m128 x)
{
const __m128 mantmask = _mm_castsi128_ps(_mm_set1_epi32(0x007FFFFF));
return _mm_or_ps(x, one);
}
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_simd_set_exponent_f_sse2(__m128 x)
{
const __m128i expbias = _mm_set1_epi32(127);
return _mm_castsi128_ps(iexp);
}
-static gmx_inline __m128i
+static gmx_inline __m128i gmx_simdcall
gmx_simd_mul_fi_sse2(__m128i a, __m128i b)
{
__m128i a1 = _mm_srli_si128(a, 4); /* - a[3] a[2] a[1] */
return _mm_unpacklo_epi32(c, c1);
}
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd_reduce_f_sse2(__m128 a)
{
__m128 b;
/****************************************************
* DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
****************************************************/
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_simd_get_exponent_d_sse2(__m128d x)
{
/* Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds */
return _mm_cvtepi32_pd(iexp);
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_simd_get_mantissa_d_sse2(__m128d x)
{
/* Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds */
return _mm_or_pd(x, one);
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_simd_set_exponent_d_sse2(__m128d x)
{
const __m128i expbias = _mm_set1_epi32(1023);
return _mm_castsi128_pd(iexp);
}
-static gmx_inline __m128i
+static gmx_inline __m128i gmx_simdcall
gmx_simd_mul_di_sse2(__m128i a, __m128i b)
{
__m128i c;
return _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 1, 2, 0)); /* 0 0 a[1]*b[1] a[0]*b[0] */
}
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd_reduce_d_sse2(__m128d a)
{
__m128d b;
#define gmx_simd4_reduce_f gmx_simd_reduce_f
/* SIMD4 Dotproduct helper function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd4_dotproduct3_f_sse2(__m128 a, __m128 b)
{
float f;
#define gmx_simd4_dotproduct3_f gmx_simd4_dotproduct3_f_sse4_1
/* SIMD reduction function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd_reduce_f_sse4_1(__m128 a)
{
float f;
}
/* SIMD4 Dotproduct helper function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd4_dotproduct3_f_sse4_1(__m128 a, __m128 b)
{
float f;
return f;
}
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd_reduce_d_sse4_1(__m128d a)
{
double f;
* \param d term 4 (multiple values)
* \return sum of terms 1-4 (multiple values)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_sum4_f(gmx_simd_float_t a, gmx_simd_float_t b,
gmx_simd_float_t c, gmx_simd_float_t d)
{
* with the exception that negative zero is not considered to be negative
* on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
{
#ifdef GMX_SIMD_HAVE_LOGICAL
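/* Scalar analogue of gmx_simd_xor_sign_f above (illustration only,
 * matching the branch without GMX_SIMD_HAVE_LOGICAL): the result is a with
 * its sign flipped wherever b is negative. */
static float
example_xor_sign_ref(float a, float b)
{
    return (b < 0.0f) ? -a : a;
}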
* \param x The reference (starting) value x for which we want 1/sqrt(x).
* \return An improved approximation with roughly twice as many bits of accuracy.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
{
# ifdef GMX_SIMD_HAVE_FMA
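/* Scalar form of the Newton-Raphson step above (illustration only): each
 * application roughly doubles the number of correct bits in the
 * approximation lu ~ 1/sqrt(x). */
static float
example_rsqrt_iter_ref(float lu, float x)
{
    return lu*(1.5f - 0.5f*x*lu*lu);
}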
* \param x Argument that must be >0. This routine does not check arguments.
* \return 1/sqrt(x). Result is undefined if your argument was invalid.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_invsqrt_f(gmx_simd_float_t x)
{
gmx_simd_float_t lu = gmx_simd_rsqrt_f(x);
* In particular for double precision we can sometimes calculate square root
* pairs slightly faster by using single precision until the very last step.
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0, gmx_simd_float_t x1,
gmx_simd_float_t *out0, gmx_simd_float_t *out1)
{
* \param x The reference (starting) value x for which we want 1/x.
* \return An improved approximation with roughly twice as many bits of accuracy.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
{
return gmx_simd_mul_f(lu, gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f)));
* \param x Argument that must be nonzero. This routine does not check arguments.
* \return 1/x. Result is undefined if your argument was invalid.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_inv_f(gmx_simd_float_t x)
{
gmx_simd_float_t lu = gmx_simd_rcp_f(x);
* \return sqrt(x). If x=0, the result will correctly be set to 0.
* The result is undefined if the input value is negative.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_sqrt_f(gmx_simd_float_t x)
{
gmx_simd_fbool_t mask;
* \result The natural logarithm of x. Undefined if argument is invalid.
*/
#ifndef gmx_simd_log_f
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_log_f(gmx_simd_float_t x)
{
const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
* \param x Argument.
* \result 2^x. Undefined if input argument caused overflow.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_exp2_f(gmx_simd_float_t x)
{
/* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
* \result exp(x). Undefined if input argument caused overflow,
* which can happen if abs(x) \> 7e13.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_exp_f(gmx_simd_float_t x)
{
const gmx_simd_float_t argscale = gmx_simd_set1_f(1.44269504088896341f);
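/* The scaling used above in scalar form (illustration only): exp(x) is
 * evaluated as 2^(x*log2(e)), so argscale = log2(e) ~= 1.44269504088896341. */
#include <math.h>
static float
example_exp_via_exp2(float x)
{
    return exp2f(x*1.44269504088896341f);
}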
* This routine achieves very close to full precision, but we do not care about
* the last bit or the subnormal result range.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_erf_f(gmx_simd_float_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
* (think results that are in the ballpark of 10^-30 for single precision,
* or 10^-200 for double) since that is not relevant for MD.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_erfc_f(gmx_simd_float_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
* magnitudes of the argument we inherently begin to lose accuracy due to the
 * argument reduction, despite using extended precision arithmetic internally.
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval)
{
/* Constants to subtract Pi/4*x from y while minimizing precision loss */
* \attention Do NOT call both sin & cos if you need both results, since each of them
* will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_sin_f(gmx_simd_float_t x)
{
gmx_simd_float_t s, c;
* \attention Do NOT call both sin & cos if you need both results, since each of them
* will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_cos_f(gmx_simd_float_t x)
{
gmx_simd_float_t s, c;
* \param x The argument to evaluate tan for
* \result Tan(x)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_tan_f(gmx_simd_float_t x)
{
const gmx_simd_float_t argred0 = gmx_simd_set1_f(1.5703125);
* \param x The argument to evaluate asin for
* \result Asin(x)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_asin_f(gmx_simd_float_t x)
{
const gmx_simd_float_t limitlow = gmx_simd_set1_f(1e-4f);
* \param x The argument to evaluate acos for
* \result Acos(x)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_acos_f(gmx_simd_float_t x)
{
const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
* \param x The argument to evaluate atan for
* \result Atan(x), same argument/value range as standard math library.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_atan_f(gmx_simd_float_t x)
{
const gmx_simd_float_t halfpi = gmx_simd_set1_f(M_PI/2);
* of any concern in Gromacs, and in particular it will not affect calculations
* of angles from vectors.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_atan2_f(gmx_simd_float_t y, gmx_simd_float_t x)
{
const gmx_simd_float_t pi = gmx_simd_set1_f(M_PI);
* For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
*
*/
-static gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_pmecorrF_f(gmx_simd_float_t z2)
{
const gmx_simd_float_t FN6 = gmx_simd_set1_f(-1.7357322914161492954e-8f);
* when added to \f$1/r\f$ the error will be insignificant.
* For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
*/
-static gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_pmecorrV_f(gmx_simd_float_t z2)
{
const gmx_simd_float_t VN6 = gmx_simd_set1_f(1.9296833005951166339e-8f);
*
* \copydetails gmx_simd_sum4_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_sum4_d(gmx_simd_double_t a, gmx_simd_double_t b,
gmx_simd_double_t c, gmx_simd_double_t d)
{
* with the exception that negative zero is not considered to be negative
* on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
{
#ifdef GMX_SIMD_HAVE_LOGICAL
*
* \copydetails gmx_simd_rsqrt_iter_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
{
#ifdef GMX_SIMD_HAVE_FMA
*
* \copydetails gmx_simd_invsqrt_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_invsqrt_d(gmx_simd_double_t x)
{
gmx_simd_double_t lu = gmx_simd_rsqrt_d(x);
*
* \copydetails gmx_simd_invsqrt_pair_f
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0, gmx_simd_double_t x1,
gmx_simd_double_t *out0, gmx_simd_double_t *out1)
{
*
* \copydetails gmx_simd_rcp_iter_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
{
return gmx_simd_mul_d(lu, gmx_simd_fnmadd_d(lu, x, gmx_simd_set1_d(2.0)));
*
* \copydetails gmx_simd_inv_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_inv_d(gmx_simd_double_t x)
{
gmx_simd_double_t lu = gmx_simd_rcp_d(x);
*
* \copydetails gmx_simd_sqrt_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_sqrt_d(gmx_simd_double_t x)
{
gmx_simd_dbool_t mask;
*
* \copydetails gmx_simd_log_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_log_d(gmx_simd_double_t x)
{
const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
*
* \copydetails gmx_simd_exp2_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_exp2_d(gmx_simd_double_t x)
{
const gmx_simd_double_t arglimit = gmx_simd_set1_d(1022.0);
*
* \copydetails gmx_simd_exp_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_exp_d(gmx_simd_double_t x)
{
const gmx_simd_double_t argscale = gmx_simd_set1_d(1.44269504088896340735992468100);
*
* \copydetails gmx_simd_erf_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_erf_d(gmx_simd_double_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
*
* \copydetails gmx_simd_erfc_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_erfc_d(gmx_simd_double_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
*
* \copydetails gmx_simd_sincos_f
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
{
/* Constants to subtract Pi/4*x from y while minimizing precision loss */
*
* \copydetails gmx_simd_sin_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_sin_d(gmx_simd_double_t x)
{
gmx_simd_double_t s, c;
*
* \copydetails gmx_simd_cos_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_cos_d(gmx_simd_double_t x)
{
gmx_simd_double_t s, c;
*
* \copydetails gmx_simd_tan_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_tan_d(gmx_simd_double_t x)
{
const gmx_simd_double_t argred0 = gmx_simd_set1_d(2*0.78539816290140151978);
*
* \copydetails gmx_simd_asin_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_asin_d(gmx_simd_double_t x)
{
/* Same algorithm as cephes library */
*
* \copydetails gmx_simd_acos_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_acos_d(gmx_simd_double_t x)
{
const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
*
* \copydetails gmx_simd_atan_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_atan_d(gmx_simd_double_t x)
{
/* Same algorithm as cephes library */
*
* \copydetails gmx_simd_atan2_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_atan2_d(gmx_simd_double_t y, gmx_simd_double_t x)
{
const gmx_simd_double_t pi = gmx_simd_set1_d(M_PI);
*
* \copydetails gmx_simd_pmecorrF_f
*/
-static gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_pmecorrF_d(gmx_simd_double_t z2)
{
const gmx_simd_double_t FN10 = gmx_simd_set1_d(-8.0072854618360083154e-14);
*
* \copydetails gmx_simd_pmecorrV_f
*/
-static gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_pmecorrV_d(gmx_simd_double_t z2)
{
const gmx_simd_double_t VN9 = gmx_simd_set1_d(-9.3723776169321855475e-13);
*
* \copydetails gmx_simd_sum4_f
*/
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
gmx_simd4_sum4_f(gmx_simd4_float_t a, gmx_simd4_float_t b,
gmx_simd4_float_t c, gmx_simd4_float_t d)
{
*
* \copydetails gmx_simd_rsqrt_iter_f
*/
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu, gmx_simd4_float_t x)
{
# ifdef GMX_SIMD_HAVE_FMA
*
* \copydetails gmx_simd_invsqrt_f
*/
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
gmx_simd4_invsqrt_f(gmx_simd4_float_t x)
{
gmx_simd4_float_t lu = gmx_simd4_rsqrt_f(x);
*
* \copydetails gmx_simd_sum4_f
*/
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
gmx_simd4_sum4_d(gmx_simd4_double_t a, gmx_simd4_double_t b,
gmx_simd4_double_t c, gmx_simd4_double_t d)
{
*
* \copydetails gmx_simd_rsqrt_iter_f
*/
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu, gmx_simd4_double_t x)
{
#ifdef GMX_SIMD_HAVE_FMA
*
* \copydetails gmx_simd_invsqrt_f
*/
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
gmx_simd4_invsqrt_d(gmx_simd4_double_t x)
{
gmx_simd4_double_t lu = gmx_simd4_rsqrt_d(x);
public:
::testing::AssertionResult
compareSimd4MathFunction(const char * refFuncExpr, const char *simd4FuncExpr,
- real refFunc(real x), gmx_simd4_real_t simd4Func(gmx_simd4_real_t x));
+ real refFunc(real x), gmx_simd4_real_t gmx_simdcall simd4Func(gmx_simd4_real_t x));
};
/*! \brief Test approximate equality of SIMD4 vs reference version of a function.
*/
::testing::AssertionResult
Simd4MathTest::compareSimd4MathFunction(const char * refFuncExpr, const char *simd4FuncExpr,
- real refFunc(real x), gmx_simd4_real_t simd4Func(gmx_simd4_real_t x))
+ real refFunc(real x), gmx_simd4_real_t gmx_simdcall simd4Func(gmx_simd4_real_t x))
{
std::vector<real> vx(GMX_SIMD4_WIDTH);
std::vector<real> vref(GMX_SIMD4_WIDTH);
public:
::testing::AssertionResult
compareSimdMathFunction(const char * refFuncExpr, const char *simdFuncExpr,
- real refFunc(real x), gmx_simd_real_t simdFunc(gmx_simd_real_t x));
+ real refFunc(real x), gmx_simd_real_t gmx_simdcall simdFunc(gmx_simd_real_t x));
};
/*! \brief Test approximate equality of SIMD vs reference version of a function.
*/
::testing::AssertionResult
SimdMathTest::compareSimdMathFunction(const char * refFuncExpr, const char *simdFuncExpr,
- real refFunc(real x), gmx_simd_real_t simdFunc(gmx_simd_real_t x))
+ real refFunc(real x), gmx_simd_real_t gmx_simdcall simdFunc(gmx_simd_real_t x))
{
std::vector<real> vx(GMX_SIMD_REAL_WIDTH);
std::vector<real> vref(GMX_SIMD_REAL_WIDTH);
}
/*! \brief Function wrapper to return first result when testing \ref gmx_simd_invsqrt_pair_r */
-gmx_simd_real_t
+gmx_simd_real_t gmx_simdcall
tst_invsqrt_pair0(gmx_simd_real_t x)
{
gmx_simd_real_t r0, r1;
}
/*! \brief Function wrapper to return second result when testing \ref gmx_simd_invsqrt_pair_r */
-gmx_simd_real_t
+gmx_simd_real_t gmx_simdcall
tst_invsqrt_pair1(gmx_simd_real_t x)
{
gmx_simd_real_t r0, r1;
*
* \note The SIMD part is that we calculate many scalar products in one call.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_iprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
gmx_simd_float_t bx, gmx_simd_float_t by, gmx_simd_float_t bz)
{
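/* Scalar form of the SIMD inner product gmx_simd_iprod_f above
 * (illustration only): each SIMD lane computes one independent dot product. */
static float
example_iprod_ref(float ax, float ay, float az,
                  float bx, float by, float bz)
{
    return ax*bx + ay*by + az*bz;
}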
* \note This corresponds to the scalar product of the vector with itself, but
* the compiler might be able to optimize it better with identical vectors.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_norm2_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az)
{
gmx_simd_float_t ret;
* The arguments x/y/z denotes the different components, and each element
* corresponds to a separate vector.
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_cprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
gmx_simd_float_t bx, gmx_simd_float_t by, gmx_simd_float_t bz,
gmx_simd_float_t *cx, gmx_simd_float_t *cy, gmx_simd_float_t *cz)
*
* \copydetails gmx_simd_iprod_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_iprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az,
gmx_simd_double_t bx, gmx_simd_double_t by, gmx_simd_double_t bz)
{
*
* \copydetails gmx_simd_norm2_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_norm2_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az)
{
gmx_simd_double_t ret;
*
* \copydetails gmx_simd_cprod_f
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_cprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az,
gmx_simd_double_t bx, gmx_simd_double_t by, gmx_simd_double_t bz,
gmx_simd_double_t *cx, gmx_simd_double_t *cy, gmx_simd_double_t *cz)
*
* \copydetails gmx_simd_norm2_f
*/
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
gmx_simd4_norm2_f(gmx_simd4_float_t ax, gmx_simd4_float_t ay, gmx_simd4_float_t az)
{
gmx_simd4_float_t ret;
*
* \copydetails gmx_simd_norm2_f
*/
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
gmx_simd4_norm2_d(gmx_simd4_double_t ax, gmx_simd4_double_t ay, gmx_simd4_double_t az)
{
gmx_simd4_double_t ret;