/*
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef _kernelutil_x86_avx_128_fma_double_h_
#define _kernelutil_x86_avx_128_fma_double_h_
-#include "gromacs/simd/general_x86_avx_128_fma.h"
+#include <math.h>
+#include <immintrin.h>
+#ifdef _MSC_VER
+# include <intrin.h>
+#else
+# include <x86intrin.h>
+#endif
+
+#include "config.h"
+
+#define gmx_mm_castsi128_pd _mm_castsi128_pd
+#define gmx_mm_extract_epi32 _mm_extract_epi32
+
+#define GMX_MM_TRANSPOSE2_PD(row0, row1) { \
+ __m128d __gmx_t1 = row0; \
+ row0 = _mm_unpacklo_pd(row0, row1); \
+ row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
+}
-static int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
/* Load a double value from 1-2 places, merge into xmm register */
-static __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB)
{
return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}
-static __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
return _mm_load_sd(ptrA);
}
/* Store the low lane of xmm1 to *ptrA and the high lane to *ptrB.
 * NOTE(review): the body is truncated in this diff view -- t2 is defined
 * in the elided lines (presumably t2 = _mm_unpackhi_pd(xmm1, xmm1));
 * confirm against the full file. */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB,
__m128d xmm1)
_mm_store_sd(ptrB, t2);
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
_mm_store_sd(ptrA, xmm1);
/* Similar to store, but increments value in memory */
/* Read-modify-write: add the low lane of xmm1 into *ptrA and the high
 * lane into *ptrB.
 * NOTE(review): the loads/adds are elided in this diff view; t1 is
 * defined in the missing lines -- verify against the full file. */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB, __m128d xmm1)
{
_mm_store_sd(ptrB, t1);
}
/* Read-modify-write: add the low lane of xmm1 into the double at *ptrA.
 * NOTE(review): body truncated in this diff view after the tmp
 * declaration -- verify against the full file. */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
__m128d tmp;
/* Load interleaved c6/c12 parameter pairs from p1 and p2 and transpose
 * them into separate *c6 and *c12 vectors (one lane per pointer).
 * NOTE(review): the loads of t1/t2 and the *c6 store are elided in this
 * diff view -- verify against the full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
const double * gmx_restrict p2,
__m128d * gmx_restrict c6,
*c12 = _mm_unpackhi_pd(t1, t2);
}
/* Load a single c6/c12 pair from p1 into *c6 and *c12.
 * NOTE(review): body elided in this diff view -- verify against the
 * full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict c6,
__m128d * gmx_restrict c12)
}
/* Load one xyz coordinate, add the shift vector xyz_shift, and
 * broadcast each resulting component into *x1/*y1/*z1.
 * NOTE(review): body and the y1/z1 output parameters are elided in this
 * diff view -- verify against the full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1,
}
/* Load three xyz coordinates, add the shift vector, and broadcast the
 * nine shifted components into the output vectors.
 * NOTE(review): body and the atom-2/3 output parameters are elided in
 * this diff view -- verify against the full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
}
/* Load four xyz coordinates, add the shift vector, and broadcast the
 * twelve shifted components into the output vectors.
 * NOTE(review): body and the atom-2/3/4 output parameters are elided in
 * this diff view -- verify against the full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
/* Load one rvec (x,y,z doubles) from p1 into the low lanes of *x/*y/*z.
 * NOTE(review): the *x and *y loads (presumably p1 and p1+1) are elided
 * in this diff view -- verify against the full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
*z = _mm_load_sd(p1+2);
}
/* Load three consecutive rvecs (nine doubles) from p1 into low lanes of
 * the nine output vectors; visible tail loads p1+8 into *z3.
 * NOTE(review): most of the body and the atom-3 parameters are elided in
 * this diff view -- verify against the full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
*z3 = _mm_load_sd(p1+8);
}
/* Load four consecutive rvecs (twelve doubles) from p1 into low lanes
 * of the twelve output vectors.
 * NOTE(review): body and the atom-3/4 parameters are elided in this
 * diff view -- verify against the full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
/* Load one rvec from each of ptrA/ptrB and transpose so each output
 * vector holds {A-component, B-component}; visible tail merges t3/t4
 * into *z1.
 * NOTE(review): the loads and the x1/y1 swizzles are elided in this
 * diff view -- verify against the full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
*z1 = _mm_unpacklo_pd(t3, t4);
}
/* Load three rvecs from each of ptrA/ptrB and transpose into nine
 * two-lane output vectors ({A,B} per component).
 * NOTE(review): body and the atom-3 parameters are elided in this diff
 * view -- verify against the full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
/* Load four rvecs from each of ptrA/ptrB and transpose into twelve
 * two-lane output vectors ({A,B} per component).
 * NOTE(review): body and the atom-3/4 parameters are elided in this
 * diff view -- verify against the full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
/* Routines to decrement rvec in memory, typically use for j particle force updates */

/* Subtract the low lanes of x1/y1/z1 from the rvec stored at ptrA.
 * NOTE(review): body elided in this diff view -- verify against the
 * full file. */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1)
{
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _t1 = _mm_sub_pd(_t1, _x1); \
- _t2 = _mm_sub_pd(_t2, _z1); \
- _t3 = _mm_sub_pd(_t3, _y2); \
- _t4 = _mm_sub_pd(_t4, _x3); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+6, t4);
_mm_store_sd(ptrA+8, t5);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _z3 = _mm_unpacklo_pd(_z3, _x4); \
- _y4 = _mm_unpacklo_pd(_y4, _z4); \
- _mm_storeu_pd(ptrA, _mm_sub_pd( _t1, _x1 )); \
- _mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2, _z1 )); \
- _mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3, _y2 )); \
- _mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4, _x3 )); \
- _mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5, _z3 )); \
- _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
- }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5, z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
-#endif
/* Subtract one rvec per pointer: lane 0 of x1/y1/z1 decrements the
 * rvec at ptrA, lane 1 decrements the rvec at ptrB.
 * NOTE(review): body elided in this diff view -- verify against the
 * full file. */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1)
{
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrB); \
- _t7 = _mm_loadu_pd(ptrB+2); \
- _t8 = _mm_loadu_pd(ptrB+4); \
- _t9 = _mm_loadu_pd(ptrB+6); \
- _t10 = _mm_load_sd(ptrB+8); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpackhi_pd(_z3, _z3); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _t6 = _mm_sub_pd(_t6, _tB); \
- _t7 = _mm_sub_pd(_t7, _tD); \
- _t8 = _mm_sub_pd(_t8, _tF); \
- _t9 = _mm_sub_pd(_t9, _tH); \
- _t10 = _mm_sub_sd(_t10, _tI); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrB, _t6); \
- _mm_storeu_pd(ptrB+2, _t7); \
- _mm_storeu_pd(ptrB+4, _t8); \
- _mm_storeu_pd(ptrB+6, _t9); \
- _mm_store_sd(ptrB+8, _t10); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+6, t9);
_mm_store_sd(ptrB+8, t10);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _t7 = _mm_loadu_pd(ptrB); \
- _t8 = _mm_loadu_pd(ptrB+2); \
- _t9 = _mm_loadu_pd(ptrB+4); \
- _t10 = _mm_loadu_pd(ptrB+6); \
- _t11 = _mm_loadu_pd(ptrB+8); \
- _t12 = _mm_loadu_pd(ptrB+10); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpacklo_pd(_z3, _x4); \
- _tJ = _mm_unpackhi_pd(_z3, _x4); \
- _tK = _mm_unpacklo_pd(_y4, _z4); \
- _tL = _mm_unpackhi_pd(_y4, _z4); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_pd(_t5, _tI); \
- _t6 = _mm_sub_pd(_t6, _tK); \
- _t7 = _mm_sub_pd(_t7, _tB); \
- _t8 = _mm_sub_pd(_t8, _tD); \
- _t9 = _mm_sub_pd(_t9, _tF); \
- _t10 = _mm_sub_pd(_t10, _tH); \
- _t11 = _mm_sub_pd(_t11, _tJ); \
- _t12 = _mm_sub_pd(_t12, _tL); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_storeu_pd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrA+10, _t6); \
- _mm_storeu_pd(ptrB, _t7); \
- _mm_storeu_pd(ptrB+2, _t8); \
- _mm_storeu_pd(ptrB+4, _t9); \
- _mm_storeu_pd(ptrB+6, _t10); \
- _mm_storeu_pd(ptrB+8, _t11); \
- _mm_storeu_pd(ptrB+10, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+8, t11);
_mm_storeu_pd(ptrB+10, t12);
}
-#endif
/* Horizontally reduce the per-lane forces on one i-atom and accumulate
 * them into fptr (force array) and fshiftptr (shift-force array);
 * visible tail adds the z component into fshiftptr+2.
 * NOTE(review): the hadd reduction and the fptr updates are elided in
 * this diff view -- verify against the full file. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
double * gmx_restrict fptr,
double * gmx_restrict fshiftptr)
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- fix1 = _mm_hadd_pd(fix1, fiy1); \
- fiz1 = _mm_hadd_pd(fiz1, fix2); \
- fiy2 = _mm_hadd_pd(fiy2, fiz2); \
- fix3 = _mm_hadd_pd(fix3, fiy3); \
- fiz3 = _mm_hadd_pd(fiz3, fiz3); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
- fix1 = _mm_add_pd(fix1, fix3); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- fiz1 = _mm_add_sd(fiz1, _t2); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- fix1 = _mm_hadd_pd(fix1, fiy1); \
- fiz1 = _mm_hadd_pd(fiz1, fix2); \
- fiy2 = _mm_hadd_pd(fiy2, fiz2); \
- fix3 = _mm_hadd_pd(fix3, fiy3); \
- fiz3 = _mm_hadd_pd(fiz3, fix4); \
- fiy4 = _mm_hadd_pd(fiy4, fiz4); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); \
- _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
- fix3 = _mm_add_pd(fix3, _t2); \
- fix1 = _mm_add_pd(fix1, fix3); \
- fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
- fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
pot1 = _mm_hadd_pd(pot1, pot1);
_mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
__m128d pot2, double * gmx_restrict ptrB)
{