-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-/* Subtracts four 3-float vectors per lane (x1..z4, 12 floats) from the     */
-/* 12-float arrays at ptrA..ptrH: an 8-lane x 12-component transpose of the */
-/* SoA inputs, then sub-and-store back to each pointer's AoS memory.        */
-/* NOTE(review): temporaries are reused across phases; the exact order of  */
-/* unpack/shuffle/unpack128 steps below is load-bearing — do not reorder.  */
-#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
-\
- /* Load the first 8 floats of each of the 8 destination arrays */ \
- _tA = _mm256_loadu_ps(ptrA); \
- _tB = _mm256_loadu_ps(ptrB); \
- _tC = _mm256_loadu_ps(ptrC); \
- _tD = _mm256_loadu_ps(ptrD); \
- _tE = _mm256_loadu_ps(ptrE); \
- _tF = _mm256_loadu_ps(ptrF); \
- _tG = _mm256_loadu_ps(ptrG); \
- _tH = _mm256_loadu_ps(ptrH); \
- /* Transpose phase 1: interleave adjacent component pairs (x1..y3) */ \
- _t1 = _mm256_unpacklo_ps(_x1, _y1); \
- _t2 = _mm256_unpackhi_ps(_x1, _y1); \
- _t3 = _mm256_unpacklo_ps(_z1, _x2); \
- _t4 = _mm256_unpackhi_ps(_z1, _x2); \
- _t5 = _mm256_unpacklo_ps(_y2, _z2); \
- _t6 = _mm256_unpackhi_ps(_y2, _z2); \
- _t7 = _mm256_unpacklo_ps(_x3, _y3); \
- _t8 = _mm256_unpackhi_ps(_x3, _y3); \
- /* Transpose phase 2: gather 4-float groups per lane within 128-bit halves */ \
- _t9 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t10 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t11 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t12 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t1 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t2 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t3 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t4 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
- /* Transpose phase 3: combine 128-bit halves into one 8-float row per pointer */ \
- _t5 = gmx_mm256_unpack128lo_ps(_t9, _t1); \
- _t6 = gmx_mm256_unpack128hi_ps(_t9, _t1); \
- _t7 = gmx_mm256_unpack128lo_ps(_t10, _t2); \
- _t8 = gmx_mm256_unpack128hi_ps(_t10, _t2); \
- _t1 = gmx_mm256_unpack128lo_ps(_t11, _t3); \
- _t2 = gmx_mm256_unpack128hi_ps(_t11, _t3); \
- _t9 = gmx_mm256_unpack128lo_ps(_t12, _t4); \
- _t10 = gmx_mm256_unpack128hi_ps(_t12, _t4); \
- /* Subtract the transposed rows and write back floats 0-7 of each array */ \
- _tA = _mm256_sub_ps(_tA, _t5); \
- _tB = _mm256_sub_ps(_tB, _t7); \
- _tC = _mm256_sub_ps(_tC, _t1); \
- _tD = _mm256_sub_ps(_tD, _t9); \
- _tE = _mm256_sub_ps(_tE, _t6); \
- _tF = _mm256_sub_ps(_tF, _t8); \
- _tG = _mm256_sub_ps(_tG, _t2); \
- _tH = _mm256_sub_ps(_tH, _t10); \
- _mm256_storeu_ps(ptrA, _tA); \
- _mm256_storeu_ps(ptrB, _tB); \
- _mm256_storeu_ps(ptrC, _tC); \
- _mm256_storeu_ps(ptrD, _tD); \
- _mm256_storeu_ps(ptrE, _tE); \
- _mm256_storeu_ps(ptrF, _tF); \
- _mm256_storeu_ps(ptrG, _tG); \
- _mm256_storeu_ps(ptrH, _tH); \
- /* Load the remaining floats 8-11; pack two pointers' tails per ymm */ \
- /* (low 128 bits: ptrA-D, high 128 bits: ptrE-H) */ \
- _tI = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8)); \
- _tJ = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8)); \
- _tK = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8)); \
- _tL = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8)); \
- /* Transpose the last 4 components (z3, x4, y4, z4) the same way */ \
- _t1 = _mm256_unpacklo_ps(_z3, _x4); \
- _t2 = _mm256_unpackhi_ps(_z3, _x4); \
- _t3 = _mm256_unpacklo_ps(_y4, _z4); \
- _t4 = _mm256_unpackhi_ps(_y4, _z4); \
- _t5 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t6 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t7 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t8 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
- _tI = _mm256_sub_ps(_tI, _t5); \
- _tJ = _mm256_sub_ps(_tJ, _t6); \
- _tK = _mm256_sub_ps(_tK, _t7); \
- _tL = _mm256_sub_ps(_tL, _t8); \
- /* Store updated tails: low half to ptrA-D, extracted high half to ptrE-H */ \
- _mm_storeu_ps(ptrA+8, _mm256_castps256_ps128(_tI)); \
- _mm_storeu_ps(ptrB+8, _mm256_castps256_ps128(_tJ)); \
- _mm_storeu_ps(ptrC+8, _mm256_castps256_ps128(_tK)); \
- _mm_storeu_ps(ptrD+8, _mm256_castps256_ps128(_tL)); \
- _mm_storeu_ps(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
- _mm_storeu_ps(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
- _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
- _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void