/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_256_single_h_
#define _kernelutil_x86_avx_256_single_h_

#include "gromacs/simd/general_x86_avx_256.h"
/* Transpose the lower/upper halves of four 256-bit registers separately */
#define GMX_MM256_HALFTRANSPOSE4_PS(ymm0, ymm1, ymm2, ymm3) {                \
        __m256 __tmp0, __tmp1, __tmp2, __tmp3;                               \
                                                                             \
        __tmp0 = _mm256_unpacklo_ps((ymm0), (ymm1));                         \
        __tmp1 = _mm256_unpacklo_ps((ymm2), (ymm3));                         \
        __tmp2 = _mm256_unpackhi_ps((ymm0), (ymm1));                         \
        __tmp3 = _mm256_unpackhi_ps((ymm2), (ymm3));                         \
        ymm0   = _mm256_shuffle_ps(__tmp0, __tmp1, _MM_SHUFFLE(1, 0, 1, 0)); \
        ymm1   = _mm256_shuffle_ps(__tmp0, __tmp1, _MM_SHUFFLE(3, 2, 3, 2)); \
        ymm2   = _mm256_shuffle_ps(__tmp2, __tmp3, _MM_SHUFFLE(1, 0, 1, 0)); \
        ymm3   = _mm256_shuffle_ps(__tmp2, __tmp3, _MM_SHUFFLE(3, 2, 3, 2)); \
}
static gmx_inline __m256
gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz)
{
    return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy) ), _mm256_mul_ps(dz, dz) );
}
/* Normal sum of four ymm registers */
#define gmx_mm256_sum4_ps(t0, t1, t2, t3) _mm256_add_ps(_mm256_add_ps(t0, t1), _mm256_add_ps(t2, t3))
static gmx_inline int
gmx_mm256_any_lt(__m256 a, __m256 b)
{
    return _mm256_movemask_ps(_mm256_cmp_ps(a, b, _CMP_LT_OQ));
}
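/* Usage sketch (illustrative only; dx, dy, dz and rcut2 are hypothetical
 * variables, not part of this header). The two helpers above are typically
 * combined for a cutoff check over eight pairs at once:
 *
 *     __m256 rsq = gmx_mm256_calc_rsq_ps(dx, dy, dz);
 *     if (gmx_mm256_any_lt(rsq, rcut2))
 *     {
 *         // at least one of the eight distances is inside the cutoff
 *     }
 */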
static gmx_inline __m256
gmx_mm256_load_4real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                const float * gmx_restrict ptrC, const float * gmx_restrict ptrD)
{
    __m128 t1, t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA), _mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB), _mm_load_ss(ptrD));
    return _mm256_castps128_ps256(_mm_unpacklo_ps(t1, t2));
}
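/* Note that _mm256_castps128_ps256 leaves the upper 128 bits undefined, so
 * the result of the 4-real load above is only valid in its lower half. */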
static gmx_inline __m256
gmx_mm256_load_8real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                const float * gmx_restrict ptrG, const float * gmx_restrict ptrH)
{
    __m256 t1, t2;

    t1 = gmx_mm256_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
    t2 = gmx_mm256_load_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH);

    /* Selector 0x20 concatenates the lower 128-bit lanes of t1 and t2 */
    return _mm256_permute2f128_ps(t1, t2, 0x20);
}
static gmx_inline void
gmx_mm256_store_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                 float * gmx_restrict ptrC, float * gmx_restrict ptrD, __m256 xmm1)
{
    __m256 t2, t3, t4;

    t2 = _mm256_permute_ps(xmm1, _MM_SHUFFLE(1, 1, 1, 1));
    t3 = _mm256_permute_ps(xmm1, _MM_SHUFFLE(2, 2, 2, 2));
    t4 = _mm256_permute_ps(xmm1, _MM_SHUFFLE(3, 3, 3, 3));
    _mm_store_ss(ptrA, _mm256_castps256_ps128(xmm1));
    _mm_store_ss(ptrB, _mm256_castps256_ps128(t2));
    _mm_store_ss(ptrC, _mm256_castps256_ps128(t3));
    _mm_store_ss(ptrD, _mm256_castps256_ps128(t4));
}
static gmx_inline void
gmx_mm256_store_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                 float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                 float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                 float * gmx_restrict ptrG, float * gmx_restrict ptrH, __m256 xmm1)
{
    __m256 t1;

    /* Selector 0x11 replicates the upper 128-bit lane of xmm1 into both lanes */
    t1 = _mm256_permute2f128_ps(xmm1, xmm1, 0x11);

    gmx_mm256_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, xmm1);
    gmx_mm256_store_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH, t1);
}
static gmx_inline void
gmx_mm256_increment_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                     float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                     __m256 xmm1)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm256_castps256_ps128(xmm1);
    t2 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    t3 = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    t4 = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));

    t1 = _mm_add_ss(t1, _mm_load_ss(ptrA));
    t2 = _mm_add_ss(t2, _mm_load_ss(ptrB));
    t3 = _mm_add_ss(t3, _mm_load_ss(ptrC));
    t4 = _mm_add_ss(t4, _mm_load_ss(ptrD));

    _mm_store_ss(ptrA, t1);
    _mm_store_ss(ptrB, t2);
    _mm_store_ss(ptrC, t3);
    _mm_store_ss(ptrD, t4);
}
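/* The increment routines perform a scalar read-modify-write scatter-add:
 * element i of the (lower lane of the) input register is added to the float
 * at the i-th pointer. This is typically how per-particle partial results
 * are accumulated back into memory after a SIMD inner loop. */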
static gmx_inline void
gmx_mm256_increment_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                     float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                     float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                     float * gmx_restrict ptrG, float * gmx_restrict ptrH,
                                     __m256 xmm1)
{
    __m256 t1;

    t1 = _mm256_permute2f128_ps(xmm1, xmm1, 0x11);

    gmx_mm256_increment_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, xmm1);
    gmx_mm256_increment_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH, t1);
}
static gmx_inline void
gmx_mm256_load_4pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
                                const float * gmx_restrict p3, const float * gmx_restrict p4,
                                __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p1); /* - - c12a c6a */
    t2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p2); /* - - c12b c6b */
    t3 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p3); /* - - c12c c6c */
    t4 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p4); /* - - c12d c6d */

    t1 = _mm_unpacklo_ps(t1, t2); /* c12b c12a c6b c6a */
    t3 = _mm_unpacklo_ps(t3, t4); /* c12d c12c c6d c6c */

    *c6  = _mm256_castps128_ps256(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)));
    *c12 = _mm256_castps128_ps256(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)));
}
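/* Each pair pointer above addresses two consecutive floats {c6, c12} - the
 * Lennard-Jones dispersion and repulsion parameters in GROMACS - and the
 * swizzle separates them into one register of c6 values and one of c12
 * values. As with the 4-real load, only the lower 128 bits are defined. */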
static gmx_inline void
gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
                                const float * gmx_restrict p3, const float * gmx_restrict p4,
                                const float * gmx_restrict p5, const float * gmx_restrict p6,
                                const float * gmx_restrict p7, const float * gmx_restrict p8,
                                __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
{
    __m256 c6l, c6h, c12l, c12h;

    gmx_mm256_load_4pair_swizzle_ps(p1, p2, p3, p4, &c6l, &c12l);
    gmx_mm256_load_4pair_swizzle_ps(p5, p6, p7, p8, &c6h, &c12h);

    *c6  = _mm256_permute2f128_ps(c6l, c6h, 0x20);
    *c12 = _mm256_permute2f128_ps(c12l, c12h, 0x20);
}
static gmx_inline void
gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                            const float * gmx_restrict xyz,
                                            __m256 * gmx_restrict x1,
                                            __m256 * gmx_restrict y1,
                                            __m256 * gmx_restrict z1)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    t2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz);
    t3 = _mm_load_ss(xyz_shift+2);
    t4 = _mm_load_ss(xyz+2);
    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ss(t3, t4);

    t2 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    t1 = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    t3 = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));

    *x1 = gmx_mm256_set_m128(t1, t1);
    *y1 = gmx_mm256_set_m128(t2, t2);
    *z1 = gmx_mm256_set_m128(t3, t3);
}
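/* The load_shift_and_Nrvec_broadcast routines all follow the same pattern:
 * add the periodic shift vector to the i-particle coordinates, then
 * broadcast each resulting component into every element of a ymm register,
 * so that it can interact with a full SIMD width of j-particles. */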
static gmx_inline void
gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                            const float * gmx_restrict xyz,
                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9;

    tA = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_load_ss(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ss(t3, t6);

    t9 = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
    t8 = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    t7 = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    t6 = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    t5 = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    t4 = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    t3 = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    t2 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    t1 = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));

    *x1 = gmx_mm256_set_m128(t1, t1);
    *y1 = gmx_mm256_set_m128(t2, t2);
    *z1 = gmx_mm256_set_m128(t3, t3);
    *x2 = gmx_mm256_set_m128(t4, t4);
    *y2 = gmx_mm256_set_m128(t5, t5);
    *z2 = gmx_mm256_set_m128(t6, t6);
    *x3 = gmx_mm256_set_m128(t7, t7);
    *y3 = gmx_mm256_set_m128(t8, t8);
    *z3 = gmx_mm256_set_m128(t9, t9);
}
static gmx_inline void
gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                            const float * gmx_restrict xyz,
                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
                                            __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;

    tA = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_loadu_ps(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ps(t3, t6);

    t12 = _mm_permute_ps(t3, _MM_SHUFFLE(3, 3, 3, 3));
    t11 = _mm_permute_ps(t3, _MM_SHUFFLE(2, 2, 2, 2));
    t10 = _mm_permute_ps(t3, _MM_SHUFFLE(1, 1, 1, 1));
    t9  = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
    t8  = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    t7  = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    t6  = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    t5  = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    t4  = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    t3  = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    t2  = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    t1  = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));

    *x1 = gmx_mm256_set_m128(t1, t1);
    *y1 = gmx_mm256_set_m128(t2, t2);
    *z1 = gmx_mm256_set_m128(t3, t3);
    *x2 = gmx_mm256_set_m128(t4, t4);
    *y2 = gmx_mm256_set_m128(t5, t5);
    *z2 = gmx_mm256_set_m128(t6, t6);
    *x3 = gmx_mm256_set_m128(t7, t7);
    *y3 = gmx_mm256_set_m128(t8, t8);
    *z3 = gmx_mm256_set_m128(t9, t9);
    *x4 = gmx_mm256_set_m128(t10, t10);
    *y4 = gmx_mm256_set_m128(t11, t11);
    *z4 = gmx_mm256_set_m128(t12, t12);
}
static gmx_inline void
gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
{
    __m128  t1, t2, t3, t4;
    __m128i mask = _mm_set_epi32(0, -1, -1, -1);

    t1 = gmx_mm_maskload_ps(ptrA, mask);
    t2 = gmx_mm_maskload_ps(ptrB, mask);
    t3 = gmx_mm_maskload_ps(ptrC, mask);
    t4 = gmx_mm_maskload_ps(ptrD, mask);

    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);

    *x1 = _mm256_castps128_ps256(t1);
    *y1 = _mm256_castps128_ps256(t2);
    *z1 = _mm256_castps128_ps256(t3);
}
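/* The (0, -1, -1, -1) mask makes the masked load touch exactly three floats
 * per pointer, so loading an rvec at the very end of an allocation cannot
 * read past the buffer the way a full 128-bit load could. */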
static gmx_inline void
gmx_mm256_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrB);
    t3 = _mm_loadu_ps(ptrC);
    t4 = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = _mm256_castps128_ps256(t1);
    *y1 = _mm256_castps128_ps256(t2);
    *z1 = _mm256_castps128_ps256(t3);
    *x2 = _mm256_castps128_ps256(t4);

    t1 = _mm_loadu_ps(ptrA+4);
    t2 = _mm_loadu_ps(ptrB+4);
    t3 = _mm_loadu_ps(ptrC+4);
    t4 = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = _mm256_castps128_ps256(t1);
    *z2 = _mm256_castps128_ps256(t2);
    *x3 = _mm256_castps128_ps256(t3);
    *y3 = _mm256_castps128_ps256(t4);

    t1 = _mm_load_ss(ptrA+8);
    t2 = _mm_load_ss(ptrB+8);
    t3 = _mm_load_ss(ptrC+8);
    t4 = _mm_load_ss(ptrD+8);
    t1 = _mm_unpacklo_ps(t1, t3);
    t3 = _mm_unpacklo_ps(t2, t4);

    *z3 = _mm256_castps128_ps256(_mm_unpacklo_ps(t1, t3));
}
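/* The 3rvec routines move three rvecs (nine floats) per particle group; in
 * the GROMACS nonbonded kernels this layout typically corresponds to
 * three-site waters such as SPC or TIP3P. */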
static gmx_inline void
gmx_mm256_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
                                     __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrB);
    t3 = _mm_loadu_ps(ptrC);
    t4 = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = _mm256_castps128_ps256(t1);
    *y1 = _mm256_castps128_ps256(t2);
    *z1 = _mm256_castps128_ps256(t3);
    *x2 = _mm256_castps128_ps256(t4);

    t1 = _mm_loadu_ps(ptrA+4);
    t2 = _mm_loadu_ps(ptrB+4);
    t3 = _mm_loadu_ps(ptrC+4);
    t4 = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = _mm256_castps128_ps256(t1);
    *z2 = _mm256_castps128_ps256(t2);
    *x3 = _mm256_castps128_ps256(t3);
    *y3 = _mm256_castps128_ps256(t4);

    t1 = _mm_loadu_ps(ptrA+8);
    t2 = _mm_loadu_ps(ptrB+8);
    t3 = _mm_loadu_ps(ptrC+8);
    t4 = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *z3 = _mm256_castps128_ps256(t1);
    *x4 = _mm256_castps128_ps256(t2);
    *y4 = _mm256_castps128_ps256(t3);
    *z4 = _mm256_castps128_ps256(t4);
}
static gmx_inline void
gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                     const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
{
    __m256  t1, t2, t3, t4, t5, t6, t7, t8;
    __m128i mask = _mm_set_epi32(0, -1, -1, -1);

    t1 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE, mask), gmx_mm_maskload_ps(ptrA, mask)); /* - zE yE xE | - zA yA xA */
    t2 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF, mask), gmx_mm_maskload_ps(ptrB, mask)); /* - zF yF xF | - zB yB xB */
    t3 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG, mask), gmx_mm_maskload_ps(ptrC, mask)); /* - zG yG xG | - zC yC xC */
    t4 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH, mask), gmx_mm_maskload_ps(ptrD, mask)); /* - zH yH xH | - zD yD xD */

    t5 = _mm256_unpacklo_ps(t1, t2); /* yF yE xF xE | yB yA xB xA */
    t6 = _mm256_unpacklo_ps(t3, t4); /* yH yG xH xG | yD yC xD xC */
    t7 = _mm256_unpackhi_ps(t1, t2); /* -  -  zF zE | -  -  zB zA */
    t8 = _mm256_unpackhi_ps(t3, t4); /* -  -  zH zG | -  -  zD zC */

    *x1 = _mm256_shuffle_ps(t5, t6, _MM_SHUFFLE(1, 0, 1, 0));
    *y1 = _mm256_shuffle_ps(t5, t6, _MM_SHUFFLE(3, 2, 3, 2));
    *z1 = _mm256_shuffle_ps(t7, t8, _MM_SHUFFLE(1, 0, 1, 0));
}
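/* Eight-pointer loads use a lane-parallel transpose: coordinates from
 * ptrA-ptrD end up in the lower 128-bit lanes and those from ptrE-ptrH in
 * the upper lanes, so a single unpack/shuffle sequence transposes both
 * lanes at once and each output holds one component in A..H order. */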
static gmx_inline void
gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                     const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
{
    __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;

    t1 = _mm256_loadu_ps(ptrA); /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t2 = _mm256_loadu_ps(ptrB); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t3 = _mm256_loadu_ps(ptrC); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t4 = _mm256_loadu_ps(ptrD); /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t5 = _mm256_loadu_ps(ptrE); /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t6 = _mm256_loadu_ps(ptrF); /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t7 = _mm256_loadu_ps(ptrG); /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t8 = _mm256_loadu_ps(ptrH); /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    t9  = _mm256_unpacklo_ps(t1, t2); /* z2b z2a y2b y2a | y1b y1a x1b x1a */
    t10 = _mm256_unpackhi_ps(t1, t2); /* y3b y3a x3b x3a | x2b x2a z1b z1a */
    t11 = _mm256_unpacklo_ps(t3, t4); /* z2d z2c y2d y2c | y1d y1c x1d x1c */
    t12 = _mm256_unpackhi_ps(t3, t4); /* y3d y3c x3d x3c | x2d x2c z1d z1c */
    t1  = _mm256_unpacklo_ps(t5, t6); /* z2f z2e y2f y2e | y1f y1e x1f x1e */
    t2  = _mm256_unpackhi_ps(t5, t6); /* y3f y3e x3f x3e | x2f x2e z1f z1e */
    t3  = _mm256_unpacklo_ps(t7, t8); /* z2h z2g y2h y2g | y1h y1g x1h x1g */
    t4  = _mm256_unpackhi_ps(t7, t8); /* y3h y3g x3h x3g | x2h x2g z1h z1g */

    t5  = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(1, 0, 1, 0));  /* y2d y2c y2b y2a | x1d x1c x1b x1a */
    t6  = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(3, 2, 3, 2));  /* z2d z2c z2b z2a | y1d y1c y1b y1a */
    t7  = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(1, 0, 1, 0)); /* x3d x3c x3b x3a | z1d z1c z1b z1a */
    t8  = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d y3c y3b y3a | x2d x2c x2b x2a */

    t9  = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));   /* y2h y2g y2f y2e | x1h x1g x1f x1e */
    t10 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));   /* z2h z2g z2f z2e | y1h y1g y1f y1e */
    t11 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0));   /* x3h x3g x3f x3e | z1h z1g z1f z1e */
    t12 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2));   /* y3h y3g y3f y3e | x2h x2g x2f x2e */

    *x1 = _mm256_permute2f128_ps(t5, t9, 0x20);
    *y1 = _mm256_permute2f128_ps(t6, t10, 0x20);
    *z1 = _mm256_permute2f128_ps(t7, t11, 0x20);
    *x2 = _mm256_permute2f128_ps(t8, t12, 0x20);

    *y2 = _mm256_permute2f128_ps(t5, t9, 0x31);
    *z2 = _mm256_permute2f128_ps(t6, t10, 0x31);
    *x3 = _mm256_permute2f128_ps(t7, t11, 0x31);
    *y3 = _mm256_permute2f128_ps(t8, t12, 0x31);

    t1 = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8));
    t2 = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8));
    t3 = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8));
    t4 = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8));

    t1 = _mm256_unpacklo_ps(t1, t3); /* - - z3g z3e | - - z3c z3a */
    t2 = _mm256_unpacklo_ps(t2, t4); /* - - z3h z3f | - - z3d z3b */

    *z3 = _mm256_unpacklo_ps(t1, t2);
}
static gmx_inline void
gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                     const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
                                     __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
{
    __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;

    t1 = _mm256_loadu_ps(ptrA); /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t2 = _mm256_loadu_ps(ptrB); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t3 = _mm256_loadu_ps(ptrC); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t4 = _mm256_loadu_ps(ptrD); /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t5 = _mm256_loadu_ps(ptrE); /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t6 = _mm256_loadu_ps(ptrF); /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t7 = _mm256_loadu_ps(ptrG); /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t8 = _mm256_loadu_ps(ptrH); /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    t9  = _mm256_unpacklo_ps(t1, t2); /* z2b z2a y2b y2a | y1b y1a x1b x1a */
    t10 = _mm256_unpackhi_ps(t1, t2); /* y3b y3a x3b x3a | x2b x2a z1b z1a */
    t11 = _mm256_unpacklo_ps(t3, t4); /* z2d z2c y2d y2c | y1d y1c x1d x1c */
    t12 = _mm256_unpackhi_ps(t3, t4); /* y3d y3c x3d x3c | x2d x2c z1d z1c */
    t1  = _mm256_unpacklo_ps(t5, t6); /* z2f z2e y2f y2e | y1f y1e x1f x1e */
    t2  = _mm256_unpackhi_ps(t5, t6); /* y3f y3e x3f x3e | x2f x2e z1f z1e */
    t3  = _mm256_unpacklo_ps(t7, t8); /* z2h z2g y2h y2g | y1h y1g x1h x1g */
    t4  = _mm256_unpackhi_ps(t7, t8); /* y3h y3g x3h x3g | x2h x2g z1h z1g */

    t5  = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(1, 0, 1, 0));  /* y2d y2c y2b y2a | x1d x1c x1b x1a */
    t6  = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(3, 2, 3, 2));  /* z2d z2c z2b z2a | y1d y1c y1b y1a */
    t7  = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(1, 0, 1, 0)); /* x3d x3c x3b x3a | z1d z1c z1b z1a */
    t8  = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d y3c y3b y3a | x2d x2c x2b x2a */
    t9  = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));   /* y2h y2g y2f y2e | x1h x1g x1f x1e */
    t10 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));   /* z2h z2g z2f z2e | y1h y1g y1f y1e */
    t11 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0));   /* x3h x3g x3f x3e | z1h z1g z1f z1e */
    t12 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2));   /* y3h y3g y3f y3e | x2h x2g x2f x2e */

    *x1 = _mm256_permute2f128_ps(t5, t9, 0x20);
    *y1 = _mm256_permute2f128_ps(t6, t10, 0x20);
    *z1 = _mm256_permute2f128_ps(t7, t11, 0x20);
    *x2 = _mm256_permute2f128_ps(t8, t12, 0x20);

    *y2 = _mm256_permute2f128_ps(t5, t9, 0x31);
    *z2 = _mm256_permute2f128_ps(t6, t10, 0x31);
    *x3 = _mm256_permute2f128_ps(t7, t11, 0x31);
    *y3 = _mm256_permute2f128_ps(t8, t12, 0x31);

    t1 = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8)); /* z4e y4e x4e z3e | z4a y4a x4a z3a */
    t2 = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8)); /* z4f y4f x4f z3f | z4b y4b x4b z3b */
    t3 = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8)); /* z4g y4g x4g z3g | z4c y4c x4c z3c */
    t4 = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8)); /* z4h y4h x4h z3h | z4d y4d x4d z3d */

    t5 = _mm256_unpacklo_ps(t1, t2); /* x4f x4e z3f z3e | x4b x4a z3b z3a */
    t6 = _mm256_unpackhi_ps(t1, t2); /* z4f z4e y4f y4e | z4b z4a y4b y4a */
    t7 = _mm256_unpacklo_ps(t3, t4); /* x4h x4g z3h z3g | x4d x4c z3d z3c */
    t8 = _mm256_unpackhi_ps(t3, t4); /* z4h z4g y4h y4g | z4d z4c y4d y4c */

    *z3 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); /* z3h z3g z3f z3e | z3d z3c z3b z3a */
    *x4 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); /* x4h x4g x4f x4e | x4d x4c x4b x4a */
    *y4 = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0)); /* y4h y4g y4f y4e | y4d y4c y4b y4a */
    *z4 = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2)); /* z4h z4g z4f z4e | z4d z4c z4b z4a */
}
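/* Analogously, the 4rvec routines move four rvecs (twelve floats) per
 * particle group, which in the nonbonded kernels typically corresponds to
 * four-site waters such as TIP4P. */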
static gmx_inline void
gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          __m256 x1, __m256 y1, __m256 z1)
{
    __m128  t1, t2, t3, t4, t5, t6, t7, t8;
    __m128i mask;

    /* Construct a mask without executing any data loads */
    mask = _mm_blend_epi16(_mm_setzero_si128(), _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()), 0x3F);

    t3 = _mm_unpacklo_ps(_mm256_castps256_ps128(x1), _mm256_castps256_ps128(y1)); /* y1b x1b y1a x1a */
    t4 = _mm_unpackhi_ps(_mm256_castps256_ps128(x1), _mm256_castps256_ps128(y1)); /* y1d x1d y1c x1c */

    t1 = _mm_shuffle_ps(t3, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 0, 1, 0)); /* - z1a y1a x1a */
    t2 = _mm_shuffle_ps(t3, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 1, 3, 2)); /* - z1b y1b x1b */
    t3 = _mm_shuffle_ps(t4, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 2, 1, 0)); /* - z1c y1c x1c */
    t4 = _mm_shuffle_ps(t4, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 3, 3, 2)); /* - z1d y1d x1d */

    t5 = gmx_mm_maskload_ps(ptrA, mask);
    t6 = gmx_mm_maskload_ps(ptrB, mask);
    t7 = gmx_mm_maskload_ps(ptrC, mask);
    t8 = gmx_mm_maskload_ps(ptrD, mask);

    t5 = _mm_sub_ps(t5, t1);
    t6 = _mm_sub_ps(t6, t2);
    t7 = _mm_sub_ps(t7, t3);
    t8 = _mm_sub_ps(t8, t4);

    gmx_mm_maskstore_ps(ptrA, mask, t5);
    gmx_mm_maskstore_ps(ptrB, mask, t6);
    gmx_mm_maskstore_ps(ptrC, mask, t7);
    gmx_mm_maskstore_ps(ptrD, mask, t8);
}
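/* Mask construction trick: _mm_cmpeq_epi16(0, 0) produces all-ones without
 * touching memory, and blending with 0x3F keeps the low six 16-bit words,
 * i.e. the low three 32-bit elements. This yields the same (0, -1, -1, -1)
 * mask used by the load routines, built without a memory constant. */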
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                                  x1, y1, z1, x2, y2, z2, x3, y3, z3) \
    { \
        __m256 _t1, _t2, _t3, _t4, _t5, _t6; \
        __m128 _tA, _tB, _tC, _tD; \
        \
        _t1 = _mm256_loadu_ps(ptrA); \
        _t2 = _mm256_loadu_ps(ptrB); \
        _t3 = _mm256_loadu_ps(ptrC); \
        _t4 = _mm256_loadu_ps(ptrD); \
        _tA = _mm_load_ss(ptrA+8); \
        _tB = _mm_load_ss(ptrB+8); \
        _tC = _mm_load_ss(ptrC+8); \
        _tD = _mm_load_ss(ptrD+8); \
        _t5 = _mm256_unpacklo_ps(x1, y1); \
        x1  = _mm256_unpackhi_ps(x1, y1); \
        y1  = _mm256_unpacklo_ps(z1, x2); \
        z1  = _mm256_unpackhi_ps(z1, x2); \
        x2  = _mm256_unpacklo_ps(y2, z2); \
        y2  = _mm256_unpackhi_ps(y2, z2); \
        _t6 = _mm256_unpacklo_ps(x3, y3); \
        x3  = _mm256_unpackhi_ps(x3, y3); \
        _t5 = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
        x1  = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
        y1  = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1); \
        z1  = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
        z2  = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t5 = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
        y1  = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
        x1  = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t1 = _mm256_sub_ps(_t1, z2); \
        _t2 = _mm256_sub_ps(_t2, _t5); \
        _t3 = _mm256_sub_ps(_t3, y1); \
        _t4 = _mm256_sub_ps(_t4, x1); \
        _tA = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3)); \
        _tB = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(1, 1, 1, 1))); \
        _tC = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(2, 2, 2, 2))); \
        _tD = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(3, 3, 3, 3))); \
        _mm256_storeu_ps(ptrA, _t1); \
        _mm256_storeu_ps(ptrB, _t2); \
        _mm256_storeu_ps(ptrC, _t3); \
        _mm256_storeu_ps(ptrD, _t4); \
        _mm_store_ss(ptrA+8, _tA); \
        _mm_store_ss(ptrB+8, _tB); \
        _mm_store_ss(ptrC+8, _tC); \
        _mm_store_ss(ptrD+8, _tD); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          __m256 x1, __m256 y1, __m256 z1,
                                          __m256 x2, __m256 y2, __m256 z2,
                                          __m256 x3, __m256 y3, __m256 z3)
{
    __m256 t1, t2, t3, t4, t5, t6;
    __m128 tA, tB, tC, tD;

    t1 = _mm256_loadu_ps(ptrA);
    t2 = _mm256_loadu_ps(ptrB);
    t3 = _mm256_loadu_ps(ptrC);
    t4 = _mm256_loadu_ps(ptrD);
    tA = _mm_load_ss(ptrA+8);
    tB = _mm_load_ss(ptrB+8);
    tC = _mm_load_ss(ptrC+8);
    tD = _mm_load_ss(ptrD+8);

    t5 = _mm256_unpacklo_ps(x1, y1); /* - - - - | y1b x1b y1a x1a */
    x1 = _mm256_unpackhi_ps(x1, y1); /* - - - - | y1d x1d y1c x1c */
    y1 = _mm256_unpacklo_ps(z1, x2); /* - - - - | x2b z1b x2a z1a */
    z1 = _mm256_unpackhi_ps(z1, x2); /* - - - - | x2d z1d x2c z1c */

    x2 = _mm256_unpacklo_ps(y2, z2); /* - - - - | z2b y2b z2a y2a */
    y2 = _mm256_unpackhi_ps(y2, z2); /* - - - - | z2d y2d z2c y2c */
    t6 = _mm256_unpacklo_ps(x3, y3); /* - - - - | y3b x3b y3a x3a */
    x3 = _mm256_unpackhi_ps(x3, y3); /* - - - - | y3d x3d y3c x3c */

    t5 = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1); /* z2b y2b z2a y2a | y1b x1b y1a x1a */
    x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); /* z2d y2d z2c y2c | y1d x1d y1c x1c */

    y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(t6), 0x1); /* y3b x3b y3a x3a | x2b z1b x2a z1a */
    z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); /* y3d x3d y3c x3c | x2d z1d x2c z1c */

    z2 = _mm256_shuffle_ps(t5, y1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t5 = _mm256_shuffle_ps(t5, y1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    y1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    x1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d x3d z2d y2d | x2d z1d y1d x1d */

    t1 = _mm256_sub_ps(t1, z2);
    t2 = _mm256_sub_ps(t2, t5);
    t3 = _mm256_sub_ps(t3, y1);
    t4 = _mm256_sub_ps(t4, x1);

    tA = _mm_sub_ss(tA, _mm256_castps256_ps128(z3));
    tB = _mm_sub_ss(tB, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(1, 1, 1, 1)));
    tC = _mm_sub_ss(tC, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(2, 2, 2, 2)));
    tD = _mm_sub_ss(tD, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(3, 3, 3, 3)));

    /* Here we store a full 256-bit value and a separate 32-bit one; no overlap can happen */
    _mm256_storeu_ps(ptrA, t1);
    _mm256_storeu_ps(ptrB, t2);
    _mm256_storeu_ps(ptrC, t3);
    _mm256_storeu_ps(ptrD, t4);
    _mm_store_ss(ptrA+8, tA);
    _mm_store_ss(ptrB+8, tB);
    _mm_store_ss(ptrC+8, tC);
    _mm_store_ss(ptrD+8, tD);
}
#endif
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                                  x1, y1, z1, x2, y2, z2, x3, y3, z3, x4, y4, z4) \
    { \
        __m256 _t1, _t2, _t3, _t4, _t5; \
        __m128 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH; \
        \
        _t1 = _mm256_loadu_ps(ptrA); \
        _t2 = _mm256_loadu_ps(ptrB); \
        _t3 = _mm256_loadu_ps(ptrC); \
        _t4 = _mm256_loadu_ps(ptrD); \
        _tA = _mm_loadu_ps(ptrA+8); \
        _tB = _mm_loadu_ps(ptrB+8); \
        _tC = _mm_loadu_ps(ptrC+8); \
        _tD = _mm_loadu_ps(ptrD+8); \
        _t5 = _mm256_unpacklo_ps(x1, y1); \
        x1  = _mm256_unpackhi_ps(x1, y1); \
        y1  = _mm256_unpacklo_ps(z1, x2); \
        z1  = _mm256_unpackhi_ps(z1, x2); \
        x2  = _mm256_unpacklo_ps(y2, z2); \
        y2  = _mm256_unpackhi_ps(y2, z2); \
        z2  = _mm256_unpacklo_ps(x3, y3); \
        x3  = _mm256_unpackhi_ps(x3, y3); \
        y3  = _mm256_unpacklo_ps(z3, x4); \
        z3  = _mm256_unpackhi_ps(z3, x4); \
        x4  = _mm256_unpacklo_ps(y4, z4); \
        y4  = _mm256_unpackhi_ps(y4, z4); \
        x2  = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
        x1  = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
        y1  = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1); \
        z1  = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
        z2  = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t5 = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
        y1  = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
        x1  = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
        _tE = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(1, 0, 1, 0)); \
        _tF = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(3, 2, 3, 2)); \
        _tG = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(1, 0, 1, 0)); \
        _tH = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(3, 2, 3, 2)); \
        _t1 = _mm256_sub_ps(_t1, z2); \
        _t2 = _mm256_sub_ps(_t2, _t5); \
        _t3 = _mm256_sub_ps(_t3, y1); \
        _t4 = _mm256_sub_ps(_t4, x1); \
        _tA = _mm_sub_ps(_tA, _tE); \
        _tB = _mm_sub_ps(_tB, _tF); \
        _tC = _mm_sub_ps(_tC, _tG); \
        _tD = _mm_sub_ps(_tD, _tH); \
        _mm256_storeu_ps(ptrA, _t1); \
        _mm256_storeu_ps(ptrB, _t2); \
        _mm256_storeu_ps(ptrC, _t3); \
        _mm256_storeu_ps(ptrD, _t4); \
        _mm_storeu_ps(ptrA+8, _tA); \
        _mm_storeu_ps(ptrB+8, _tB); \
        _mm_storeu_ps(ptrC+8, _tC); \
        _mm_storeu_ps(ptrD+8, _tD); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          __m256 x1, __m256 y1, __m256 z1,
                                          __m256 x2, __m256 y2, __m256 z2,
                                          __m256 x3, __m256 y3, __m256 z3,
                                          __m256 x4, __m256 y4, __m256 z4)
{
    __m256 t1, t2, t3, t4, t5;
    __m128 tA, tB, tC, tD, tE, tF, tG, tH;

    t1 = _mm256_loadu_ps(ptrA);
    t2 = _mm256_loadu_ps(ptrB);
    t3 = _mm256_loadu_ps(ptrC);
    t4 = _mm256_loadu_ps(ptrD);
    tA = _mm_loadu_ps(ptrA+8);
    tB = _mm_loadu_ps(ptrB+8);
    tC = _mm_loadu_ps(ptrC+8);
    tD = _mm_loadu_ps(ptrD+8);

    t5 = _mm256_unpacklo_ps(x1, y1); /* - - - - | y1b x1b y1a x1a */
    x1 = _mm256_unpackhi_ps(x1, y1); /* - - - - | y1d x1d y1c x1c */
    y1 = _mm256_unpacklo_ps(z1, x2); /* - - - - | x2b z1b x2a z1a */
    z1 = _mm256_unpackhi_ps(z1, x2); /* - - - - | x2d z1d x2c z1c */

    x2 = _mm256_unpacklo_ps(y2, z2); /* - - - - | z2b y2b z2a y2a */
    y2 = _mm256_unpackhi_ps(y2, z2); /* - - - - | z2d y2d z2c y2c */
    z2 = _mm256_unpacklo_ps(x3, y3); /* - - - - | y3b x3b y3a x3a */
    x3 = _mm256_unpackhi_ps(x3, y3); /* - - - - | y3d x3d y3c x3c */

    y3 = _mm256_unpacklo_ps(z3, x4); /* - - - - | x4b z3b x4a z3a */
    z3 = _mm256_unpackhi_ps(z3, x4); /* - - - - | x4d z3d x4c z3c */
    x4 = _mm256_unpacklo_ps(y4, z4); /* - - - - | z4b y4b z4a y4a */
    y4 = _mm256_unpackhi_ps(y4, z4); /* - - - - | z4d y4d z4c y4c */

    x2 = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1); /* z2b y2b z2a y2a | y1b x1b y1a x1a */
    x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); /* z2d y2d z2c y2c | y1d x1d y1c x1c */
    y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1); /* y3b x3b y3a x3a | x2b z1b x2a z1a */
    z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); /* y3d x3d y3c x3c | x2d z1d x2c z1c */

    z2 = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t5 = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    y1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    x1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d x3d z2d y2d | x2d z1d y1d x1d */

    tE = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(1, 0, 1, 0)); /* z4a y4a x4a z3a */
    tF = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(3, 2, 3, 2)); /* z4b y4b x4b z3b */

    tG = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(1, 0, 1, 0)); /* z4c y4c x4c z3c */
    tH = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(3, 2, 3, 2)); /* z4d y4d x4d z3d */

    t1 = _mm256_sub_ps(t1, z2);
    t2 = _mm256_sub_ps(t2, t5);
    t3 = _mm256_sub_ps(t3, y1);
    t4 = _mm256_sub_ps(t4, x1);

    tA = _mm_sub_ps(tA, tE);
    tB = _mm_sub_ps(tB, tF);
    tC = _mm_sub_ps(tC, tG);
    tD = _mm_sub_ps(tD, tH);

    /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
    _mm256_storeu_ps(ptrA, t1);
    _mm256_storeu_ps(ptrB, t2);
    _mm256_storeu_ps(ptrC, t3);
    _mm256_storeu_ps(ptrD, t4);
    _mm_storeu_ps(ptrA+8, tA);
    _mm_storeu_ps(ptrB+8, tB);
    _mm_storeu_ps(ptrC+8, tC);
    _mm_storeu_ps(ptrD+8, tD);
}
#endif
static gmx_inline void
gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
                                          __m256 x1, __m256 y1, __m256 z1)
{
    __m256  t1, t2, t3, t4, t5, t6;
    __m256  tA, tB, tC, tD;
    __m128i mask;

    /* Construct a mask without executing any data loads */
    mask = _mm_blend_epi16(_mm_setzero_si128(), _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()), 0x3F);

    tA = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE, mask), gmx_mm_maskload_ps(ptrA, mask));
    tB = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF, mask), gmx_mm_maskload_ps(ptrB, mask));
    tC = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG, mask), gmx_mm_maskload_ps(ptrC, mask));
    tD = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH, mask), gmx_mm_maskload_ps(ptrD, mask));

    t1 = _mm256_unpacklo_ps(x1, y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
    t2 = _mm256_unpackhi_ps(x1, y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */

    t3 = _mm256_shuffle_ps(t1, z1, _MM_SHUFFLE(0, 0, 1, 0)); /* - z1e y1e x1e | - z1a y1a x1a */
    t4 = _mm256_shuffle_ps(t1, z1, _MM_SHUFFLE(0, 1, 3, 2)); /* - z1f y1f x1f | - z1b y1b x1b */
    t5 = _mm256_shuffle_ps(t2, z1, _MM_SHUFFLE(0, 2, 1, 0)); /* - z1g y1g x1g | - z1c y1c x1c */
    t6 = _mm256_shuffle_ps(t2, z1, _MM_SHUFFLE(0, 3, 3, 2)); /* - z1h y1h x1h | - z1d y1d x1d */

    tA = _mm256_sub_ps(tA, t3);
    tB = _mm256_sub_ps(tB, t4);
    tC = _mm256_sub_ps(tC, t5);
    tD = _mm256_sub_ps(tD, t6);

    gmx_mm_maskstore_ps(ptrA, mask, _mm256_castps256_ps128(tA));
    gmx_mm_maskstore_ps(ptrB, mask, _mm256_castps256_ps128(tB));
    gmx_mm_maskstore_ps(ptrC, mask, _mm256_castps256_ps128(tC));
    gmx_mm_maskstore_ps(ptrD, mask, _mm256_castps256_ps128(tD));
    gmx_mm_maskstore_ps(ptrE, mask, _mm256_extractf128_ps(tA, 0x1));
    gmx_mm_maskstore_ps(ptrF, mask, _mm256_extractf128_ps(tB, 0x1));
    gmx_mm_maskstore_ps(ptrG, mask, _mm256_extractf128_ps(tC, 0x1));
    gmx_mm_maskstore_ps(ptrH, mask, _mm256_extractf128_ps(tD, 0x1));
}
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
        __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
        \
        _tA  = _mm256_loadu_ps(ptrA); \
        _tB  = _mm256_loadu_ps(ptrB); \
        _tC  = _mm256_loadu_ps(ptrC); \
        _tD  = _mm256_loadu_ps(ptrD); \
        _tE  = _mm256_loadu_ps(ptrE); \
        _tF  = _mm256_loadu_ps(ptrF); \
        _tG  = _mm256_loadu_ps(ptrG); \
        _tH  = _mm256_loadu_ps(ptrH); \
        _t1  = _mm256_unpacklo_ps(_x1, _y1); \
        _t2  = _mm256_unpackhi_ps(_x1, _y1); \
        _t3  = _mm256_unpacklo_ps(_z1, _x2); \
        _t4  = _mm256_unpackhi_ps(_z1, _x2); \
        _t5  = _mm256_unpacklo_ps(_y2, _z2); \
        _t6  = _mm256_unpackhi_ps(_y2, _z2); \
        _t7  = _mm256_unpacklo_ps(_x3, _y3); \
        _t8  = _mm256_unpackhi_ps(_x3, _y3); \
        _t9  = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t10 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t11 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t12 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t1  = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t2  = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t3  = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t4  = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t5  = gmx_mm256_unpack128lo_ps(_t9, _t1); \
        _t6  = gmx_mm256_unpack128hi_ps(_t9, _t1); \
        _t7  = gmx_mm256_unpack128lo_ps(_t10, _t2); \
        _t8  = gmx_mm256_unpack128hi_ps(_t10, _t2); \
        _t1  = gmx_mm256_unpack128lo_ps(_t11, _t3); \
        _t2  = gmx_mm256_unpack128hi_ps(_t11, _t3); \
        _t9  = gmx_mm256_unpack128lo_ps(_t12, _t4); \
        _t10 = gmx_mm256_unpack128hi_ps(_t12, _t4); \
        _tA  = _mm256_sub_ps(_tA, _t5); \
        _tB  = _mm256_sub_ps(_tB, _t7); \
        _tC  = _mm256_sub_ps(_tC, _t1); \
        _tD  = _mm256_sub_ps(_tD, _t9); \
        _tE  = _mm256_sub_ps(_tE, _t6); \
        _tF  = _mm256_sub_ps(_tF, _t8); \
        _tG  = _mm256_sub_ps(_tG, _t2); \
        _tH  = _mm256_sub_ps(_tH, _t10); \
        _mm256_storeu_ps(ptrA, _tA); \
        _mm256_storeu_ps(ptrB, _tB); \
        _mm256_storeu_ps(ptrC, _tC); \
        _mm256_storeu_ps(ptrD, _tD); \
        _mm256_storeu_ps(ptrE, _tE); \
        _mm256_storeu_ps(ptrF, _tF); \
        _mm256_storeu_ps(ptrG, _tG); \
        _mm256_storeu_ps(ptrH, _tH); \
        _tI  = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8)); \
        _tJ  = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8)); \
        _tK  = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8)); \
        _tL  = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8)); \
        _tI  = _mm256_unpacklo_ps(_tI, _tK); \
        _tJ  = _mm256_unpacklo_ps(_tJ, _tL); \
        _tI  = _mm256_unpacklo_ps(_tI, _tJ); \
        _tI  = _mm256_sub_ps(_tI, _z3); \
        _tJ  = _mm256_permute_ps(_tI, _MM_SHUFFLE(1, 1, 1, 1)); \
        _tK  = _mm256_permute_ps(_tI, _MM_SHUFFLE(2, 2, 2, 2)); \
        _tL  = _mm256_permute_ps(_tI, _MM_SHUFFLE(3, 3, 3, 3)); \
        _mm_store_ss(ptrA+8, _mm256_castps256_ps128(_tI)); \
        _mm_store_ss(ptrB+8, _mm256_castps256_ps128(_tJ)); \
        _mm_store_ss(ptrC+8, _mm256_castps256_ps128(_tK)); \
        _mm_store_ss(ptrD+8, _mm256_castps256_ps128(_tL)); \
        _mm_store_ss(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
        _mm_store_ss(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
        _mm_store_ss(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
        _mm_store_ss(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
                                          __m256 x1, __m256 y1, __m256 z1,
                                          __m256 x2, __m256 y2, __m256 z2,
                                          __m256 x3, __m256 y3, __m256 z3)
{
    __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m256 tA, tB, tC, tD, tE, tF, tG, tH;
    __m256 tI, tJ, tK, tL;

    tA = _mm256_loadu_ps(ptrA);
    tB = _mm256_loadu_ps(ptrB);
    tC = _mm256_loadu_ps(ptrC);
    tD = _mm256_loadu_ps(ptrD);
    tE = _mm256_loadu_ps(ptrE);
    tF = _mm256_loadu_ps(ptrF);
    tG = _mm256_loadu_ps(ptrG);
    tH = _mm256_loadu_ps(ptrH);

    t1 = _mm256_unpacklo_ps(x1, y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
    t2 = _mm256_unpackhi_ps(x1, y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
    t3 = _mm256_unpacklo_ps(z1, x2); /* x2f z1f x2e z1e | x2b z1b x2a z1a */
    t4 = _mm256_unpackhi_ps(z1, x2); /* x2h z1h x2g z1g | x2d z1d x2c z1c */

    t5 = _mm256_unpacklo_ps(y2, z2); /* z2f y2f z2e y2e | z2b y2b z2a y2a */
    t6 = _mm256_unpackhi_ps(y2, z2); /* z2h y2h z2g y2g | z2d y2d z2c y2c */
    t7 = _mm256_unpacklo_ps(x3, y3); /* y3f x3f y3e x3e | y3b x3b y3a x3a */
    t8 = _mm256_unpackhi_ps(x3, y3); /* y3h x3h y3g x3g | y3d x3d y3c x3c */

    t9  = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
    t10 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
    t11 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
    t12 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */

    t1 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
    t2 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
    t3 = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
    t4 = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */

    t5  = gmx_mm256_unpack128lo_ps(t9, t1);  /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t6  = gmx_mm256_unpack128hi_ps(t9, t1);  /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t7  = gmx_mm256_unpack128lo_ps(t10, t2); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t8  = gmx_mm256_unpack128hi_ps(t10, t2); /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t1  = gmx_mm256_unpack128lo_ps(t11, t3); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t2  = gmx_mm256_unpack128hi_ps(t11, t3); /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t9  = gmx_mm256_unpack128lo_ps(t12, t4); /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t10 = gmx_mm256_unpack128hi_ps(t12, t4); /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    tA = _mm256_sub_ps(tA, t5);
    tB = _mm256_sub_ps(tB, t7);
    tC = _mm256_sub_ps(tC, t1);
    tD = _mm256_sub_ps(tD, t9);
    tE = _mm256_sub_ps(tE, t6);
    tF = _mm256_sub_ps(tF, t8);
    tG = _mm256_sub_ps(tG, t2);
    tH = _mm256_sub_ps(tH, t10);

    _mm256_storeu_ps(ptrA, tA);
    _mm256_storeu_ps(ptrB, tB);
    _mm256_storeu_ps(ptrC, tC);
    _mm256_storeu_ps(ptrD, tD);
    _mm256_storeu_ps(ptrE, tE);
    _mm256_storeu_ps(ptrF, tF);
    _mm256_storeu_ps(ptrG, tG);
    _mm256_storeu_ps(ptrH, tH);

    tI = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8));
    tJ = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8));
    tK = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8));
    tL = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8));

    tI = _mm256_unpacklo_ps(tI, tK); /* -  -  zG zE | -  -  zC zA */
    tJ = _mm256_unpacklo_ps(tJ, tL); /* -  -  zH zF | -  -  zD zB */
    tI = _mm256_unpacklo_ps(tI, tJ); /* zH zG zF zE | zD zC zB zA */

    tI = _mm256_sub_ps(tI, z3);
    tJ = _mm256_permute_ps(tI, _MM_SHUFFLE(1, 1, 1, 1));
    tK = _mm256_permute_ps(tI, _MM_SHUFFLE(2, 2, 2, 2));
    tL = _mm256_permute_ps(tI, _MM_SHUFFLE(3, 3, 3, 3));

    _mm_store_ss(ptrA+8, _mm256_castps256_ps128(tI));
    _mm_store_ss(ptrB+8, _mm256_castps256_ps128(tJ));
    _mm_store_ss(ptrC+8, _mm256_castps256_ps128(tK));
    _mm_store_ss(ptrD+8, _mm256_castps256_ps128(tL));
    _mm_store_ss(ptrE+8, _mm256_extractf128_ps(tI, 0x1));
    _mm_store_ss(ptrF+8, _mm256_extractf128_ps(tJ, 0x1));
    _mm_store_ss(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
    _mm_store_ss(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
}
#endif
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, \
                                                  _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
        __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
        \
        _tA  = _mm256_loadu_ps(ptrA); \
        _tB  = _mm256_loadu_ps(ptrB); \
        _tC  = _mm256_loadu_ps(ptrC); \
        _tD  = _mm256_loadu_ps(ptrD); \
        _tE  = _mm256_loadu_ps(ptrE); \
        _tF  = _mm256_loadu_ps(ptrF); \
        _tG  = _mm256_loadu_ps(ptrG); \
        _tH  = _mm256_loadu_ps(ptrH); \
        _t1  = _mm256_unpacklo_ps(_x1, _y1); \
        _t2  = _mm256_unpackhi_ps(_x1, _y1); \
        _t3  = _mm256_unpacklo_ps(_z1, _x2); \
        _t4  = _mm256_unpackhi_ps(_z1, _x2); \
        _t5  = _mm256_unpacklo_ps(_y2, _z2); \
        _t6  = _mm256_unpackhi_ps(_y2, _z2); \
        _t7  = _mm256_unpacklo_ps(_x3, _y3); \
        _t8  = _mm256_unpackhi_ps(_x3, _y3); \
        _t9  = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t10 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t11 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t12 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t1  = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t2  = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t3  = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t4  = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t5  = gmx_mm256_unpack128lo_ps(_t9, _t1); \
        _t6  = gmx_mm256_unpack128hi_ps(_t9, _t1); \
        _t7  = gmx_mm256_unpack128lo_ps(_t10, _t2); \
        _t8  = gmx_mm256_unpack128hi_ps(_t10, _t2); \
        _t1  = gmx_mm256_unpack128lo_ps(_t11, _t3); \
        _t2  = gmx_mm256_unpack128hi_ps(_t11, _t3); \
        _t9  = gmx_mm256_unpack128lo_ps(_t12, _t4); \
        _t10 = gmx_mm256_unpack128hi_ps(_t12, _t4); \
        _tA  = _mm256_sub_ps(_tA, _t5); \
        _tB  = _mm256_sub_ps(_tB, _t7); \
        _tC  = _mm256_sub_ps(_tC, _t1); \
        _tD  = _mm256_sub_ps(_tD, _t9); \
        _tE  = _mm256_sub_ps(_tE, _t6); \
        _tF  = _mm256_sub_ps(_tF, _t8); \
        _tG  = _mm256_sub_ps(_tG, _t2); \
        _tH  = _mm256_sub_ps(_tH, _t10); \
        _mm256_storeu_ps(ptrA, _tA); \
        _mm256_storeu_ps(ptrB, _tB); \
        _mm256_storeu_ps(ptrC, _tC); \
        _mm256_storeu_ps(ptrD, _tD); \
        _mm256_storeu_ps(ptrE, _tE); \
        _mm256_storeu_ps(ptrF, _tF); \
        _mm256_storeu_ps(ptrG, _tG); \
        _mm256_storeu_ps(ptrH, _tH); \
        _tI  = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8)); \
        _tJ  = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8)); \
        _tK  = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8)); \
        _tL  = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8)); \
        _t1  = _mm256_unpacklo_ps(_z3, _x4); \
        _t2  = _mm256_unpackhi_ps(_z3, _x4); \
        _t3  = _mm256_unpacklo_ps(_y4, _z4); \
        _t4  = _mm256_unpackhi_ps(_y4, _z4); \
        _t5  = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t6  = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
        _t7  = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
        _t8  = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
        _tI  = _mm256_sub_ps(_tI, _t5); \
        _tJ  = _mm256_sub_ps(_tJ, _t6); \
        _tK  = _mm256_sub_ps(_tK, _t7); \
        _tL  = _mm256_sub_ps(_tL, _t8); \
        _mm_storeu_ps(ptrA+8, _mm256_castps256_ps128(_tI)); \
        _mm_storeu_ps(ptrB+8, _mm256_castps256_ps128(_tJ)); \
        _mm_storeu_ps(ptrC+8, _mm256_castps256_ps128(_tK)); \
        _mm_storeu_ps(ptrD+8, _mm256_castps256_ps128(_tL)); \
        _mm_storeu_ps(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
        _mm_storeu_ps(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
        _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
        _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
    }
#else
1165 /* Real function for sane compilers */
1166 static gmx_inline void
1167 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
1168 float * gmx_restrict ptrC, float * gmx_restrict ptrD,
1169 float * gmx_restrict ptrE, float * gmx_restrict ptrF,
1170 float * gmx_restrict ptrG, float * gmx_restrict ptrH,
1171 __m256 x1, __m256 y1, __m256 z1,
1172 __m256 x2, __m256 y2, __m256 z2,
1173 __m256 x3, __m256 y3, __m256 z3,
1174 __m256 x4, __m256 y4, __m256 z4)
1176 __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
1177 __m256 tA, tB, tC, tD, tE, tF, tG, tH;
1178 __m256 tI, tJ, tK, tL;
1180 tA = _mm256_loadu_ps(ptrA);
1181 tB = _mm256_loadu_ps(ptrB);
1182 tC = _mm256_loadu_ps(ptrC);
1183 tD = _mm256_loadu_ps(ptrD);
1184 tE = _mm256_loadu_ps(ptrE);
1185 tF = _mm256_loadu_ps(ptrF);
1186 tG = _mm256_loadu_ps(ptrG);
1187 tH = _mm256_loadu_ps(ptrH);
1189 t1 = _mm256_unpacklo_ps(x1, y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
1190 t2 = _mm256_unpackhi_ps(x1, y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
1191 t3 = _mm256_unpacklo_ps(z1, x2); /* x2f z1f x2e z1e | x2b z1b x2a z1a */
    t4  = _mm256_unpackhi_ps(z1, x2);                         /* x2h z1h x2g z1g | x2d z1d x2c z1c */
    t5  = _mm256_unpacklo_ps(y2, z2);                         /* z2f y2f z2e y2e | z2b y2b z2a y2a */
    t6  = _mm256_unpackhi_ps(y2, z2);                         /* z2h y2h z2g y2g | z2d y2d z2c y2c */
    t7  = _mm256_unpacklo_ps(x3, y3);                         /* y3f x3f y3e x3e | y3b x3b y3a x3a */
    t8  = _mm256_unpackhi_ps(x3, y3);                         /* y3h x3h y3g x3g | y3d x3d y3c x3c */

    t9  = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
    t10 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
    t11 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
    t12 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */

    t1  = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
    t2  = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
    t3  = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
    t4  = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */

    t5  = gmx_mm256_unpack128lo_ps(t9, t1);                   /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t6  = gmx_mm256_unpack128hi_ps(t9, t1);                   /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t7  = gmx_mm256_unpack128lo_ps(t10, t2);                  /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t8  = gmx_mm256_unpack128hi_ps(t10, t2);                  /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t1  = gmx_mm256_unpack128lo_ps(t11, t3);                  /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t2  = gmx_mm256_unpack128hi_ps(t11, t3);                  /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t9  = gmx_mm256_unpack128lo_ps(t12, t4);                  /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t10 = gmx_mm256_unpack128hi_ps(t12, t4);                  /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    tA  = _mm256_sub_ps(tA, t5);
    tB  = _mm256_sub_ps(tB, t7);
    tC  = _mm256_sub_ps(tC, t1);
    tD  = _mm256_sub_ps(tD, t9);
    tE  = _mm256_sub_ps(tE, t6);
    tF  = _mm256_sub_ps(tF, t8);
    tG  = _mm256_sub_ps(tG, t2);
    tH  = _mm256_sub_ps(tH, t10);

    _mm256_storeu_ps(ptrA, tA);
    _mm256_storeu_ps(ptrB, tB);
    _mm256_storeu_ps(ptrC, tC);
    _mm256_storeu_ps(ptrD, tD);
    _mm256_storeu_ps(ptrE, tE);
    _mm256_storeu_ps(ptrF, tF);
    _mm256_storeu_ps(ptrG, tG);
    _mm256_storeu_ps(ptrH, tH);

    tI  = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8));
    tJ  = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8));
    tK  = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8));
    tL  = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8));

    t1  = _mm256_unpacklo_ps(z3, x4);                         /* x4f z3f x4e z3e | x4b z3b x4a z3a */
    t2  = _mm256_unpackhi_ps(z3, x4);                         /* x4h z3h x4g z3g | x4d z3d x4c z3c */
    t3  = _mm256_unpacklo_ps(y4, z4);                         /* z4f y4f z4e y4e | z4b y4b z4a y4a */
    t4  = _mm256_unpackhi_ps(y4, z4);                         /* z4h y4h z4g y4g | z4d y4d z4c y4c */

    t5  = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* z4e y4e x4e z3e | z4a y4a x4a z3a */
    t6  = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* z4f y4f x4f z3f | z4b y4b x4b z3b */
    t7  = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* z4g y4g x4g z3g | z4c y4c x4c z3c */
    t8  = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* z4h y4h x4h z3h | z4d y4d x4d z3d */

    tI  = _mm256_sub_ps(tI, t5);
    tJ  = _mm256_sub_ps(tJ, t6);
    tK  = _mm256_sub_ps(tK, t7);
    tL  = _mm256_sub_ps(tL, t8);

    _mm_storeu_ps(ptrA+8, _mm256_castps256_ps128(tI));
    _mm_storeu_ps(ptrB+8, _mm256_castps256_ps128(tJ));
    _mm_storeu_ps(ptrC+8, _mm256_castps256_ps128(tK));
    _mm_storeu_ps(ptrD+8, _mm256_castps256_ps128(tL));
    _mm_storeu_ps(ptrE+8, _mm256_extractf128_ps(tI, 0x1));
    _mm_storeu_ps(ptrF+8, _mm256_extractf128_ps(tJ, 0x1));
    _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
    _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
}
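
/* For reference, a scalar sketch of the tail store-back above (illustrative
 * only; the helper name is hypothetical and not part of the kernel API).
 * Each of the eight particle pointers has the last four of its twelve force
 * components decremented, with the subtrahends taken from the matching lane
 * of z3/x4/y4/z4:
 *
 *     static void decrement_tail_scalar(float *p, float z3, float x4,
 *                                       float y4, float z4)
 *     {
 *         p[8]  -= z3;
 *         p[9]  -= x4;
 *         p[10] -= y4;
 *         p[11] -= z4;
 *     }
 */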
static gmx_inline void
gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                         float * gmx_restrict fptr,
                                         float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3;

    fix1 = _mm256_hadd_ps(fix1, fix1);
    fiy1 = _mm256_hadd_ps(fiy1, fiz1);
    fix1 = _mm256_hadd_ps(fix1, fiy1); /* fiz1 fiy1 fix1 fix1 (in both lanes) */

    /* Add across the two lanes */
    t1 = _mm_add_ps(_mm256_castps256_ps128(fix1), _mm256_extractf128_ps(fix1, 0x1));

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2, (__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3, (__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2, t1);
    t3 = _mm_add_ps(t3, t1);

    _mm_store_ss(fptr, t2);
    _mm_storeh_pi((__m64 *)(fptr+1), t2);
    _mm_store_ss(fshiftptr, t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}
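
/* Scalar equivalent of the swizzled reduction above (a reference sketch, not
 * used by the kernels; the name is hypothetical). It sums the eight per-pair
 * force components held in each ymm register and accumulates the totals into
 * both the i-atom force and the shift force:
 *
 *     static void update_iforce_1atom_scalar(const float fx[8], const float fy[8],
 *                                            const float fz[8],
 *                                            float *fptr, float *fshiftptr)
 *     {
 *         float sx = 0, sy = 0, sz = 0;
 *         int   k;
 *         for (k = 0; k < 8; k++)
 *         {
 *             sx += fx[k]; sy += fy[k]; sz += fz[k];
 *         }
 *         fptr[0]      += sx; fptr[1]      += sy; fptr[2]      += sz;
 *         fshiftptr[0] += sx; fshiftptr[1] += sy; fshiftptr[2] += sz;
 *     }
 */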
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                                 fptr, fshiftptr) \
    { \
        __m256 _t1, _t2, _t3; \
        __m128 _tA, _tB, _tC; \
        \
        fix1 = _mm256_hadd_ps(fix1, fiy1); \
        fiz1 = _mm256_hadd_ps(fiz1, fix2); \
        fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
        fix3 = _mm256_hadd_ps(fix3, fiy3); \
        fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
        fix1 = _mm256_hadd_ps(fix1, fiz1); \
        fiy2 = _mm256_hadd_ps(fiy2, fix3); \
        fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
        \
        _t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
        _t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
        _t1 = _mm256_add_ps(_t1, _t2); \
        _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
        _t3 = _mm256_loadu_ps(fptr); \
        _t3 = _mm256_add_ps(_t3, _t1); \
        _mm256_storeu_ps(fptr, _t3); \
        _tB = _mm_load_ss(fptr+8); \
        _tB = _mm_add_ss(_tB, _tA); \
        _mm_store_ss(fptr+8, _tB); \
        \
        _tB = _mm256_extractf128_ps(_t1, 0x1); \
        _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
        _tB = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
        _tC = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
        _tB = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
        _tA = _mm_add_ps(_tB, _tC); \
        _tA = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
        _tC = _mm_loadu_ps(fshiftptr); \
        _tC = _mm_add_ps(_tC, _tA); \
        _mm_storeu_ps(fshiftptr, _tC); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
                                         float * gmx_restrict fptr,
                                         float * gmx_restrict fshiftptr)
{
    __m256 t1, t2, t3;
    __m128 tA, tB, tC;

    fix1 = _mm256_hadd_ps(fix1, fiy1);                /* Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
    fiz1 = _mm256_hadd_ps(fiz1, fix2);                /* X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
    fiy2 = _mm256_hadd_ps(fiy2, fiz2);                /* Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
    fix3 = _mm256_hadd_ps(fix3, fiy3);                /* Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
    fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); /* 0 0 Z3g+Z3h Z3e+Z3f | 0 0 Z3c+Z3d Z3a+Z3b */

    fix1 = _mm256_hadd_ps(fix1, fiz1);                /* X2e-h Z1e-h Y1e-h X1e-h | X2a-d Z1a-d Y1a-d X1a-d */
    fiy2 = _mm256_hadd_ps(fiy2, fix3);                /* Y3e-h X3e-h Z2e-h Y2e-h | Y3a-d X3a-d Z2a-d Y2a-d */
    fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); /* 0 0 0 Z3e-h | 0 0 0 Z3a-d */

    /* Add across the two lanes by swapping and adding back */
    t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2);        /* Y3a-d X3a-d Z2a-d Y2a-d | X2a-d Z1a-d Y1a-d X1a-d */
    t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2);        /* Y3e-h X3e-h Z2e-h Y2e-h | X2e-h Z1e-h Y1e-h X1e-h */
    t1 = _mm256_add_ps(t1, t2);                       /* y3 x3 z2 y2 | x2 z1 y1 x1 */

    tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); /* 0 0 0 z3 */

    t3 = _mm256_loadu_ps(fptr);
    t3 = _mm256_add_ps(t3, t1);
    _mm256_storeu_ps(fptr, t3);
    tB = _mm_load_ss(fptr+8);
    tB = _mm_add_ss(tB, tA);
    _mm_store_ss(fptr+8, tB);

    /* Add up shift force */
    tB = _mm256_extractf128_ps(t1, 0x1);              /* y3 x3 z2 y2 */
    tC = _mm_shuffle_ps(_mm256_castps256_ps128(t1), tB, _MM_SHUFFLE(1, 0, 3, 3)); /* z2 y2 x2 x2 */
    tB = _mm_shuffle_ps(tB, tA, _MM_SHUFFLE(1, 0, 3, 2));                         /* 0 z3 y3 x3 */
    tC = _mm_permute_ps(tC, _MM_SHUFFLE(3, 3, 2, 0));                             /* - z2 y2 x2 */

    tB = _mm_add_ps(tB, _mm256_castps256_ps128(t1));
    tA = _mm_add_ps(tB, tC);                          /* - z y x */

    tA = _mm_blend_ps(_mm_setzero_ps(), tA, 0x7);     /* 0 z y x */

    tC = _mm_loadu_ps(fshiftptr);
    tC = _mm_add_ps(tC, tA);
    _mm_storeu_ps(fshiftptr, tC);
}
#endif
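
/* The 3-atom variant above assumes fptr addresses nine contiguous floats laid
 * out x1 y1 z1 x2 y2 z2 x3 y3 z3, and the shift force receives the
 * per-dimension total over all three atoms. A scalar sketch under those
 * assumptions (the name and the f[9][8] layout are illustrative only):
 *
 *     static void update_iforce_3atom_scalar(const float f[9][8],
 *                                            float *fptr, float *fshiftptr)
 *     {
 *         int d, k;
 *         for (d = 0; d < 9; d++)
 *         {
 *             float s = 0;
 *             for (k = 0; k < 8; k++)
 *             {
 *                 s += f[d][k];
 *             }
 *             fptr[d]          += s;
 *             fshiftptr[d % 3] += s;
 *         }
 *     }
 *
 * so the shift-force x component accumulates x1+x2+x3, and similarly for y
 * and z. The 4-atom variant below follows the same pattern with twelve
 * components and fptr[0..11].
 */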
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                                 fptr, fshiftptr) \
    { \
        __m256 _t1, _t2, _t3; \
        __m128 _tA, _tB, _tC; \
        \
        fix1 = _mm256_hadd_ps(fix1, fiy1); \
        fiz1 = _mm256_hadd_ps(fiz1, fix2); \
        fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
        fix3 = _mm256_hadd_ps(fix3, fiy3); \
        fiz3 = _mm256_hadd_ps(fiz3, fix4); \
        fiy4 = _mm256_hadd_ps(fiy4, fiz4); \
        \
        fix1 = _mm256_hadd_ps(fix1, fiz1); \
        fiy2 = _mm256_hadd_ps(fiy2, fix3); \
        fiz3 = _mm256_hadd_ps(fiz3, fiy4); \
        \
        _t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
        _t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
        _t1 = _mm256_add_ps(_t1, _t2); \
        _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
        _t3 = _mm256_loadu_ps(fptr); \
        _t3 = _mm256_add_ps(_t3, _t1); \
        _mm256_storeu_ps(fptr, _t3); \
        _tB = _mm_loadu_ps(fptr+8); \
        _tB = _mm_add_ps(_tB, _tA); \
        _mm_storeu_ps(fptr+8, _tB); \
        \
        _tB = _mm256_extractf128_ps(_t1, 0x1); \
        _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
        _tB = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
        _tC = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
        _tA = _mm_permute_ps(_tA, _MM_SHUFFLE(0, 3, 2, 1)); \
        _tB = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
        _tA = _mm_add_ps(_tA, _tC); \
        _tA = _mm_add_ps(_tA, _tB); \
        _tA = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
        _tC = _mm_loadu_ps(fshiftptr); \
        _tC = _mm_add_ps(_tC, _tA); \
        _mm_storeu_ps(fshiftptr, _tC); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
                                         __m256 fix4, __m256 fiy4, __m256 fiz4,
                                         float * gmx_restrict fptr,
                                         float * gmx_restrict fshiftptr)
{
    __m256 t1, t2, t3;
    __m128 tA, tB, tC;

    fix1 = _mm256_hadd_ps(fix1, fiy1);                /* Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
    fiz1 = _mm256_hadd_ps(fiz1, fix2);                /* X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
    fiy2 = _mm256_hadd_ps(fiy2, fiz2);                /* Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
    fix3 = _mm256_hadd_ps(fix3, fiy3);                /* Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
    fiz3 = _mm256_hadd_ps(fiz3, fix4);                /* X4g+X4h X4e+X4f Z3g+Z3h Z3e+Z3f | X4c+X4d X4a+X4b Z3c+Z3d Z3a+Z3b */
    fiy4 = _mm256_hadd_ps(fiy4, fiz4);                /* Z4g+Z4h Z4e+Z4f Y4g+Y4h Y4e+Y4f | Z4c+Z4d Z4a+Z4b Y4c+Y4d Y4a+Y4b */

    fix1 = _mm256_hadd_ps(fix1, fiz1);                /* X2e-h Z1e-h Y1e-h X1e-h | X2a-d Z1a-d Y1a-d X1a-d */
    fiy2 = _mm256_hadd_ps(fiy2, fix3);                /* Y3e-h X3e-h Z2e-h Y2e-h | Y3a-d X3a-d Z2a-d Y2a-d */
    fiz3 = _mm256_hadd_ps(fiz3, fiy4);                /* Z4e-h Y4e-h X4e-h Z3e-h | Z4a-d Y4a-d X4a-d Z3a-d */

    /* Add across the two lanes by swapping and adding back */
    t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2);        /* Y3a-d X3a-d Z2a-d Y2a-d | X2a-d Z1a-d Y1a-d X1a-d */
    t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2);        /* Y3e-h X3e-h Z2e-h Y2e-h | X2e-h Z1e-h Y1e-h X1e-h */
    t1 = _mm256_add_ps(t1, t2);                       /* y3 x3 z2 y2 | x2 z1 y1 x1 */

    tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); /* z4 y4 x4 z3 */

    t3 = _mm256_loadu_ps(fptr);
    t3 = _mm256_add_ps(t3, t1);
    _mm256_storeu_ps(fptr, t3);

    tB = _mm_loadu_ps(fptr+8);
    tB = _mm_add_ps(tB, tA);
    _mm_storeu_ps(fptr+8, tB);

    /* Add up shift force */
    tB = _mm256_extractf128_ps(t1, 0x1);              /* y3 x3 z2 y2 */
    tC = _mm_shuffle_ps(_mm256_castps256_ps128(t1), tB, _MM_SHUFFLE(1, 0, 3, 3)); /* z2 y2 x2 x2 */
    tB = _mm_shuffle_ps(tB, tA, _MM_SHUFFLE(1, 0, 3, 2));                         /* x4 z3 y3 x3 */
    tC = _mm_permute_ps(tC, _MM_SHUFFLE(3, 3, 2, 0));                             /* - z2 y2 x2 */
    tA = _mm_permute_ps(tA, _MM_SHUFFLE(0, 3, 2, 1));                             /* - z4 y4 x4 */

    tB = _mm_add_ps(tB, _mm256_castps256_ps128(t1));
    tA = _mm_add_ps(tA, tC);
    tA = _mm_add_ps(tA, tB);

    tA = _mm_blend_ps(_mm_setzero_ps(), tA, 0x7);     /* 0 z y x */

    tC = _mm_loadu_ps(fshiftptr);
    tC = _mm_add_ps(tC, tA);
    _mm_storeu_ps(fshiftptr, tC);
}
#endif
static gmx_inline void
gmx_mm256_update_1pot_ps(__m256 pot1, float * gmx_restrict ptrA)
{
    __m128 t1;

    pot1 = _mm256_hadd_ps(pot1, pot1);
    pot1 = _mm256_hadd_ps(pot1, pot1);

    t1 = _mm_add_ps(_mm256_castps256_ps128(pot1), _mm256_extractf128_ps(pot1, 0x1));

    _mm_store_ss(ptrA, _mm_add_ss(_mm_load_ss(ptrA), t1));
}
static gmx_inline void
gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
                         __m256 pot2, float * gmx_restrict ptrB)
{
    __m128 t1, t2;

    pot1 = _mm256_hadd_ps(pot1, pot2);
    pot1 = _mm256_hadd_ps(pot1, pot1);

    t1 = _mm_add_ps(_mm256_castps256_ps128(pot1), _mm256_extractf128_ps(pot1, 0x1));

    t2 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    _mm_store_ss(ptrA, _mm_add_ss(_mm_load_ss(ptrA), t1));
    _mm_store_ss(ptrB, _mm_add_ss(_mm_load_ss(ptrB), t2));
}
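
/* Scalar view of the two potential reductions above (reference sketch only;
 * the helper name is hypothetical): each sums the eight lanes of an
 * accumulator and adds the total to a single float in memory. The 2pot
 * variant simply does this for two accumulator/pointer pairs at once.
 *
 *     static void update_pot_scalar(const float pot[8], float *ptr)
 *     {
 *         float s = 0;
 *         int   k;
 *         for (k = 0; k < 8; k++)
 *         {
 *             s += pot[k];
 *         }
 *         *ptr += s;
 *     }
 */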
#endif /* _kernelutil_x86_avx_256_single_h_ */