/*
 * This source code is part of
 *
 *                G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifndef _kernelutil_x86_avx_256_single_h_
#define _kernelutil_x86_avx_256_single_h_

#include "gmx_x86_avx_256.h"

/* Transpose lower/upper half of 256-bit registers separately */
#define GMX_MM256_HALFTRANSPOSE4_PS(ymm0, ymm1, ymm2, ymm3) {                    \
        __m256 __tmp0, __tmp1, __tmp2, __tmp3;                                   \
                                                                                  \
        __tmp0 = _mm256_unpacklo_ps((ymm0), (ymm1));                             \
        __tmp1 = _mm256_unpacklo_ps((ymm2), (ymm3));                             \
        __tmp2 = _mm256_unpackhi_ps((ymm0), (ymm1));                             \
        __tmp3 = _mm256_unpackhi_ps((ymm2), (ymm3));                             \
        ymm0   = _mm256_shuffle_ps(__tmp0, __tmp1, _MM_SHUFFLE(1, 0, 1, 0));     \
        ymm1   = _mm256_shuffle_ps(__tmp0, __tmp1, _MM_SHUFFLE(3, 2, 3, 2));     \
        ymm2   = _mm256_shuffle_ps(__tmp2, __tmp3, _MM_SHUFFLE(1, 0, 1, 0));     \
        ymm3   = _mm256_shuffle_ps(__tmp2, __tmp3, _MM_SHUFFLE(3, 2, 3, 2));     \
}
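/* After the half-transpose, ymm0 holds element 0 of each of the four inputs in
 * its lower lane and element 4 in its upper lane, ymm1 holds elements 1/5,
 * ymm2 holds elements 2/6 and ymm3 holds elements 3/7, i.e. the two 128-bit
 * halves are transposed independently of each other.
 */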

static gmx_inline __m256
gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz)
{
    return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy) ), _mm256_mul_ps(dz, dz) );
}

/* Normal sum of four ymm registers */
#define gmx_mm256_sum4_ps(t0, t1, t2, t3) _mm256_add_ps(_mm256_add_ps(t0, t1), _mm256_add_ps(t2, t3))

static gmx_inline int
gmx_mm256_any_lt(__m256 a, __m256 b)
{
    return _mm256_movemask_ps(_mm256_cmp_ps(a, b, _CMP_LT_OQ));
}
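/* Illustrative sketch (not part of the original kernels): a cutoff check in a
 * kernel inner loop could combine the helpers above roughly as
 *
 *     dx  = _mm256_sub_ps(ix1, jx1);
 *     dy  = _mm256_sub_ps(iy1, jy1);
 *     dz  = _mm256_sub_ps(iz1, jz1);
 *     rsq = gmx_mm256_calc_rsq_ps(dx, dy, dz);
 *     if (gmx_mm256_any_lt(rsq, rcutoff2)) { ... }
 *
 * where ix1/iy1/iz1, jx1/jy1/jz1 and rcutoff2 are hypothetical local __m256
 * variables holding coordinates and the squared cutoff.
 */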

static gmx_inline __m256
gmx_mm256_load_4real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                const float * gmx_restrict ptrC, const float * gmx_restrict ptrD)
{
    __m128 t1, t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA), _mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB), _mm_load_ss(ptrD));
    return _mm256_castps128_ps256(_mm_unpacklo_ps(t1, t2));
}
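/* The four values end up as {*ptrA, *ptrB, *ptrC, *ptrD} in the lower 128-bit
 * lane of the result; the upper lane is undefined and must not be relied on.
 */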

static gmx_inline __m256
gmx_mm256_load_8real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                const float * gmx_restrict ptrG, const float * gmx_restrict ptrH)
{
    __m256 t1, t2;

    t1 = gmx_mm256_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
    t2 = gmx_mm256_load_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH);

    return _mm256_permute2f128_ps(t1, t2, 0x20);
}
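/* _mm256_permute2f128_ps with control 0x20 concatenates the lower lane of t1
 * (values A-D) with the lower lane of t2 (values E-H), discarding the
 * undefined upper lanes, so the result is {A, B, C, D, E, F, G, H}.
 */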

static gmx_inline void
gmx_mm256_store_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                 float * gmx_restrict ptrC, float * gmx_restrict ptrD, __m256 xmm1)
{
    __m256 t2, t3, t4;

    t2 = _mm256_permute_ps(xmm1, _MM_SHUFFLE(1, 1, 1, 1));
    t3 = _mm256_permute_ps(xmm1, _MM_SHUFFLE(2, 2, 2, 2));
    t4 = _mm256_permute_ps(xmm1, _MM_SHUFFLE(3, 3, 3, 3));
    _mm_store_ss(ptrA, _mm256_castps256_ps128(xmm1));
    _mm_store_ss(ptrB, _mm256_castps256_ps128(t2));
    _mm_store_ss(ptrC, _mm256_castps256_ps128(t3));
    _mm_store_ss(ptrD, _mm256_castps256_ps128(t4));
}

static gmx_inline void
gmx_mm256_store_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                 float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                 float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                 float * gmx_restrict ptrG, float * gmx_restrict ptrH, __m256 xmm1)
{
    __m256 t1;

    t1 = _mm256_permute2f128_ps(xmm1, xmm1, 0x11);

    gmx_mm256_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, xmm1);
    gmx_mm256_store_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH, t1);
}

static gmx_inline void
gmx_mm256_increment_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                     float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                     __m256 xmm1)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm256_castps256_ps128(xmm1);
    t2 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    t3 = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    t4 = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));

    t1 = _mm_add_ss(t1, _mm_load_ss(ptrA));
    t2 = _mm_add_ss(t2, _mm_load_ss(ptrB));
    t3 = _mm_add_ss(t3, _mm_load_ss(ptrC));
    t4 = _mm_add_ss(t4, _mm_load_ss(ptrD));

    _mm_store_ss(ptrA, t1);
    _mm_store_ss(ptrB, t2);
    _mm_store_ss(ptrC, t3);
    _mm_store_ss(ptrD, t4);
}

static gmx_inline void
gmx_mm256_increment_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                     float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                     float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                     float * gmx_restrict ptrG, float * gmx_restrict ptrH,
                                     __m256 xmm1)
{
    __m256 t1;

    t1 = _mm256_permute2f128_ps(xmm1, xmm1, 0x11);

    gmx_mm256_increment_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, xmm1);
    gmx_mm256_increment_4real_swizzle_ps(ptrE, ptrF, ptrG, ptrH, t1);
}

static gmx_inline void
gmx_mm256_load_4pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
                                const float * gmx_restrict p3, const float * gmx_restrict p4,
                                __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p1); /* - - c12a c6a */
    t2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p2); /* - - c12b c6b */
    t3 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p3); /* - - c12c c6c */
    t4 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p4); /* - - c12d c6d */

    t1 = _mm_unpacklo_ps(t1, t2); /* c12b c12a c6b c6a */
    t3 = _mm_unpacklo_ps(t3, t4); /* c12d c12c c6d c6c */

    *c6  = _mm256_castps128_ps256(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)));
    *c12 = _mm256_castps128_ps256(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)));
}
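/* The lower lanes of *c6 and *c12 now hold {c6a,c6b,c6c,c6d} and
 * {c12a,c12b,c12c,c12d}; the upper lanes are undefined.
 */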

static gmx_inline void
gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
                                const float * gmx_restrict p3, const float * gmx_restrict p4,
                                const float * gmx_restrict p5, const float * gmx_restrict p6,
                                const float * gmx_restrict p7, const float * gmx_restrict p8,
                                __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
{
    __m256 c6l, c6h, c12l, c12h;

    gmx_mm256_load_4pair_swizzle_ps(p1, p2, p3, p4, &c6l, &c12l);
    gmx_mm256_load_4pair_swizzle_ps(p5, p6, p7, p8, &c6h, &c12h);

    *c6  = _mm256_permute2f128_ps(c6l, c6h, 0x20);
    *c12 = _mm256_permute2f128_ps(c12l, c12h, 0x20);
}

static gmx_inline void
gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                            const float * gmx_restrict xyz,
                                            __m256 * gmx_restrict x1,
                                            __m256 * gmx_restrict y1,
                                            __m256 * gmx_restrict z1)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    t2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz);
    t3 = _mm_load_ss(xyz_shift+2);
    t4 = _mm_load_ss(xyz+2);
    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ss(t3, t4);

    t2 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    t1 = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    t3 = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));

    *x1 = gmx_mm256_set_m128(t1, t1);
    *y1 = gmx_mm256_set_m128(t2, t2);
    *z1 = gmx_mm256_set_m128(t3, t3);
}
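/* Each output register holds the shifted coordinate component broadcast to
 * all eight elements, ready to be combined with eight neighbour coordinates.
 */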

static gmx_inline void
gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                            const float * gmx_restrict xyz,
                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9;

    tA = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_load_ss(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ss(t3, t6);

    t9 = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
    t8 = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    t7 = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    t6 = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    t5 = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    t4 = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    t3 = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    t2 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    t1 = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));

    *x1 = gmx_mm256_set_m128(t1, t1);
    *y1 = gmx_mm256_set_m128(t2, t2);
    *z1 = gmx_mm256_set_m128(t3, t3);
    *x2 = gmx_mm256_set_m128(t4, t4);
    *y2 = gmx_mm256_set_m128(t5, t5);
    *z2 = gmx_mm256_set_m128(t6, t6);
    *x3 = gmx_mm256_set_m128(t7, t7);
    *y3 = gmx_mm256_set_m128(t8, t8);
    *z3 = gmx_mm256_set_m128(t9, t9);
}

static gmx_inline void
gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                            const float * gmx_restrict xyz,
                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
                                            __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;

    tA = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_loadu_ps(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ps(t3, t6);

    t12 = _mm_permute_ps(t3, _MM_SHUFFLE(3, 3, 3, 3));
    t11 = _mm_permute_ps(t3, _MM_SHUFFLE(2, 2, 2, 2));
    t10 = _mm_permute_ps(t3, _MM_SHUFFLE(1, 1, 1, 1));
    t9  = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
    t8  = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    t7  = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    t6  = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    t5  = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    t4  = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    t3  = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    t2  = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    t1  = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));

    *x1 = gmx_mm256_set_m128(t1, t1);
    *y1 = gmx_mm256_set_m128(t2, t2);
    *z1 = gmx_mm256_set_m128(t3, t3);
    *x2 = gmx_mm256_set_m128(t4, t4);
    *y2 = gmx_mm256_set_m128(t5, t5);
    *z2 = gmx_mm256_set_m128(t6, t6);
    *x3 = gmx_mm256_set_m128(t7, t7);
    *y3 = gmx_mm256_set_m128(t8, t8);
    *z3 = gmx_mm256_set_m128(t9, t9);
    *x4 = gmx_mm256_set_m128(t10, t10);
    *y4 = gmx_mm256_set_m128(t11, t11);
    *z4 = gmx_mm256_set_m128(t12, t12);
}

static gmx_inline void
gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
{
    __m128  t1, t2, t3, t4;
    __m128i mask = _mm_set_epi32(0, -1, -1, -1);

    t1 = gmx_mm_maskload_ps(ptrA, mask);
    t2 = gmx_mm_maskload_ps(ptrB, mask);
    t3 = gmx_mm_maskload_ps(ptrC, mask);
    t4 = gmx_mm_maskload_ps(ptrD, mask);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = _mm256_castps128_ps256(t1);
    *y1 = _mm256_castps128_ps256(t2);
    *z1 = _mm256_castps128_ps256(t3);
}

static gmx_inline void
gmx_mm256_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrB);
    t3 = _mm_loadu_ps(ptrC);
    t4 = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = _mm256_castps128_ps256(t1);
    *y1 = _mm256_castps128_ps256(t2);
    *z1 = _mm256_castps128_ps256(t3);
    *x2 = _mm256_castps128_ps256(t4);
    t1 = _mm_loadu_ps(ptrA+4);
    t2 = _mm_loadu_ps(ptrB+4);
    t3 = _mm_loadu_ps(ptrC+4);
    t4 = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = _mm256_castps128_ps256(t1);
    *z2 = _mm256_castps128_ps256(t2);
    *x3 = _mm256_castps128_ps256(t3);
    *y3 = _mm256_castps128_ps256(t4);
    t1 = _mm_load_ss(ptrA+8);
    t2 = _mm_load_ss(ptrB+8);
    t3 = _mm_load_ss(ptrC+8);
    t4 = _mm_load_ss(ptrD+8);
    t1 = _mm_unpacklo_ps(t1, t3);
    t3 = _mm_unpacklo_ps(t2, t4);
    *z3 = _mm256_castps128_ps256(_mm_unpacklo_ps(t1, t3));
}

static gmx_inline void
gmx_mm256_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
                                     __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrB);
    t3 = _mm_loadu_ps(ptrC);
    t4 = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = _mm256_castps128_ps256(t1);
    *y1 = _mm256_castps128_ps256(t2);
    *z1 = _mm256_castps128_ps256(t3);
    *x2 = _mm256_castps128_ps256(t4);
    t1 = _mm_loadu_ps(ptrA+4);
    t2 = _mm_loadu_ps(ptrB+4);
    t3 = _mm_loadu_ps(ptrC+4);
    t4 = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = _mm256_castps128_ps256(t1);
    *z2 = _mm256_castps128_ps256(t2);
    *x3 = _mm256_castps128_ps256(t3);
    *y3 = _mm256_castps128_ps256(t4);
    t1 = _mm_loadu_ps(ptrA+8);
    t2 = _mm_loadu_ps(ptrB+8);
    t3 = _mm_loadu_ps(ptrC+8);
    t4 = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *z3 = _mm256_castps128_ps256(t1);
    *x4 = _mm256_castps128_ps256(t2);
    *y4 = _mm256_castps128_ps256(t3);
    *z4 = _mm256_castps128_ps256(t4);
}

static gmx_inline void
gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                     const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
{
    __m256  t1, t2, t3, t4, t5, t6, t7, t8;
    __m128i mask = _mm_set_epi32(0, -1, -1, -1);

    t1 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE, mask), gmx_mm_maskload_ps(ptrA, mask)); /* - zE yE xE | - zA yA xA */
    t2 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF, mask), gmx_mm_maskload_ps(ptrB, mask)); /* - zF yF xF | - zB yB xB */
    t3 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG, mask), gmx_mm_maskload_ps(ptrC, mask)); /* - zG yG xG | - zC yC xC */
    t4 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH, mask), gmx_mm_maskload_ps(ptrD, mask)); /* - zH yH xH | - zD yD xD */

    t5 = _mm256_unpacklo_ps(t1, t2); /* yF yE xF xE | yB yA xB xA */
    t6 = _mm256_unpacklo_ps(t3, t4); /* yH yG xH xG | yD yC xD xC */
    t7 = _mm256_unpackhi_ps(t1, t2); /* -  -  zF zE | -  -  zB zA */
    t8 = _mm256_unpackhi_ps(t3, t4); /* -  -  zH zG | -  -  zD zC */

    *x1 = _mm256_shuffle_ps(t5, t6, _MM_SHUFFLE(1, 0, 1, 0));
    *y1 = _mm256_shuffle_ps(t5, t6, _MM_SHUFFLE(3, 2, 3, 2));
    *z1 = _mm256_shuffle_ps(t7, t8, _MM_SHUFFLE(1, 0, 1, 0));
}

static gmx_inline void
gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                     const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
{
    __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;

    t1 = _mm256_loadu_ps(ptrA); /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t2 = _mm256_loadu_ps(ptrB); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t3 = _mm256_loadu_ps(ptrC); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t4 = _mm256_loadu_ps(ptrD); /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t5 = _mm256_loadu_ps(ptrE); /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t6 = _mm256_loadu_ps(ptrF); /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t7 = _mm256_loadu_ps(ptrG); /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t8 = _mm256_loadu_ps(ptrH); /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    t9  = _mm256_unpacklo_ps(t1, t2); /* z2b z2a y2b y2a | y1b y1a x1b x1a */
    t10 = _mm256_unpackhi_ps(t1, t2); /* y3b y3a x3b x3a | x2b x2a z1b z1a */
    t11 = _mm256_unpacklo_ps(t3, t4); /* z2d z2c y2d y2c | y1d y1c x1d x1c */
    t12 = _mm256_unpackhi_ps(t3, t4); /* y3d y3c x3d x3c | x2d x2c z1d z1c */
    t1  = _mm256_unpacklo_ps(t5, t6); /* z2f z2e y2f y2e | y1f y1e x1f x1e */
    t2  = _mm256_unpackhi_ps(t5, t6); /* y3f y3e x3f x3e | x2f x2e z1f z1e */
    t3  = _mm256_unpacklo_ps(t7, t8); /* z2h z2g y2h y2g | y1h y1g x1h x1g */
    t4  = _mm256_unpackhi_ps(t7, t8); /* y3h y3g x3h x3g | x2h x2g z1h z1g */

    t5 = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(1, 0, 1, 0));   /* y2d y2c y2b y2a | x1d x1c x1b x1a */
    t6 = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(3, 2, 3, 2));   /* z2d z2c z2b z2a | y1d y1c y1b y1a */
    t7 = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(1, 0, 1, 0));  /* x3d x3c x3b x3a | z1d z1c z1b z1a */
    t8 = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(3, 2, 3, 2));  /* y3d y3c y3b y3a | x2d x2c x2b x2a */

    t9  = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));   /* y2h y2g y2f y2e | x1h x1g x1f x1e */
    t10 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));   /* z2h z2g z2f z2e | y1h y1g y1f y1e */
    t11 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0));   /* x3h x3g x3f x3e | z1h z1g z1f z1e */
    t12 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2));   /* y3h y3g y3f y3e | x2h x2g x2f x2e */

    *x1 = _mm256_permute2f128_ps(t5, t9, 0x20);
    *y1 = _mm256_permute2f128_ps(t6, t10, 0x20);
    *z1 = _mm256_permute2f128_ps(t7, t11, 0x20);
    *x2 = _mm256_permute2f128_ps(t8, t12, 0x20);

    *y2 = _mm256_permute2f128_ps(t5, t9, 0x31);
    *z2 = _mm256_permute2f128_ps(t6, t10, 0x31);
    *x3 = _mm256_permute2f128_ps(t7, t11, 0x31);
    *y3 = _mm256_permute2f128_ps(t8, t12, 0x31);

    t1 = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8));
    t2 = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8));
    t3 = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8));
    t4 = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8));

    t1 = _mm256_unpacklo_ps(t1, t3); /* - - z3g z3e | - - z3c z3a */
    t2 = _mm256_unpacklo_ps(t2, t4); /* - - z3h z3f | - - z3d z3b */

    *z3 = _mm256_unpacklo_ps(t1, t2);
}

static gmx_inline void
gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                     const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
                                     __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
{
    __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;

    t1 = _mm256_loadu_ps(ptrA); /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t2 = _mm256_loadu_ps(ptrB); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t3 = _mm256_loadu_ps(ptrC); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t4 = _mm256_loadu_ps(ptrD); /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t5 = _mm256_loadu_ps(ptrE); /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t6 = _mm256_loadu_ps(ptrF); /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t7 = _mm256_loadu_ps(ptrG); /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t8 = _mm256_loadu_ps(ptrH); /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    t9  = _mm256_unpacklo_ps(t1, t2); /* z2b z2a y2b y2a | y1b y1a x1b x1a */
    t10 = _mm256_unpackhi_ps(t1, t2); /* y3b y3a x3b x3a | x2b x2a z1b z1a */
    t11 = _mm256_unpacklo_ps(t3, t4); /* z2d z2c y2d y2c | y1d y1c x1d x1c */
    t12 = _mm256_unpackhi_ps(t3, t4); /* y3d y3c x3d x3c | x2d x2c z1d z1c */
    t1  = _mm256_unpacklo_ps(t5, t6); /* z2f z2e y2f y2e | y1f y1e x1f x1e */
    t2  = _mm256_unpackhi_ps(t5, t6); /* y3f y3e x3f x3e | x2f x2e z1f z1e */
    t3  = _mm256_unpacklo_ps(t7, t8); /* z2h z2g y2h y2g | y1h y1g x1h x1g */
    t4  = _mm256_unpackhi_ps(t7, t8); /* y3h y3g x3h x3g | x2h x2g z1h z1g */

    t5  = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(1, 0, 1, 0));  /* y2d y2c y2b y2a | x1d x1c x1b x1a */
    t6  = _mm256_shuffle_ps(t9, t11, _MM_SHUFFLE(3, 2, 3, 2));  /* z2d z2c z2b z2a | y1d y1c y1b y1a */
    t7  = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(1, 0, 1, 0)); /* x3d x3c x3b x3a | z1d z1c z1b z1a */
    t8  = _mm256_shuffle_ps(t10, t12, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d y3c y3b y3a | x2d x2c x2b x2a */
    t9  = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));   /* y2h y2g y2f y2e | x1h x1g x1f x1e */
    t10 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));   /* z2h z2g z2f z2e | y1h y1g y1f y1e */
    t11 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0));   /* x3h x3g x3f x3e | z1h z1g z1f z1e */
    t12 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2));   /* y3h y3g y3f y3e | x2h x2g x2f x2e */

    *x1 = _mm256_permute2f128_ps(t5, t9, 0x20);
    *y1 = _mm256_permute2f128_ps(t6, t10, 0x20);
    *z1 = _mm256_permute2f128_ps(t7, t11, 0x20);
    *x2 = _mm256_permute2f128_ps(t8, t12, 0x20);

    *y2 = _mm256_permute2f128_ps(t5, t9, 0x31);
    *z2 = _mm256_permute2f128_ps(t6, t10, 0x31);
    *x3 = _mm256_permute2f128_ps(t7, t11, 0x31);
    *y3 = _mm256_permute2f128_ps(t8, t12, 0x31);

    t1 = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8)); /* z4e y4e x4e z3e | z4a y4a x4a z3a */
    t2 = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8)); /* z4f y4f x4f z3f | z4b y4b x4b z3b */
    t3 = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8)); /* z4g y4g x4g z3g | z4c y4c x4c z3c */
    t4 = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8)); /* z4h y4h x4h z3h | z4d y4d x4d z3d */

    t5 = _mm256_unpacklo_ps(t1, t2); /* x4f x4e z3f z3e | x4b x4a z3b z3a */
    t6 = _mm256_unpackhi_ps(t1, t2); /* z4f z4e y4f y4e | z4b z4a y4b y4a */
    t7 = _mm256_unpacklo_ps(t3, t4); /* x4h x4g z3h z3g | x4d x4c z3d z3c */
    t8 = _mm256_unpackhi_ps(t3, t4); /* z4h z4g y4h y4g | z4d z4c y4d y4c */

    *z3 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); /* z3h z3g z3f z3e | z3d z3c z3b z3a */
    *x4 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); /* x4h x4g x4f x4e | x4d x4c x4b x4a */
    *y4 = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0)); /* y4h y4g y4f y4e | y4d y4c y4b y4a */
    *z4 = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2)); /* z4h z4g z4f z4e | z4d z4c z4b z4a */
}

static gmx_inline void
gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          __m256 x1, __m256 y1, __m256 z1)
{
    __m128  t1, t2, t3, t4, t5, t6, t7, t8;
    __m128i mask;

    /* Construct a mask without executing any data loads */
    mask = _mm_blend_epi16(_mm_setzero_si128(), _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()), 0x3F);
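    /* _mm_cmpeq_epi16(0,0) gives all-ones; blending its lowest six 16-bit
     * words over zero yields the element mask {-1,-1,-1,0}, so the masked
     * loads/stores below touch only x, y and z and never the fourth float.
     */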

    t3 = _mm_unpacklo_ps(_mm256_castps256_ps128(x1), _mm256_castps256_ps128(y1)); /* y1b x1b y1a x1a */
    t4 = _mm_unpackhi_ps(_mm256_castps256_ps128(x1), _mm256_castps256_ps128(y1)); /* y1d x1d y1c x1c */

    t1 = _mm_shuffle_ps(t3, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 0, 1, 0)); /* - z1a y1a x1a */
    t2 = _mm_shuffle_ps(t3, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 1, 3, 2)); /* - z1b y1b x1b */
    t3 = _mm_shuffle_ps(t4, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 2, 1, 0)); /* - z1c y1c x1c */
    t4 = _mm_shuffle_ps(t4, _mm256_castps256_ps128(z1), _MM_SHUFFLE(0, 3, 3, 2)); /* - z1d y1d x1d */

    t5 = gmx_mm_maskload_ps(ptrA, mask);
    t6 = gmx_mm_maskload_ps(ptrB, mask);
    t7 = gmx_mm_maskload_ps(ptrC, mask);
    t8 = gmx_mm_maskload_ps(ptrD, mask);

    t5 = _mm_sub_ps(t5, t1);
    t6 = _mm_sub_ps(t6, t2);
    t7 = _mm_sub_ps(t7, t3);
    t8 = _mm_sub_ps(t8, t4);

    gmx_mm_maskstore_ps(ptrA, mask, t5);
    gmx_mm_maskstore_ps(ptrB, mask, t6);
    gmx_mm_maskstore_ps(ptrC, mask, t7);
    gmx_mm_maskstore_ps(ptrD, mask, t8);
}

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                                  x1, y1, z1, x2, y2, z2, x3, y3, z3) \
{ \
    __m256 _t1, _t2, _t3, _t4, _t5, _t6; \
    __m128 _tA, _tB, _tC, _tD; \
    _t1 = _mm256_loadu_ps(ptrA); \
    _t2 = _mm256_loadu_ps(ptrB); \
    _t3 = _mm256_loadu_ps(ptrC); \
    _t4 = _mm256_loadu_ps(ptrD); \
    _tA = _mm_load_ss(ptrA+8); \
    _tB = _mm_load_ss(ptrB+8); \
    _tC = _mm_load_ss(ptrC+8); \
    _tD = _mm_load_ss(ptrD+8); \
    _t5 = _mm256_unpacklo_ps(x1, y1); \
    x1  = _mm256_unpackhi_ps(x1, y1); \
    y1  = _mm256_unpacklo_ps(z1, x2); \
    z1  = _mm256_unpackhi_ps(z1, x2); \
    x2  = _mm256_unpacklo_ps(y2, z2); \
    y2  = _mm256_unpackhi_ps(y2, z2); \
    _t6 = _mm256_unpacklo_ps(x3, y3); \
    x3  = _mm256_unpackhi_ps(x3, y3); \
    _t5 = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
    x1  = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
    y1  = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1); \
    z1  = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
    z2  = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t5 = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
    y1  = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
    x1  = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
    _t1 = _mm256_sub_ps(_t1, z2); \
    _t2 = _mm256_sub_ps(_t2, _t5); \
    _t3 = _mm256_sub_ps(_t3, y1); \
    _t4 = _mm256_sub_ps(_t4, x1); \
    _tA = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3)); \
    _tB = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(1, 1, 1, 1))); \
    _tC = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(2, 2, 2, 2))); \
    _tD = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(3, 3, 3, 3))); \
    _mm256_storeu_ps(ptrA, _t1); \
    _mm256_storeu_ps(ptrB, _t2); \
    _mm256_storeu_ps(ptrC, _t3); \
    _mm256_storeu_ps(ptrD, _t4); \
    _mm_store_ss(ptrA+8, _tA); \
    _mm_store_ss(ptrB+8, _tB); \
    _mm_store_ss(ptrC+8, _tC); \
    _mm_store_ss(ptrD+8, _tD); \
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          __m256 x1, __m256 y1, __m256 z1,
                                          __m256 x2, __m256 y2, __m256 z2,
                                          __m256 x3, __m256 y3, __m256 z3)
{
    __m256 t1, t2, t3, t4, t5, t6;
    __m128 tA, tB, tC, tD;

    t1 = _mm256_loadu_ps(ptrA);
    t2 = _mm256_loadu_ps(ptrB);
    t3 = _mm256_loadu_ps(ptrC);
    t4 = _mm256_loadu_ps(ptrD);
    tA = _mm_load_ss(ptrA+8);
    tB = _mm_load_ss(ptrB+8);
    tC = _mm_load_ss(ptrC+8);
    tD = _mm_load_ss(ptrD+8);

    t5 = _mm256_unpacklo_ps(x1, y1); /* - - - - | y1b x1b y1a x1a */
    x1 = _mm256_unpackhi_ps(x1, y1); /* - - - - | y1d x1d y1c x1c */
    y1 = _mm256_unpacklo_ps(z1, x2); /* - - - - | x2b z1b x2a z1a */
    z1 = _mm256_unpackhi_ps(z1, x2); /* - - - - | x2d z1d x2c z1c */

    x2 = _mm256_unpacklo_ps(y2, z2); /* - - - - | z2b y2b z2a y2a */
    y2 = _mm256_unpackhi_ps(y2, z2); /* - - - - | z2d y2d z2c y2c */
    t6 = _mm256_unpacklo_ps(x3, y3); /* - - - - | y3b x3b y3a x3a */
    x3 = _mm256_unpackhi_ps(x3, y3); /* - - - - | y3d x3d y3c x3c */

    t5 = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1); /* z2b y2b z2a y2a | y1b x1b y1a x1a */
    x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); /* z2d y2d z2c y2c | y1d x1d y1c x1c */

    y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(t6), 0x1); /* y3b x3b y3a x3a | x2b z1b x2a z1a */
    z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); /* y3d x3d y3c x3c | x2d z1d x2c z1c */

    z2 = _mm256_shuffle_ps(t5, y1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t5 = _mm256_shuffle_ps(t5, y1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    y1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    x1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d x3d z2d y2d | x2d z1d y1d x1d */

    t1 = _mm256_sub_ps(t1, z2);
    t2 = _mm256_sub_ps(t2, t5);
    t3 = _mm256_sub_ps(t3, y1);
    t4 = _mm256_sub_ps(t4, x1);

    tA = _mm_sub_ss(tA, _mm256_castps256_ps128(z3));
    tB = _mm_sub_ss(tB, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(1, 1, 1, 1)));
    tC = _mm_sub_ss(tC, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(2, 2, 2, 2)));
    tD = _mm_sub_ss(tD, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(3, 3, 3, 3)));

    /* Here we store a full 256-bit value and a separate 32-bit one; no overlap can happen */
    _mm256_storeu_ps(ptrA, t1);
    _mm256_storeu_ps(ptrB, t2);
    _mm256_storeu_ps(ptrC, t3);
    _mm256_storeu_ps(ptrD, t4);
    _mm_store_ss(ptrA+8, tA);
    _mm_store_ss(ptrB+8, tB);
    _mm_store_ss(ptrC+8, tC);
    _mm_store_ss(ptrD+8, tD);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                                  x1, y1, z1, x2, y2, z2, x3, y3, z3, x4, y4, z4) \
{ \
    __m256 _t1, _t2, _t3, _t4, _t5; \
    __m128 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH; \
    _t1 = _mm256_loadu_ps(ptrA); \
    _t2 = _mm256_loadu_ps(ptrB); \
    _t3 = _mm256_loadu_ps(ptrC); \
    _t4 = _mm256_loadu_ps(ptrD); \
    _tA = _mm_loadu_ps(ptrA+8); \
    _tB = _mm_loadu_ps(ptrB+8); \
    _tC = _mm_loadu_ps(ptrC+8); \
    _tD = _mm_loadu_ps(ptrD+8); \
    _t5 = _mm256_unpacklo_ps(x1, y1); \
    x1  = _mm256_unpackhi_ps(x1, y1); \
    y1  = _mm256_unpacklo_ps(z1, x2); \
    z1  = _mm256_unpackhi_ps(z1, x2); \
    x2  = _mm256_unpacklo_ps(y2, z2); \
    y2  = _mm256_unpackhi_ps(y2, z2); \
    z2  = _mm256_unpacklo_ps(x3, y3); \
    x3  = _mm256_unpackhi_ps(x3, y3); \
    y3  = _mm256_unpacklo_ps(z3, x4); \
    z3  = _mm256_unpackhi_ps(z3, x4); \
    x4  = _mm256_unpacklo_ps(y4, z4); \
    y4  = _mm256_unpackhi_ps(y4, z4); \
    x2  = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
    x1  = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
    y1  = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1); \
    z1  = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
    z2  = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t5 = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
    y1  = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
    x1  = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
    _tE = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(1, 0, 1, 0)); \
    _tF = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(3, 2, 3, 2)); \
    _tG = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(1, 0, 1, 0)); \
    _tH = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(3, 2, 3, 2)); \
    _t1 = _mm256_sub_ps(_t1, z2); \
    _t2 = _mm256_sub_ps(_t2, _t5); \
    _t3 = _mm256_sub_ps(_t3, y1); \
    _t4 = _mm256_sub_ps(_t4, x1); \
    _tA = _mm_sub_ps(_tA, _tE); \
    _tB = _mm_sub_ps(_tB, _tF); \
    _tC = _mm_sub_ps(_tC, _tG); \
    _tD = _mm_sub_ps(_tD, _tH); \
    _mm256_storeu_ps(ptrA, _t1); \
    _mm256_storeu_ps(ptrB, _t2); \
    _mm256_storeu_ps(ptrC, _t3); \
    _mm256_storeu_ps(ptrD, _t4); \
    _mm_storeu_ps(ptrA+8, _tA); \
    _mm_storeu_ps(ptrB+8, _tB); \
    _mm_storeu_ps(ptrC+8, _tC); \
    _mm_storeu_ps(ptrD+8, _tD); \
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          __m256 x1, __m256 y1, __m256 z1,
                                          __m256 x2, __m256 y2, __m256 z2,
                                          __m256 x3, __m256 y3, __m256 z3,
                                          __m256 x4, __m256 y4, __m256 z4)
{
    __m256 t1, t2, t3, t4, t5;
    __m128 tA, tB, tC, tD, tE, tF, tG, tH;

    t1 = _mm256_loadu_ps(ptrA);
    t2 = _mm256_loadu_ps(ptrB);
    t3 = _mm256_loadu_ps(ptrC);
    t4 = _mm256_loadu_ps(ptrD);
    tA = _mm_loadu_ps(ptrA+8);
    tB = _mm_loadu_ps(ptrB+8);
    tC = _mm_loadu_ps(ptrC+8);
    tD = _mm_loadu_ps(ptrD+8);

    t5 = _mm256_unpacklo_ps(x1, y1); /* - - - - | y1b x1b y1a x1a */
    x1 = _mm256_unpackhi_ps(x1, y1); /* - - - - | y1d x1d y1c x1c */
    y1 = _mm256_unpacklo_ps(z1, x2); /* - - - - | x2b z1b x2a z1a */
    z1 = _mm256_unpackhi_ps(z1, x2); /* - - - - | x2d z1d x2c z1c */

    x2 = _mm256_unpacklo_ps(y2, z2); /* - - - - | z2b y2b z2a y2a */
    y2 = _mm256_unpackhi_ps(y2, z2); /* - - - - | z2d y2d z2c y2c */
    z2 = _mm256_unpacklo_ps(x3, y3); /* - - - - | y3b x3b y3a x3a */
    x3 = _mm256_unpackhi_ps(x3, y3); /* - - - - | y3d x3d y3c x3c */

    y3 = _mm256_unpacklo_ps(z3, x4); /* - - - - | x4b z3b x4a z3a */
    z3 = _mm256_unpackhi_ps(z3, x4); /* - - - - | x4d z3d x4c z3c */
    x4 = _mm256_unpacklo_ps(y4, z4); /* - - - - | z4b y4b z4a y4a */
    y4 = _mm256_unpackhi_ps(y4, z4); /* - - - - | z4d y4d z4c y4c */

    x2 = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1); /* z2b y2b z2a y2a | y1b x1b y1a x1a */
    x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); /* z2d y2d z2c y2c | y1d x1d y1c x1c */
    y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1); /* y3b x3b y3a x3a | x2b z1b x2a z1a */
    z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); /* y3d x3d y3c x3c | x2d z1d x2c z1c */

    z2 = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t5 = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    y1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    x1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); /* y3d x3d z2d y2d | x2d z1d y1d x1d */

    tE = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(1, 0, 1, 0)); /* z4a y4a x4a z3a */
    tF = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(3, 2, 3, 2)); /* z4b y4b x4b z3b */

    tG = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(1, 0, 1, 0)); /* z4c y4c x4c z3c */
    tH = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(3, 2, 3, 2)); /* z4d y4d x4d z3d */

    t1 = _mm256_sub_ps(t1, z2);
    t2 = _mm256_sub_ps(t2, t5);
    t3 = _mm256_sub_ps(t3, y1);
    t4 = _mm256_sub_ps(t4, x1);

    tA = _mm_sub_ps(tA, tE);
    tB = _mm_sub_ps(tB, tF);
    tC = _mm_sub_ps(tC, tG);
    tD = _mm_sub_ps(tD, tH);

    /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
    _mm256_storeu_ps(ptrA, t1);
    _mm256_storeu_ps(ptrB, t2);
    _mm256_storeu_ps(ptrC, t3);
    _mm256_storeu_ps(ptrD, t4);
    _mm_storeu_ps(ptrA+8, tA);
    _mm_storeu_ps(ptrB+8, tB);
    _mm_storeu_ps(ptrC+8, tC);
    _mm_storeu_ps(ptrD+8, tD);
}
#endif

static gmx_inline void
gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
                                          __m256 x1, __m256 y1, __m256 z1)
{
    __m256  t1, t2, t3, t4, t5, t6;
    __m256  tA, tB, tC, tD;
    __m128i mask;

    /* Construct a mask without executing any data loads */
    mask = _mm_blend_epi16(_mm_setzero_si128(), _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()), 0x3F);

    tA = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE, mask), gmx_mm_maskload_ps(ptrA, mask));
    tB = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF, mask), gmx_mm_maskload_ps(ptrB, mask));
    tC = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG, mask), gmx_mm_maskload_ps(ptrC, mask));
    tD = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH, mask), gmx_mm_maskload_ps(ptrD, mask));
    t1 = _mm256_unpacklo_ps(x1, y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
    t2 = _mm256_unpackhi_ps(x1, y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */

    t3 = _mm256_shuffle_ps(t1, z1, _MM_SHUFFLE(0, 0, 1, 0)); /* - z1e y1e x1e | - z1a y1a x1a */
    t4 = _mm256_shuffle_ps(t1, z1, _MM_SHUFFLE(0, 1, 3, 2)); /* - z1f y1f x1f | - z1b y1b x1b */
    t5 = _mm256_shuffle_ps(t2, z1, _MM_SHUFFLE(0, 2, 1, 0)); /* - z1g y1g x1g | - z1c y1c x1c */
    t6 = _mm256_shuffle_ps(t2, z1, _MM_SHUFFLE(0, 3, 3, 2)); /* - z1h y1h x1h | - z1d y1d x1d */

    tA = _mm256_sub_ps(tA, t3);
    tB = _mm256_sub_ps(tB, t4);
    tC = _mm256_sub_ps(tC, t5);
    tD = _mm256_sub_ps(tD, t6);

    gmx_mm_maskstore_ps(ptrA, mask, _mm256_castps256_ps128(tA));
    gmx_mm_maskstore_ps(ptrB, mask, _mm256_castps256_ps128(tB));
    gmx_mm_maskstore_ps(ptrC, mask, _mm256_castps256_ps128(tC));
    gmx_mm_maskstore_ps(ptrD, mask, _mm256_castps256_ps128(tD));
    gmx_mm_maskstore_ps(ptrE, mask, _mm256_extractf128_ps(tA, 0x1));
    gmx_mm_maskstore_ps(ptrF, mask, _mm256_extractf128_ps(tB, 0x1));
    gmx_mm_maskstore_ps(ptrG, mask, _mm256_extractf128_ps(tC, 0x1));
    gmx_mm_maskstore_ps(ptrH, mask, _mm256_extractf128_ps(tD, 0x1));
}

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
{ \
    __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
    __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
    _tA = _mm256_loadu_ps(ptrA); \
    _tB = _mm256_loadu_ps(ptrB); \
    _tC = _mm256_loadu_ps(ptrC); \
    _tD = _mm256_loadu_ps(ptrD); \
    _tE = _mm256_loadu_ps(ptrE); \
    _tF = _mm256_loadu_ps(ptrF); \
    _tG = _mm256_loadu_ps(ptrG); \
    _tH = _mm256_loadu_ps(ptrH); \
    _t1 = _mm256_unpacklo_ps(_x1, _y1); \
    _t2 = _mm256_unpackhi_ps(_x1, _y1); \
    _t3 = _mm256_unpacklo_ps(_z1, _x2); \
    _t4 = _mm256_unpackhi_ps(_z1, _x2); \
    _t5 = _mm256_unpacklo_ps(_y2, _z2); \
    _t6 = _mm256_unpackhi_ps(_y2, _z2); \
    _t7 = _mm256_unpacklo_ps(_x3, _y3); \
    _t8 = _mm256_unpackhi_ps(_x3, _y3); \
    _t9 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t10 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
    _t11 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t12 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
    _t1 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t2 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
    _t3 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t4 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
    _t5 = gmx_mm256_unpack128lo_ps(_t9, _t1); \
    _t6 = gmx_mm256_unpack128hi_ps(_t9, _t1); \
    _t7 = gmx_mm256_unpack128lo_ps(_t10, _t2); \
    _t8 = gmx_mm256_unpack128hi_ps(_t10, _t2); \
    _t1 = gmx_mm256_unpack128lo_ps(_t11, _t3); \
    _t2 = gmx_mm256_unpack128hi_ps(_t11, _t3); \
    _t9 = gmx_mm256_unpack128lo_ps(_t12, _t4); \
    _t10 = gmx_mm256_unpack128hi_ps(_t12, _t4); \
    _tA = _mm256_sub_ps(_tA, _t5); \
    _tB = _mm256_sub_ps(_tB, _t7); \
    _tC = _mm256_sub_ps(_tC, _t1); \
    _tD = _mm256_sub_ps(_tD, _t9); \
    _tE = _mm256_sub_ps(_tE, _t6); \
    _tF = _mm256_sub_ps(_tF, _t8); \
    _tG = _mm256_sub_ps(_tG, _t2); \
    _tH = _mm256_sub_ps(_tH, _t10); \
    _mm256_storeu_ps(ptrA, _tA); \
    _mm256_storeu_ps(ptrB, _tB); \
    _mm256_storeu_ps(ptrC, _tC); \
    _mm256_storeu_ps(ptrD, _tD); \
    _mm256_storeu_ps(ptrE, _tE); \
    _mm256_storeu_ps(ptrF, _tF); \
    _mm256_storeu_ps(ptrG, _tG); \
    _mm256_storeu_ps(ptrH, _tH); \
    _tI = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8)); \
    _tJ = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8)); \
    _tK = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8)); \
    _tL = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8)); \
    _tI = _mm256_unpacklo_ps(_tI, _tK); \
    _tJ = _mm256_unpacklo_ps(_tJ, _tL); \
    _tI = _mm256_unpacklo_ps(_tI, _tJ); \
    _tI = _mm256_sub_ps(_tI, _z3); \
    _tJ = _mm256_permute_ps(_tI, _MM_SHUFFLE(1, 1, 1, 1)); \
    _tK = _mm256_permute_ps(_tI, _MM_SHUFFLE(2, 2, 2, 2)); \
    _tL = _mm256_permute_ps(_tI, _MM_SHUFFLE(3, 3, 3, 3)); \
    _mm_store_ss(ptrA+8, _mm256_castps256_ps128(_tI)); \
    _mm_store_ss(ptrB+8, _mm256_castps256_ps128(_tJ)); \
    _mm_store_ss(ptrC+8, _mm256_castps256_ps128(_tK)); \
    _mm_store_ss(ptrD+8, _mm256_castps256_ps128(_tL)); \
    _mm_store_ss(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
    _mm_store_ss(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
    _mm_store_ss(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
    _mm_store_ss(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
                                          __m256 x1, __m256 y1, __m256 z1,
                                          __m256 x2, __m256 y2, __m256 z2,
                                          __m256 x3, __m256 y3, __m256 z3)
{
    __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m256 tA, tB, tC, tD, tE, tF, tG, tH;
    __m256 tI, tJ, tK, tL;

    tA = _mm256_loadu_ps(ptrA);
    tB = _mm256_loadu_ps(ptrB);
    tC = _mm256_loadu_ps(ptrC);
    tD = _mm256_loadu_ps(ptrD);
    tE = _mm256_loadu_ps(ptrE);
    tF = _mm256_loadu_ps(ptrF);
    tG = _mm256_loadu_ps(ptrG);
    tH = _mm256_loadu_ps(ptrH);

    t1 = _mm256_unpacklo_ps(x1, y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
    t2 = _mm256_unpackhi_ps(x1, y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
    t3 = _mm256_unpacklo_ps(z1, x2); /* x2f z1f x2e z1e | x2b z1b x2a z1a */
    t4 = _mm256_unpackhi_ps(z1, x2); /* x2h z1h x2g z1g | x2d z1d x2c z1c */

    t5 = _mm256_unpacklo_ps(y2, z2); /* z2f y2f z2e y2e | z2b y2b z2a y2a */
    t6 = _mm256_unpackhi_ps(y2, z2); /* z2h y2h z2g y2g | z2d y2d z2c y2c */
    t7 = _mm256_unpacklo_ps(x3, y3); /* y3f x3f y3e x3e | y3b x3b y3a x3a */
    t8 = _mm256_unpackhi_ps(x3, y3); /* y3h x3h y3g x3g | y3d x3d y3c x3c */

    t9  = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
    t10 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
    t11 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
    t12 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */

    t1 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
    t2 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
    t3 = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
    t4 = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */

    t5  = gmx_mm256_unpack128lo_ps(t9, t1);  /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t6  = gmx_mm256_unpack128hi_ps(t9, t1);  /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t7  = gmx_mm256_unpack128lo_ps(t10, t2); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t8  = gmx_mm256_unpack128hi_ps(t10, t2); /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t1  = gmx_mm256_unpack128lo_ps(t11, t3); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t2  = gmx_mm256_unpack128hi_ps(t11, t3); /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t9  = gmx_mm256_unpack128lo_ps(t12, t4); /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t10 = gmx_mm256_unpack128hi_ps(t12, t4); /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    tA = _mm256_sub_ps(tA, t5);
    tB = _mm256_sub_ps(tB, t7);
    tC = _mm256_sub_ps(tC, t1);
    tD = _mm256_sub_ps(tD, t9);
    tE = _mm256_sub_ps(tE, t6);
    tF = _mm256_sub_ps(tF, t8);
    tG = _mm256_sub_ps(tG, t2);
    tH = _mm256_sub_ps(tH, t10);

    _mm256_storeu_ps(ptrA, tA);
    _mm256_storeu_ps(ptrB, tB);
    _mm256_storeu_ps(ptrC, tC);
    _mm256_storeu_ps(ptrD, tD);
    _mm256_storeu_ps(ptrE, tE);
    _mm256_storeu_ps(ptrF, tF);
    _mm256_storeu_ps(ptrG, tG);
    _mm256_storeu_ps(ptrH, tH);

    tI = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8));
    tJ = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8));
    tK = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8));
    tL = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8));

    tI = _mm256_unpacklo_ps(tI, tK); /* - - zG zE | - - zC zA */
    tJ = _mm256_unpacklo_ps(tJ, tL); /* - - zH zF | - - zD zB */
    tI = _mm256_unpacklo_ps(tI, tJ); /* zH zG zF zE | zD zC zB zA */

    tI = _mm256_sub_ps(tI, z3);
    tJ = _mm256_permute_ps(tI, _MM_SHUFFLE(1, 1, 1, 1));
    tK = _mm256_permute_ps(tI, _MM_SHUFFLE(2, 2, 2, 2));
    tL = _mm256_permute_ps(tI, _MM_SHUFFLE(3, 3, 3, 3));

    _mm_store_ss(ptrA+8, _mm256_castps256_ps128(tI));
    _mm_store_ss(ptrB+8, _mm256_castps256_ps128(tJ));
    _mm_store_ss(ptrC+8, _mm256_castps256_ps128(tK));
    _mm_store_ss(ptrD+8, _mm256_castps256_ps128(tL));
    _mm_store_ss(ptrE+8, _mm256_extractf128_ps(tI, 0x1));
    _mm_store_ss(ptrF+8, _mm256_extractf128_ps(tJ, 0x1));
    _mm_store_ss(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
    _mm_store_ss(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, \
                                                  _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
{ \
    __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
    __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
    _tA = _mm256_loadu_ps(ptrA); \
    _tB = _mm256_loadu_ps(ptrB); \
    _tC = _mm256_loadu_ps(ptrC); \
    _tD = _mm256_loadu_ps(ptrD); \
    _tE = _mm256_loadu_ps(ptrE); \
    _tF = _mm256_loadu_ps(ptrF); \
    _tG = _mm256_loadu_ps(ptrG); \
    _tH = _mm256_loadu_ps(ptrH); \
    _t1 = _mm256_unpacklo_ps(_x1, _y1); \
    _t2 = _mm256_unpackhi_ps(_x1, _y1); \
    _t3 = _mm256_unpacklo_ps(_z1, _x2); \
    _t4 = _mm256_unpackhi_ps(_z1, _x2); \
    _t5 = _mm256_unpacklo_ps(_y2, _z2); \
    _t6 = _mm256_unpackhi_ps(_y2, _z2); \
    _t7 = _mm256_unpacklo_ps(_x3, _y3); \
    _t8 = _mm256_unpackhi_ps(_x3, _y3); \
    _t9 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t10 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
    _t11 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t12 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
    _t1 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t2 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
    _t3 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t4 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
    _t5 = gmx_mm256_unpack128lo_ps(_t9, _t1); \
    _t6 = gmx_mm256_unpack128hi_ps(_t9, _t1); \
    _t7 = gmx_mm256_unpack128lo_ps(_t10, _t2); \
    _t8 = gmx_mm256_unpack128hi_ps(_t10, _t2); \
    _t1 = gmx_mm256_unpack128lo_ps(_t11, _t3); \
    _t2 = gmx_mm256_unpack128hi_ps(_t11, _t3); \
    _t9 = gmx_mm256_unpack128lo_ps(_t12, _t4); \
    _t10 = gmx_mm256_unpack128hi_ps(_t12, _t4); \
    _tA = _mm256_sub_ps(_tA, _t5); \
    _tB = _mm256_sub_ps(_tB, _t7); \
    _tC = _mm256_sub_ps(_tC, _t1); \
    _tD = _mm256_sub_ps(_tD, _t9); \
    _tE = _mm256_sub_ps(_tE, _t6); \
    _tF = _mm256_sub_ps(_tF, _t8); \
    _tG = _mm256_sub_ps(_tG, _t2); \
    _tH = _mm256_sub_ps(_tH, _t10); \
    _mm256_storeu_ps(ptrA, _tA); \
    _mm256_storeu_ps(ptrB, _tB); \
    _mm256_storeu_ps(ptrC, _tC); \
    _mm256_storeu_ps(ptrD, _tD); \
    _mm256_storeu_ps(ptrE, _tE); \
    _mm256_storeu_ps(ptrF, _tF); \
    _mm256_storeu_ps(ptrG, _tG); \
    _mm256_storeu_ps(ptrH, _tH); \
    _tI = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8)); \
    _tJ = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8)); \
    _tK = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8)); \
    _tL = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8)); \
    _t1 = _mm256_unpacklo_ps(_z3, _x4); \
    _t2 = _mm256_unpackhi_ps(_z3, _x4); \
    _t3 = _mm256_unpacklo_ps(_y4, _z4); \
    _t4 = _mm256_unpackhi_ps(_y4, _z4); \
    _t5 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t6 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
    _t7 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
    _t8 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
    _tI = _mm256_sub_ps(_tI, _t5); \
    _tJ = _mm256_sub_ps(_tJ, _t6); \
    _tK = _mm256_sub_ps(_tK, _t7); \
    _tL = _mm256_sub_ps(_tL, _t8); \
    _mm_storeu_ps(ptrA+8, _mm256_castps256_ps128(_tI)); \
    _mm_storeu_ps(ptrB+8, _mm256_castps256_ps128(_tJ)); \
    _mm_storeu_ps(ptrC+8, _mm256_castps256_ps128(_tK)); \
    _mm_storeu_ps(ptrD+8, _mm256_castps256_ps128(_tL)); \
    _mm_storeu_ps(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
    _mm_storeu_ps(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
    _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
    _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
                                          __m256 x1, __m256 y1, __m256 z1,
                                          __m256 x2, __m256 y2, __m256 z2,
                                          __m256 x3, __m256 y3, __m256 z3,
                                          __m256 x4, __m256 y4, __m256 z4)
{
    __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m256 tA, tB, tC, tD, tE, tF, tG, tH;
    __m256 tI, tJ, tK, tL;

    tA = _mm256_loadu_ps(ptrA);
    tB = _mm256_loadu_ps(ptrB);
    tC = _mm256_loadu_ps(ptrC);
    tD = _mm256_loadu_ps(ptrD);
    tE = _mm256_loadu_ps(ptrE);
    tF = _mm256_loadu_ps(ptrF);
    tG = _mm256_loadu_ps(ptrG);
    tH = _mm256_loadu_ps(ptrH);

    t1 = _mm256_unpacklo_ps(x1, y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
    t2 = _mm256_unpackhi_ps(x1, y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
    t3 = _mm256_unpacklo_ps(z1, x2); /* x2f z1f x2e z1e | x2b z1b x2a z1a */
    t4 = _mm256_unpackhi_ps(z1, x2); /* x2h z1h x2g z1g | x2d z1d x2c z1c */

    t5 = _mm256_unpacklo_ps(y2, z2); /* z2f y2f z2e y2e | z2b y2b z2a y2a */
    t6 = _mm256_unpackhi_ps(y2, z2); /* z2h y2h z2g y2g | z2d y2d z2c y2c */
    t7 = _mm256_unpacklo_ps(x3, y3); /* y3f x3f y3e x3e | y3b x3b y3a x3a */
    t8 = _mm256_unpackhi_ps(x3, y3); /* y3h x3h y3g x3g | y3d x3d y3c x3c */

    t9  = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
    t10 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
    t11 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
    t12 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */

    t1 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
    t2 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
    t3 = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(1, 0, 1, 0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
    t4 = _mm256_shuffle_ps(t6, t8, _MM_SHUFFLE(3, 2, 3, 2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */

    t5  = gmx_mm256_unpack128lo_ps(t9, t1);  /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t6  = gmx_mm256_unpack128hi_ps(t9, t1);  /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t7  = gmx_mm256_unpack128lo_ps(t10, t2); /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t8  = gmx_mm256_unpack128hi_ps(t10, t2); /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t1  = gmx_mm256_unpack128lo_ps(t11, t3); /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t2  = gmx_mm256_unpack128hi_ps(t11, t3); /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t9  = gmx_mm256_unpack128lo_ps(t12, t4); /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t10 = gmx_mm256_unpack128hi_ps(t12, t4); /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    tA = _mm256_sub_ps(tA, t5);
    tB = _mm256_sub_ps(tB, t7);
    tC = _mm256_sub_ps(tC, t1);
    tD = _mm256_sub_ps(tD, t9);
    tE = _mm256_sub_ps(tE, t6);
    tF = _mm256_sub_ps(tF, t8);
    tG = _mm256_sub_ps(tG, t2);
    tH = _mm256_sub_ps(tH, t10);

    _mm256_storeu_ps(ptrA, tA);
    _mm256_storeu_ps(ptrB, tB);
    _mm256_storeu_ps(ptrC, tC);
    _mm256_storeu_ps(ptrD, tD);
    _mm256_storeu_ps(ptrE, tE);
    _mm256_storeu_ps(ptrF, tF);
    _mm256_storeu_ps(ptrG, tG);
    _mm256_storeu_ps(ptrH, tH);

    tI = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8));
    tJ = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8));
    tK = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8));
    tL = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8));

    t1 = _mm256_unpacklo_ps(z3, x4); /* x4f z3f x4e z3e | x4b z3b x4a z3a */
    t2 = _mm256_unpackhi_ps(z3, x4); /* x4h z3h x4g z3g | x4d z3d x4c z3c */
    t3 = _mm256_unpacklo_ps(y4, z4); /* z4f y4f z4e y4e | z4b y4b z4a y4a */
    t4 = _mm256_unpackhi_ps(y4, z4); /* z4h y4h z4g y4g | z4d y4d z4c y4c */

    t5 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); /* z4e y4e x4e z3e | z4a y4a x4a z3a */
    t6 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); /* z4f y4f x4f z3f | z4b y4b x4b z3b */
    t7 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 0, 1, 0)); /* z4g y4g x4g z3g | z4c y4c x4c z3c */
    t8 = _mm256_shuffle_ps(t2, t4, _MM_SHUFFLE(3, 2, 3, 2)); /* z4h y4h x4h z3h | z4d y4d x4d z3d */

    tI = _mm256_sub_ps(tI, t5);
    tJ = _mm256_sub_ps(tJ, t6);
    tK = _mm256_sub_ps(tK, t7);
    tL = _mm256_sub_ps(tL, t8);

    _mm_storeu_ps(ptrA+8, _mm256_castps256_ps128(tI));
    _mm_storeu_ps(ptrB+8, _mm256_castps256_ps128(tJ));
    _mm_storeu_ps(ptrC+8, _mm256_castps256_ps128(tK));
    _mm_storeu_ps(ptrD+8, _mm256_castps256_ps128(tL));
    _mm_storeu_ps(ptrE+8, _mm256_extractf128_ps(tI, 0x1));
    _mm_storeu_ps(ptrF+8, _mm256_extractf128_ps(tJ, 0x1));
    _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
    _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
}
#endif

static gmx_inline void
gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                         float * gmx_restrict fptr,
                                         float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3;

    fix1 = _mm256_hadd_ps(fix1, fix1);
    fiy1 = _mm256_hadd_ps(fiy1, fiz1);
    fix1 = _mm256_hadd_ps(fix1, fiy1);   /* fiz1 fiy1 fix1 fix1 (in both lanes) */

    /* Add across the two lanes */
    t1 = _mm_add_ps(_mm256_castps256_ps128(fix1), _mm256_extractf128_ps(fix1, 0x1));

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2, (__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3, (__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2, t1);
    t3 = _mm_add_ps(t3, t1);

    _mm_store_ss(fptr, t2);
    _mm_storeh_pi((__m64 *)(fptr+1), t2);
    _mm_store_ss(fshiftptr, t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}
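
/* In scalar terms the routine above performs roughly (illustrative sketch):
 *
 *     fptr[0] += fix1[a] + fix1[b] + ... + fix1[h];
 *     fptr[1] += fiy1[a] + ... + fiy1[h];
 *     fptr[2] += fiz1[a] + ... + fiz1[h];
 *
 * and fshiftptr[0..2] are incremented by the same three sums.
 */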

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                                 fptr, fshiftptr) \
{ \
    __m256 _t1, _t2, _t3; \
    __m128 _tA, _tB, _tC; \
\
    fix1 = _mm256_hadd_ps(fix1, fiy1); \
    fiz1 = _mm256_hadd_ps(fiz1, fix2); \
    fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
    fix3 = _mm256_hadd_ps(fix3, fiy3); \
    fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
    fix1 = _mm256_hadd_ps(fix1, fiz1); \
    fiy2 = _mm256_hadd_ps(fiy2, fix3); \
    fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
\
    _t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
    _t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
    _t1 = _mm256_add_ps(_t1, _t2); \
    _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
    _t3 = _mm256_loadu_ps(fptr); \
    _t3 = _mm256_add_ps(_t3, _t1); \
    _mm256_storeu_ps(fptr, _t3); \
    _tB = _mm_load_ss(fptr+8); \
    _tB = _mm_add_ss(_tB, _tA); \
    _mm_store_ss(fptr+8, _tB); \
\
    _tB = _mm256_extractf128_ps(_t1, 0x1); \
    _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
    _tB = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
    _tC = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
    _tB = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
    _tA = _mm_add_ps(_tB, _tC); \
    _tA = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
    _tC = _mm_loadu_ps(fshiftptr); \
    _tC = _mm_add_ps(_tC, _tA); \
    _mm_storeu_ps(fshiftptr, _tC); \
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
                                         float * gmx_restrict fptr,
                                         float * gmx_restrict fshiftptr)
{
    __m256 t1, t2, t3;
    __m128 tA, tB, tC;

    fix1 = _mm256_hadd_ps(fix1, fiy1);                /* Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
    fiz1 = _mm256_hadd_ps(fiz1, fix2);                /* X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
    fiy2 = _mm256_hadd_ps(fiy2, fiz2);                /* Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
    fix3 = _mm256_hadd_ps(fix3, fiy3);                /* Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
    fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); /* 0 0 Z3g+Z3h Z3e+Z3f | 0 0 Z3c+Z3d Z3a+Z3b */

    fix1 = _mm256_hadd_ps(fix1, fiz1);                /* X2e-h Z1e-h Y1e-h X1e-h | X2a-d Z1a-d Y1a-d X1a-d */
    fiy2 = _mm256_hadd_ps(fiy2, fix3);                /* Y3e-h X3e-h Z2e-h Y2e-h | Y3a-d X3a-d Z2a-d Y2a-d */
    fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); /* 0 0 0 Z3e-h | 0 0 0 Z3a-d */

    /* Add across the two lanes by swapping and adding back */
    t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2);        /* Y3a-d X3a-d Z2a-d Y2a-d | X2a-d Z1a-d Y1a-d X1a-d */
    t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2);        /* Y3e-h X3e-h Z2e-h Y2e-h | X2e-h Z1e-h Y1e-h X1e-h */
    t1 = _mm256_add_ps(t1, t2);                       /* y3 x3 z2 y2 | x2 z1 y1 x1 */

    tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); /* 0 0 0 z3 */

    t3 = _mm256_loadu_ps(fptr);
    t3 = _mm256_add_ps(t3, t1);
    _mm256_storeu_ps(fptr, t3);
    tB = _mm_load_ss(fptr+8);
    tB = _mm_add_ss(tB, tA);
    _mm_store_ss(fptr+8, tB);

    /* Add up shift force */
    tB = _mm256_extractf128_ps(t1, 0x1);                                          /* y3 x3 z2 y2 */
    tC = _mm_shuffle_ps(_mm256_castps256_ps128(t1), tB, _MM_SHUFFLE(1, 0, 3, 3)); /* z2 y2 x2 x2 */
    tB = _mm_shuffle_ps(tB, tA, _MM_SHUFFLE(1, 0, 3, 2));                         /* 0 z3 y3 x3 */
    tC = _mm_permute_ps(tC, _MM_SHUFFLE(3, 3, 2, 0));                             /* - z2 y2 x2 */

    tB = _mm_add_ps(tB, _mm256_castps256_ps128(t1));
    tA = _mm_add_ps(tB, tC);                                                      /* - z y x */

    tA = _mm_blend_ps(_mm_setzero_ps(), tA, 0x7);                                 /* 0 z y x */

    tC = _mm_loadu_ps(fshiftptr);
    tC = _mm_add_ps(tC, tA);
    _mm_storeu_ps(fshiftptr, tC);
}

#endif
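
/* Illustrative scalar equivalent of the 3-atom update above: for each atom
 * i = 1..3 and dimension d = 0..2 (x,y,z), the eight per-element partial
 * forces are summed and added to fptr[3*(i-1) + d]; the sum over the three
 * atoms for each dimension is likewise added to fshiftptr[0..2].
 */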

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                                 fptr, fshiftptr) \
{ \
    __m256 _t1, _t2, _t3; \
    __m128 _tA, _tB, _tC; \
\
    fix1 = _mm256_hadd_ps(fix1, fiy1); \
    fiz1 = _mm256_hadd_ps(fiz1, fix2); \
    fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
    fix3 = _mm256_hadd_ps(fix3, fiy3); \
    fiz3 = _mm256_hadd_ps(fiz3, fix4); \
    fiy4 = _mm256_hadd_ps(fiy4, fiz4); \
\
    fix1 = _mm256_hadd_ps(fix1, fiz1); \
    fiy2 = _mm256_hadd_ps(fiy2, fix3); \
    fiz3 = _mm256_hadd_ps(fiz3, fiy4); \
\
    _t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
    _t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
    _t1 = _mm256_add_ps(_t1, _t2); \
    _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
    _t3 = _mm256_loadu_ps(fptr); \
    _t3 = _mm256_add_ps(_t3, _t1); \
    _mm256_storeu_ps(fptr, _t3); \
    _tB = _mm_loadu_ps(fptr+8); \
    _tB = _mm_add_ps(_tB, _tA); \
    _mm_storeu_ps(fptr+8, _tB); \
\
    _tB = _mm256_extractf128_ps(_t1, 0x1); \
    _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
    _tB = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
    _tC = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
    _tA = _mm_permute_ps(_tA, _MM_SHUFFLE(0, 3, 2, 1)); \
    _tB = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
    _tA = _mm_add_ps(_tA, _tC); \
    _tA = _mm_add_ps(_tA, _tB); \
    _tA = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
    _tC = _mm_loadu_ps(fshiftptr); \
    _tC = _mm_add_ps(_tC, _tA); \
    _mm_storeu_ps(fshiftptr, _tC); \
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
                                         __m256 fix4, __m256 fiy4, __m256 fiz4,
                                         float * gmx_restrict fptr,
                                         float * gmx_restrict fshiftptr)
{
    __m256 t1, t2, t3;
    __m128 tA, tB, tC;

    fix1 = _mm256_hadd_ps(fix1, fiy1);                /* Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
    fiz1 = _mm256_hadd_ps(fiz1, fix2);                /* X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
    fiy2 = _mm256_hadd_ps(fiy2, fiz2);                /* Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
    fix3 = _mm256_hadd_ps(fix3, fiy3);                /* Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
    fiz3 = _mm256_hadd_ps(fiz3, fix4);                /* X4g+X4h X4e+X4f Z3g+Z3h Z3e+Z3f | X4c+X4d X4a+X4b Z3c+Z3d Z3a+Z3b */
    fiy4 = _mm256_hadd_ps(fiy4, fiz4);                /* Z4g+Z4h Z4e+Z4f Y4g+Y4h Y4e+Y4f | Z4c+Z4d Z4a+Z4b Y4c+Y4d Y4a+Y4b */

    fix1 = _mm256_hadd_ps(fix1, fiz1);                /* X2e-h Z1e-h Y1e-h X1e-h | X2a-d Z1a-d Y1a-d X1a-d */
    fiy2 = _mm256_hadd_ps(fiy2, fix3);                /* Y3e-h X3e-h Z2e-h Y2e-h | Y3a-d X3a-d Z2a-d Y2a-d */
    fiz3 = _mm256_hadd_ps(fiz3, fiy4);                /* Z4e-h Y4e-h X4e-h Z3e-h | Z4a-d Y4a-d X4a-d Z3a-d */

    /* Add across the two lanes by swapping and adding back */
    t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2);        /* Y3a-d X3a-d Z2a-d Y2a-d | X2a-d Z1a-d Y1a-d X1a-d */
    t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2);        /* Y3e-h X3e-h Z2e-h Y2e-h | X2e-h Z1e-h Y1e-h X1e-h */
    t1 = _mm256_add_ps(t1, t2);                       /* y3 x3 z2 y2 | x2 z1 y1 x1 */

    tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); /* z4 y4 x4 z3 */

    t3 = _mm256_loadu_ps(fptr);
    t3 = _mm256_add_ps(t3, t1);
    _mm256_storeu_ps(fptr, t3);

    tB = _mm_loadu_ps(fptr+8);
    tB = _mm_add_ps(tB, tA);
    _mm_storeu_ps(fptr+8, tB);

    /* Add up shift force */
    tB = _mm256_extractf128_ps(t1, 0x1);                                          /* y3 x3 z2 y2 */
    tC = _mm_shuffle_ps(_mm256_castps256_ps128(t1), tB, _MM_SHUFFLE(1, 0, 3, 3)); /* z2 y2 x2 x2 */
    tB = _mm_shuffle_ps(tB, tA, _MM_SHUFFLE(1, 0, 3, 2));                         /* x4 z3 y3 x3 */
    tC = _mm_permute_ps(tC, _MM_SHUFFLE(3, 3, 2, 0));                             /* - z2 y2 x2 */
    tA = _mm_permute_ps(tA, _MM_SHUFFLE(0, 3, 2, 1));                             /* - z4 y4 x4 */

    tB = _mm_add_ps(tB, _mm256_castps256_ps128(t1));
    tA = _mm_add_ps(tA, tC);
    tA = _mm_add_ps(tA, tB);

    tA = _mm_blend_ps(_mm_setzero_ps(), tA, 0x7);                                 /* 0 z y x */

    tC = _mm_loadu_ps(fshiftptr);
    tC = _mm_add_ps(tC, tA);
    _mm_storeu_ps(fshiftptr, tC);
}

#endif
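
/* The 4-atom variant works like the 3-atom one, except that the fourth atom
 * completes the reduction: fptr[0..7] receives x1..y3 via the 256-bit store,
 * fptr[8..11] is updated with a full 4-float store (z3, x4, y4, z4), and
 * fshiftptr[0..2] receives the dimension-wise sums over all four atoms.
 */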

static gmx_inline void
gmx_mm256_update_1pot_ps(__m256 pot1, float * gmx_restrict ptrA)
{
    __m128 t1;

    pot1 = _mm256_hadd_ps(pot1, pot1);
    pot1 = _mm256_hadd_ps(pot1, pot1);

    t1 = _mm_add_ps(_mm256_castps256_ps128(pot1), _mm256_extractf128_ps(pot1, 0x1));

    _mm_store_ss(ptrA, _mm_add_ss(_mm_load_ss(ptrA), t1));
}
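
/* gmx_mm256_update_1pot_ps() above is a plain horizontal reduction: the eight
 * partial energies in pot1 are summed and the total is added to *ptrA.
 */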

static gmx_inline void
gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
                         __m256 pot2, float * gmx_restrict ptrB)
{
    __m128 t1, t2;

    pot1 = _mm256_hadd_ps(pot1, pot2);
    pot1 = _mm256_hadd_ps(pot1, pot1);

    t1 = _mm_add_ps(_mm256_castps256_ps128(pot1), _mm256_extractf128_ps(pot1, 0x1));

    t2 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    _mm_store_ss(ptrA, _mm_add_ss(_mm_load_ss(ptrA), t1));
    _mm_store_ss(ptrB, _mm_add_ss(_mm_load_ss(ptrB), t2));
}
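
/* The 2-potential version reduces both accumulators in one pass: after the
 * two hadd steps and the cross-lane add, element 0 of t1 holds the total of
 * pot1 and element 1 the total of pot2, which are added to *ptrA and *ptrB
 * respectively.
 */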

#endif /* _kernelutil_x86_avx_256_single_h_ */