/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_256_double_h_
#define _kernelutil_x86_avx_256_double_h_

#define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)

#define _GMX_MM_BLEND256D(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
#define _GMX_MM_PERMUTE(fp3, fp2, fp1, fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define _GMX_MM_PERMUTE128D(fp1, fp0) (((fp1) << 1) | ((fp0)))
#define _GMX_MM_PERMUTE256D(fp3, fp2, fp1, fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
#define GMX_MM256_FULLTRANSPOSE4_PD(row0, row1, row2, row3) \
    {                                                        \
        __m256d _t0, _t1, _t2, _t3;                          \
        _t0  = _mm256_unpacklo_pd((row0), (row1));           \
        _t1  = _mm256_unpackhi_pd((row0), (row1));           \
        _t2  = _mm256_unpacklo_pd((row2), (row3));           \
        _t3  = _mm256_unpackhi_pd((row2), (row3));           \
        row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20);       \
        row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20);       \
        row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31);       \
        row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31);       \
    }
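
/* Illustrative usage sketch (not used by the kernels in this file): the
 * transpose macro above turns four per-atom rows of four doubles into four
 * per-component registers. The pointer p and register names are hypothetical.
 *
 *     __m256d r0 = _mm256_loadu_pd(p);      holds  q0 z0 | y0 x0
 *     __m256d r1 = _mm256_loadu_pd(p+4);    holds  q1 z1 | y1 x1
 *     __m256d r2 = _mm256_loadu_pd(p+8);    holds  q2 z2 | y2 x2
 *     __m256d r3 = _mm256_loadu_pd(p+12);   holds  q3 z3 | y3 x3
 *
 *     GMX_MM256_FULLTRANSPOSE4_PD(r0, r1, r2, r3);
 *
 * afterwards r0 = x3 x2 | x1 x0, r1 = y3 y2 | y1 y0,
 *            r2 = z3 z2 | z1 z0, r3 = q3 q2 | q1 q0.
 */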

#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))

static gmx_inline __m256d
gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
{
    return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
}

static gmx_inline __m256d
gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
{
    return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
}

static gmx_inline __m256d
gmx_mm256_set_m128d(__m128d hi, __m128d lo)
{
    return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
}

static gmx_inline __m256
gmx_mm256_set_m128(__m128 hi, __m128 lo)
{
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
}

static gmx_inline int
gmx_mm256_any_lt(__m256d a, __m256d b)
{
    return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
}

static gmx_inline __m256d
gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
{
    return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
}

/* Normal sum of four ymm registers */
#define gmx_mm256_sum4_pd(t0, t1, t2, t3) _mm256_add_pd(_mm256_add_pd(t0, t1), _mm256_add_pd(t2, t3))
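
/* Illustrative usage sketch (not used directly here): with the coordinate
 * differences between one i-atom and four j-atoms already in ymm registers,
 * gmx_mm256_calc_rsq_pd() yields the four squared distances in one register.
 * The dx/dy/dz names are hypothetical; the kernels obtain them with
 * _mm256_sub_pd on broadcast i-coordinates and swizzled j-coordinates.
 *
 *     __m256d rsq = gmx_mm256_calc_rsq_pd(dx, dy, dz);   gives dx*dx+dy*dy+dz*dz
 */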

/* Load a single value from 1-4 places, merge into the low half of a ymm register */
static gmx_inline __m256d
gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
}

static gmx_inline __m256d
gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                                const double * gmx_restrict ptrB)
{
    __m128d tA, tB;

    tA = _mm_load_sd(ptrA);
    tB = _mm_load_sd(ptrB);

    return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA, tB));
}

static gmx_inline __m256d
gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
{
    __m128d t1, t2;

    t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
    t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC), _mm_load_sd(ptrD));
    return gmx_mm256_set_m128d(t2, t1);
}
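
/* Illustrative usage sketch: gathering one scalar per j-atom, e.g. charges,
 * into a single ymm register. The charge array and jnr indices below are
 * hypothetical stand-ins for what the kernels pass in.
 *
 *     __m256d jq = gmx_mm256_load_4real_swizzle_pd(charge+jnrA, charge+jnrB,
 *                                                  charge+jnrC, charge+jnrD);
 *
 * which leaves jq = qD qC | qB qA.
 */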

static gmx_inline void
gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
}

static gmx_inline void
gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
    __m256d t2;

    t2 = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
    _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
}

static gmx_inline void
gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                 double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
    __m256d t2;
    __m128d t3, t4;

    t2 = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
    t3 = _mm256_extractf128_pd(xmm1, 0x1);
    t4 = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
    _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
    _mm_store_sd(ptrC, t3);
    _mm_store_sd(ptrD, t4);
}

static gmx_inline void
gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
    __m128d t1;

    t1 = _mm256_castpd256_pd128(xmm1);
    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));

    _mm_store_sd(ptrA, t1);
}

static gmx_inline void
gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
    __m128d t1, t2;

    t1 = _mm256_castpd256_pd128(xmm1);
    t2 = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));

    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));
    t2 = _mm_add_sd(t2, _mm_load_sd(ptrB));

    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrB, t2);
}

static gmx_inline void
gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                     double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
    __m128d t1, t2, t3, t4;

    t1 = _mm256_castpd256_pd128(xmm1);
    t2 = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
    t3 = _mm256_extractf128_pd(xmm1, 0x1);
    t4 = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));

    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));
    t2 = _mm_add_sd(t2, _mm_load_sd(ptrB));
    t3 = _mm_add_sd(t3, _mm_load_sd(ptrC));
    t4 = _mm_add_sd(t4, _mm_load_sd(ptrD));

    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrB, t2);
    _mm_store_sd(ptrC, t3);
    _mm_store_sd(ptrD, t4);
}
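
/* Illustrative note: the store/increment routines above are the scatter
 * counterparts of the loads, e.g. adding the four elements of one accumulator
 * register back to four separate memory locations (hypothetical names):
 *
 *     gmx_mm256_increment_4real_swizzle_pd(dvda+jnrA, dvda+jnrB,
 *                                          dvda+jnrC, dvda+jnrD, dvdasum);
 */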

static gmx_inline void
gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
{
    *c6  = _mm256_castpd128_pd256(_mm_load_sd(p1));
    *c12 = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
}

static gmx_inline void
gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
{
    __m128d t1, t2;

    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1, t2));
    *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1, t2));
}

static gmx_inline void
gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
                                const double * gmx_restrict p3, const double * gmx_restrict p4,
                                __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
{
    __m256d t1, t2;

    t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3), _mm_loadu_pd(p1)); /* c12c c6c | c12a c6a */
    t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4), _mm_loadu_pd(p2)); /* c12d c6d | c12b c6b */

    *c6  = _mm256_unpacklo_pd(t1, t2);                              /* c6d  c6c  | c6b  c6a  */
    *c12 = _mm256_unpackhi_pd(t1, t2);                              /* c12d c12c | c12b c12a */
}
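
/* Illustrative usage sketch: loading Lennard-Jones C6/C12 parameters for four
 * j-atoms at once. Each pointer addresses a (c6,c12) pair in the nonbonded
 * parameter table; the vdwparam/vdwjidx names are hypothetical stand-ins.
 *
 *     __m256d c6, c12;
 *     gmx_mm256_load_4pair_swizzle_pd(vdwparam+vdwjidxA, vdwparam+vdwjidxB,
 *                                     vdwparam+vdwjidxC, vdwparam+vdwjidxD,
 *                                     &c6, &c12);
 */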

static gmx_inline void
gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1,
                                            __m256d * gmx_restrict y1,
                                            __m256d * gmx_restrict z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz, tx, ty, tz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    tx  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));

    *x1 = gmx_mm256_set_m128d(tx, tx);
    *y1 = gmx_mm256_set_m128d(ty, ty);
    *z1 = gmx_mm256_set_m128d(tz, tz);
}
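
/* Illustrative usage sketch: loading the shifted coordinates of one i-atom
 * and broadcasting each component across a ymm register, ready to interact
 * with four j-atoms at a time. The offset names are hypothetical.
 *
 *     __m256d ix, iy, iz;
 *     gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,
 *                                                 x+i_coord_offset,
 *                                                 &ix, &iy, &iz);
 */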

static gmx_inline void
gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz, tx, ty, tz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    tx  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x1 = gmx_mm256_set_m128d(tx, tx);
    *y1 = gmx_mm256_set_m128d(ty, ty);
    *z1 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x2 = gmx_mm256_set_m128d(tx, tx);
    *y2 = gmx_mm256_set_m128d(ty, ty);
    *z2 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x3 = gmx_mm256_set_m128d(tx, tx);
    *y3 = gmx_mm256_set_m128d(ty, ty);
    *z3 = gmx_mm256_set_m128d(tz, tz);
}

static gmx_inline void
gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                            __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz, tx, ty, tz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    tx  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x1 = gmx_mm256_set_m128d(tx, tx);
    *y1 = gmx_mm256_set_m128d(ty, ty);
    *z1 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x2 = gmx_mm256_set_m128d(tx, tx);
    *y2 = gmx_mm256_set_m128d(ty, ty);
    *z2 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x3 = gmx_mm256_set_m128d(tx, tx);
    *y3 = gmx_mm256_set_m128d(ty, ty);
    *z3 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
    *x4 = gmx_mm256_set_m128d(tx, tx);
    *y4 = gmx_mm256_set_m128d(ty, ty);
    *z4 = gmx_mm256_set_m128d(tz, tz);
}

static gmx_inline void
gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
{
    __m256d t1;

    t1 = _mm256_loadu_pd(p1);

    *x = t1;
    *y = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
}

static gmx_inline void
gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m256d t1, t2, t3, t4;

    t1  = _mm256_loadu_pd(p1);
    t3  = _mm256_loadu_pd(p1+4);

    *x1 = t1;
    *y2 = t3;

    t2  = gmx_mm256_unpack128hi_pd(t1, t1);
    t4  = gmx_mm256_unpack128hi_pd(t3, t3);

    *z1 = t2;
    *x3 = t4;

    *y1 = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z2 = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x2 = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *y3 = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));

    *z3 = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
}

static gmx_inline void
gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m256d t1, t2, t3, t4, t5, t6;

    t1  = _mm256_loadu_pd(p1);
    t2  = _mm256_loadu_pd(p1+4);
    t3  = _mm256_loadu_pd(p1+8);

    t4  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
    t5  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2, 0x1));
    t6  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3, 0x1));

    *x1 = t1;
    *y2 = t2;
    *z3 = t3;

    *z1 = t4;
    *x3 = t5;
    *y4 = t6;

    *y1 = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z2 = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x4 = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x2 = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *y3 = _mm256_permute_pd(t5, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z4 = _mm256_permute_pd(t6, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
}

static gmx_inline void
gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
{
    __m256d t1, t2, t3, t4, t5, t6;

    t1  = _mm256_loadu_pd(ptrA);      /*  -  z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);      /*  -  z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);      /*  -  z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);      /*  -  z1d | y1d x1d */

    t5  = _mm256_unpacklo_pd(t1, t2); /* z1b z1a | x1b x1a */
    t6  = _mm256_unpackhi_pd(t1, t2); /*  -   -  | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4); /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4); /*  -   -  | y1d y1c */

    *x1 = gmx_mm256_unpack128lo_pd(t5, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t6, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t5, t1);
}
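
/* Illustrative usage sketch: the *_4ptr loads gather array-of-structures
 * coordinates (x,y,z per atom) from four different j-atoms and swizzle them
 * into structure-of-arrays registers, one component for all four atoms.
 * The offset names are hypothetical.
 *
 *     __m256d jx, jy, jz;
 *     gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA, x+j_coord_offsetB,
 *                                          x+j_coord_offsetC, x+j_coord_offsetD,
 *                                          &jx, &jy, &jz);
 *
 * leaving jx = x1d x1c | x1b x1a, and similarly for jy and jz.
 */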

static gmx_inline void
gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;

    t1  = _mm256_loadu_pd(ptrA);                       /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);                       /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);                       /* x2c z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);                       /* x2d z1d | y1d x1d */
    t5  = _mm256_loadu_pd(ptrA+4);                     /* y3a x3a | z2a y2a */
    t6  = _mm256_loadu_pd(ptrB+4);                     /* y3b x3b | z2b y2b */
    t7  = _mm256_loadu_pd(ptrC+4);                     /* y3c x3c | z2c y2c */
    t8  = _mm256_loadu_pd(ptrD+4);                     /* y3d x3d | z2d y2d */
    t9  = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /*  -   -  |  -  z3a */
    t10 = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /*  -   -  |  -  z3b */
    t11 = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8)); /*  -   -  |  -  z3c */
    t12 = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8)); /*  -   -  |  -  z3d */

    t13 = _mm256_unpacklo_pd(t1, t2);                  /* z1b z1a | x1b x1a */
    t14 = _mm256_unpackhi_pd(t1, t2);                  /* x2b x2a | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4);                  /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4);                  /* x2d x2c | y1d y1c */

    t3  = _mm256_unpacklo_pd(t5, t6);                  /* x3b x3a | y2b y2a */
    t4  = _mm256_unpackhi_pd(t5, t6);                  /* y3b y3a | z2b z2a */
    t5  = _mm256_unpacklo_pd(t7, t8);                  /* x3d x3c | y2d y2c */
    t6  = _mm256_unpackhi_pd(t7, t8);                  /* y3d y3c | z2d z2c */

    t9  = _mm256_unpacklo_pd(t9, t10);                 /*  -   -  | z3b z3a */
    t11 = _mm256_unpacklo_pd(t11, t12);                /*  -   -  | z3d z3c */

    *x1 = gmx_mm256_unpack128lo_pd(t13, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t14, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t13, t1);
    *x2 = gmx_mm256_unpack128hi_pd(t14, t2);
    *y2 = gmx_mm256_unpack128lo_pd(t3, t5);
    *z2 = gmx_mm256_unpack128lo_pd(t4, t6);
    *x3 = gmx_mm256_unpack128hi_pd(t3, t5);
    *y3 = gmx_mm256_unpack128hi_pd(t4, t6);
    *z3 = gmx_mm256_unpack128lo_pd(t9, t11);
}

static gmx_inline void
gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;

    t1  = _mm256_loadu_pd(ptrA);        /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);        /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);        /* x2c z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);        /* x2d z1d | y1d x1d */
    t5  = _mm256_loadu_pd(ptrA+4);      /* y3a x3a | z2a y2a */
    t6  = _mm256_loadu_pd(ptrB+4);      /* y3b x3b | z2b y2b */
    t7  = _mm256_loadu_pd(ptrC+4);      /* y3c x3c | z2c y2c */
    t8  = _mm256_loadu_pd(ptrD+4);      /* y3d x3d | z2d y2d */
    t9  = _mm256_loadu_pd(ptrA+8);      /* z4a y4a | x4a z3a */
    t10 = _mm256_loadu_pd(ptrB+8);      /* z4b y4b | x4b z3b */
    t11 = _mm256_loadu_pd(ptrC+8);      /* z4c y4c | x4c z3c */
    t12 = _mm256_loadu_pd(ptrD+8);      /* z4d y4d | x4d z3d */

    t13 = _mm256_unpacklo_pd(t1, t2);   /* z1b z1a | x1b x1a */
    t14 = _mm256_unpackhi_pd(t1, t2);   /* x2b x2a | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4);   /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4);   /* x2d x2c | y1d y1c */

    t3  = _mm256_unpacklo_pd(t5, t6);   /* x3b x3a | y2b y2a */
    t4  = _mm256_unpackhi_pd(t5, t6);   /* y3b y3a | z2b z2a */
    t5  = _mm256_unpacklo_pd(t7, t8);   /* x3d x3c | y2d y2c */
    t6  = _mm256_unpackhi_pd(t7, t8);   /* y3d y3c | z2d z2c */

    t7  = _mm256_unpacklo_pd(t9, t10);  /* y4b y4a | z3b z3a */
    t8  = _mm256_unpackhi_pd(t9, t10);  /* z4b z4a | x4b x4a */
    t9  = _mm256_unpacklo_pd(t11, t12); /* y4d y4c | z3d z3c */
    t10 = _mm256_unpackhi_pd(t11, t12); /* z4d z4c | x4d x4c */

    *x1 = gmx_mm256_unpack128lo_pd(t13, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t14, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t13, t1);
    *x2 = gmx_mm256_unpack128hi_pd(t14, t2);
    *y2 = gmx_mm256_unpack128lo_pd(t3, t5);
    *z2 = gmx_mm256_unpack128lo_pd(t4, t6);
    *x3 = gmx_mm256_unpack128hi_pd(t3, t5);
    *y3 = gmx_mm256_unpack128hi_pd(t4, t6);
    *z3 = gmx_mm256_unpack128lo_pd(t7, t9);
    *x4 = gmx_mm256_unpack128lo_pd(t8, t10);
    *y4 = gmx_mm256_unpack128hi_pd(t7, t9);
    *z4 = gmx_mm256_unpack128hi_pd(t8, t10);
}

static gmx_inline void
gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1)
{
    __m256d t1, t2, tA, tB, tC, tD;
    __m256i mask;

    t1 = _mm256_unpacklo_pd(x1, y1);       /* y1c x1c | y1a x1a */
    t2 = _mm256_unpackhi_pd(x1, y1);       /* y1d x1d | y1b x1b */
    x1 = gmx_mm256_unpack128lo_pd(t1, z1); /*  -  z1a | y1a x1a */
    y1 = gmx_mm256_unpack128hi_pd(t1, z1); /*  -  z1c | y1c x1c */
    z1 = _mm256_permute_pd(z1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    t1 = gmx_mm256_unpack128lo_pd(t2, z1); /*  -  z1b | y1b x1b */
    z1 = gmx_mm256_unpack128hi_pd(t2, z1); /*  -  z1d | y1d x1d */

    /* Construct a mask without executing any data loads */
    mask = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
                                               _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ), 0x7));

    tA = _mm256_loadu_pd(ptrA);
    tB = _mm256_loadu_pd(ptrB);
    tC = _mm256_loadu_pd(ptrC);
    tD = _mm256_loadu_pd(ptrD);

    tA = _mm256_sub_pd(tA, x1);
    tB = _mm256_sub_pd(tB, t1);
    tC = _mm256_sub_pd(tC, y1);
    tD = _mm256_sub_pd(tD, z1);

    _mm256_maskstore_pd(ptrA, mask, tA);
    _mm256_maskstore_pd(ptrB, mask, tB);
    _mm256_maskstore_pd(ptrC, mask, tC);
    _mm256_maskstore_pd(ptrD, mask, tD);
}
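
/* Illustrative note: the 0x7 blend immediate above builds a mask selecting
 * only the three lowest doubles, so each maskstore updates x, y and z of one
 * force vector without touching the fourth double after it. A hypothetical
 * call site subtracting accumulated j-forces looks like:
 *
 *     gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(f+j_coord_offsetA, f+j_coord_offsetB,
 *                                               f+j_coord_offsetC, f+j_coord_offsetD,
 *                                               fjx, fjy, fjz);
 */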

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
                                                  _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
        __m128d _tA, _tB, _tC, _tD, _tE; \
        _t1  = _mm256_loadu_pd(ptrA); \
        _t2  = _mm256_loadu_pd(ptrB); \
        _t3  = _mm256_loadu_pd(ptrC); \
        _t4  = _mm256_loadu_pd(ptrD); \
        _t5  = _mm256_loadu_pd(ptrA+4); \
        _t6  = _mm256_loadu_pd(ptrB+4); \
        _t7  = _mm256_loadu_pd(ptrC+4); \
        _t8  = _mm256_loadu_pd(ptrD+4); \
        _tA  = _mm_load_sd(ptrA+8); \
        _tB  = _mm_load_sd(ptrB+8); \
        _tC  = _mm_load_sd(ptrC+8); \
        _tD  = _mm_load_sd(ptrD+8); \
        _t9  = _mm256_unpacklo_pd(_x1, _y1); \
        _x1  = _mm256_unpackhi_pd(_x1, _y1); \
        _y1  = _mm256_unpacklo_pd(_z1, _x2); \
        _z1  = _mm256_unpackhi_pd(_z1, _x2); \
        _x2  = _mm256_unpacklo_pd(_y2, _z2); \
        _y2  = _mm256_unpackhi_pd(_y2, _z2); \
        _z2  = _mm256_unpacklo_pd(_x3, _y3); \
        _x3  = _mm256_unpackhi_pd(_x3, _y3); \
        _t10 = gmx_mm256_unpack128lo_pd(_t9, _y1); \
        _y3  = gmx_mm256_unpack128hi_pd(_t9, _y1); \
        _t9  = gmx_mm256_unpack128lo_pd(_x1, _z1); \
        _y1  = gmx_mm256_unpack128hi_pd(_x1, _z1); \
        _x1  = gmx_mm256_unpack128lo_pd(_x2, _z2); \
        _z1  = gmx_mm256_unpack128hi_pd(_x2, _z2); \
        _x2  = gmx_mm256_unpack128lo_pd(_y2, _x3); \
        _z2  = gmx_mm256_unpack128hi_pd(_y2, _x3); \
        _t1  = _mm256_sub_pd(_t1, _t10); \
        _t2  = _mm256_sub_pd(_t2, _t9); \
        _t3  = _mm256_sub_pd(_t3, _y3); \
        _t4  = _mm256_sub_pd(_t4, _y1); \
        _t5  = _mm256_sub_pd(_t5, _x1); \
        _t6  = _mm256_sub_pd(_t6, _x2); \
        _t7  = _mm256_sub_pd(_t7, _z1); \
        _t8  = _mm256_sub_pd(_t8, _z2); \
        _tA  = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3)); \
        _tB  = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3), _GMX_MM_PERMUTE128D(1, 1))); \
        _tE  = _mm256_extractf128_pd(_z3, 0x1); \
        _tC  = _mm_sub_sd(_tC, _tE); \
        _tD  = _mm_sub_sd(_tD, _mm_permute_pd(_tE, _GMX_MM_PERMUTE128D(1, 1))); \
        _mm256_storeu_pd(ptrA, _t1); \
        _mm256_storeu_pd(ptrB, _t2); \
        _mm256_storeu_pd(ptrC, _t3); \
        _mm256_storeu_pd(ptrD, _t4); \
        _mm256_storeu_pd(ptrA+4, _t5); \
        _mm256_storeu_pd(ptrB+4, _t6); \
        _mm256_storeu_pd(ptrC+4, _t7); \
        _mm256_storeu_pd(ptrD+4, _t8); \
        _mm_store_sd(ptrA+8, _tA); \
        _mm_store_sd(ptrB+8, _tB); \
        _mm_store_sd(ptrC+8, _tC); \
        _mm_store_sd(ptrD+8, _tD); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_loadu_pd(ptrB);
    t3 = _mm256_loadu_pd(ptrC);
    t4 = _mm256_loadu_pd(ptrD);
    t5 = _mm256_loadu_pd(ptrA+4);
    t6 = _mm256_loadu_pd(ptrB+4);
    t7 = _mm256_loadu_pd(ptrC+4);
    t8 = _mm256_loadu_pd(ptrD+4);
    tA = _mm_load_sd(ptrA+8);
    tB = _mm_load_sd(ptrB+8);
    tC = _mm_load_sd(ptrC+8);
    tD = _mm_load_sd(ptrD+8);

    t9 = _mm256_unpacklo_pd(x1, y1);        /* y1c x1c | y1a x1a */
    x1 = _mm256_unpackhi_pd(x1, y1);        /* y1d x1d | y1b x1b */

    y1 = _mm256_unpacklo_pd(z1, x2);        /* x2c z1c | x2a z1a */
    z1 = _mm256_unpackhi_pd(z1, x2);        /* x2d z1d | x2b z1b */

    x2 = _mm256_unpacklo_pd(y2, z2);        /* z2c y2c | z2a y2a */
    y2 = _mm256_unpackhi_pd(y2, z2);        /* z2d y2d | z2b y2b */

    z2 = _mm256_unpacklo_pd(x3, y3);        /* y3c x3c | y3a x3a */
    x3 = _mm256_unpackhi_pd(x3, y3);        /* y3d x3d | y3b x3b */

    t10 = gmx_mm256_unpack128lo_pd(t9, y1); /* x2a z1a | y1a x1a */
    y3  = gmx_mm256_unpack128hi_pd(t9, y1); /* x2c z1c | y1c x1c */

    t9  = gmx_mm256_unpack128lo_pd(x1, z1); /* x2b z1b | y1b x1b */
    y1  = gmx_mm256_unpack128hi_pd(x1, z1); /* x2d z1d | y1d x1d */

    x1  = gmx_mm256_unpack128lo_pd(x2, z2); /* y3a x3a | z2a y2a */
    z1  = gmx_mm256_unpack128hi_pd(x2, z2); /* y3c x3c | z2c y2c */

    x2  = gmx_mm256_unpack128lo_pd(y2, x3); /* y3b x3b | z2b y2b */
    z2  = gmx_mm256_unpack128hi_pd(y2, x3); /* y3d x3d | z2d y2d */

    t1 = _mm256_sub_pd(t1, t10);
    t2 = _mm256_sub_pd(t2, t9);
    t3 = _mm256_sub_pd(t3, y3);
    t4 = _mm256_sub_pd(t4, y1);
    t5 = _mm256_sub_pd(t5, x1);
    t6 = _mm256_sub_pd(t6, x2);
    t7 = _mm256_sub_pd(t7, z1);
    t8 = _mm256_sub_pd(t8, z2);

    tA = _mm_sub_sd(tA, _mm256_castpd256_pd128(z3));
    tB = _mm_sub_sd(tB, _mm_permute_pd(_mm256_castpd256_pd128(z3), _GMX_MM_PERMUTE128D(1, 1)));
    tE = _mm256_extractf128_pd(z3, 0x1);
    tC = _mm_sub_sd(tC, tE);
    tD = _mm_sub_sd(tD, _mm_permute_pd(tE, _GMX_MM_PERMUTE128D(1, 1)));

    /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
    _mm256_storeu_pd(ptrA, t1);
    _mm256_storeu_pd(ptrB, t2);
    _mm256_storeu_pd(ptrC, t3);
    _mm256_storeu_pd(ptrD, t4);
    _mm256_storeu_pd(ptrA+4, t5);
    _mm256_storeu_pd(ptrB+4, t6);
    _mm256_storeu_pd(ptrC+4, t7);
    _mm256_storeu_pd(ptrD+4, t8);
    _mm_store_sd(ptrA+8, tA);
    _mm_store_sd(ptrB+8, tB);
    _mm_store_sd(ptrC+8, tC);
    _mm_store_sd(ptrD+8, tD);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
                                                  _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12, _t13, _t14; \
        __m128d _tA, _tB, _tC, _tD, _tE; \
        _t1  = _mm256_loadu_pd(ptrA); \
        _t2  = _mm256_loadu_pd(ptrB); \
        _t3  = _mm256_loadu_pd(ptrC); \
        _t4  = _mm256_loadu_pd(ptrD); \
        _t5  = _mm256_loadu_pd(ptrA+4); \
        _t6  = _mm256_loadu_pd(ptrB+4); \
        _t7  = _mm256_loadu_pd(ptrC+4); \
        _t8  = _mm256_loadu_pd(ptrD+4); \
        _t9  = _mm256_loadu_pd(ptrA+8); \
        _t10 = _mm256_loadu_pd(ptrB+8); \
        _t11 = _mm256_loadu_pd(ptrC+8); \
        _t12 = _mm256_loadu_pd(ptrD+8); \
        _t13 = _mm256_unpacklo_pd(_x1, _y1); \
        _x1  = _mm256_unpackhi_pd(_x1, _y1); \
        _y1  = _mm256_unpacklo_pd(_z1, _x2); \
        _z1  = _mm256_unpackhi_pd(_z1, _x2); \
        _x2  = _mm256_unpacklo_pd(_y2, _z2); \
        _y2  = _mm256_unpackhi_pd(_y2, _z2); \
        _z2  = _mm256_unpacklo_pd(_x3, _y3); \
        _x3  = _mm256_unpackhi_pd(_x3, _y3); \
        _y3  = _mm256_unpacklo_pd(_z3, _x4); \
        _z3  = _mm256_unpackhi_pd(_z3, _x4); \
        _x4  = _mm256_unpacklo_pd(_y4, _z4); \
        _y4  = _mm256_unpackhi_pd(_y4, _z4); \
        _z4  = gmx_mm256_unpack128lo_pd(_t13, _y1); \
        _t13 = gmx_mm256_unpack128hi_pd(_t13, _y1); \
        _y1  = gmx_mm256_unpack128lo_pd(_x1, _z1); \
        _x1  = gmx_mm256_unpack128hi_pd(_x1, _z1); \
        _z1  = gmx_mm256_unpack128lo_pd(_x2, _z2); \
        _x2  = gmx_mm256_unpack128hi_pd(_x2, _z2); \
        _z2  = gmx_mm256_unpack128lo_pd(_y2, _x3); \
        _y2  = gmx_mm256_unpack128hi_pd(_y2, _x3); \
        _x3  = gmx_mm256_unpack128lo_pd(_y3, _x4); \
        _y3  = gmx_mm256_unpack128hi_pd(_y3, _x4); \
        _x4  = gmx_mm256_unpack128lo_pd(_z3, _y4); \
        _z3  = gmx_mm256_unpack128hi_pd(_z3, _y4); \
        _t1  = _mm256_sub_pd(_t1, _z4); \
        _t2  = _mm256_sub_pd(_t2, _y1); \
        _t3  = _mm256_sub_pd(_t3, _t13); \
        _t4  = _mm256_sub_pd(_t4, _x1); \
        _t5  = _mm256_sub_pd(_t5, _z1); \
        _t6  = _mm256_sub_pd(_t6, _z2); \
        _t7  = _mm256_sub_pd(_t7, _x2); \
        _t8  = _mm256_sub_pd(_t8, _y2); \
        _t9  = _mm256_sub_pd(_t9, _x3); \
        _t10 = _mm256_sub_pd(_t10, _x4); \
        _t11 = _mm256_sub_pd(_t11, _y3); \
        _t12 = _mm256_sub_pd(_t12, _z3); \
        _mm256_storeu_pd(ptrA, _t1); \
        _mm256_storeu_pd(ptrB, _t2); \
        _mm256_storeu_pd(ptrC, _t3); \
        _mm256_storeu_pd(ptrD, _t4); \
        _mm256_storeu_pd(ptrA+4, _t5); \
        _mm256_storeu_pd(ptrB+4, _t6); \
        _mm256_storeu_pd(ptrC+4, _t7); \
        _mm256_storeu_pd(ptrD+4, _t8); \
        _mm256_storeu_pd(ptrA+8, _t9); \
        _mm256_storeu_pd(ptrB+8, _t10); \
        _mm256_storeu_pd(ptrC+8, _t11); \
        _mm256_storeu_pd(ptrD+8, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3,
                                          __m256d x4, __m256d y4, __m256d z4)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
    __m128d tA, tB, tC, tD, tE;

    t1  = _mm256_loadu_pd(ptrA);
    t2  = _mm256_loadu_pd(ptrB);
    t3  = _mm256_loadu_pd(ptrC);
    t4  = _mm256_loadu_pd(ptrD);
    t5  = _mm256_loadu_pd(ptrA+4);
    t6  = _mm256_loadu_pd(ptrB+4);
    t7  = _mm256_loadu_pd(ptrC+4);
    t8  = _mm256_loadu_pd(ptrD+4);
    t9  = _mm256_loadu_pd(ptrA+8);
    t10 = _mm256_loadu_pd(ptrB+8);
    t11 = _mm256_loadu_pd(ptrC+8);
    t12 = _mm256_loadu_pd(ptrD+8);

    t13 = _mm256_unpacklo_pd(x1, y1);        /* y1c x1c | y1a x1a */
    x1  = _mm256_unpackhi_pd(x1, y1);        /* y1d x1d | y1b x1b */
    y1  = _mm256_unpacklo_pd(z1, x2);        /* x2c z1c | x2a z1a */
    z1  = _mm256_unpackhi_pd(z1, x2);        /* x2d z1d | x2b z1b */
    x2  = _mm256_unpacklo_pd(y2, z2);        /* z2c y2c | z2a y2a */
    y2  = _mm256_unpackhi_pd(y2, z2);        /* z2d y2d | z2b y2b */
    z2  = _mm256_unpacklo_pd(x3, y3);        /* y3c x3c | y3a x3a */
    x3  = _mm256_unpackhi_pd(x3, y3);        /* y3d x3d | y3b x3b */
    y3  = _mm256_unpacklo_pd(z3, x4);        /* x4c z3c | x4a z3a */
    z3  = _mm256_unpackhi_pd(z3, x4);        /* x4d z3d | x4b z3b */
    x4  = _mm256_unpacklo_pd(y4, z4);        /* z4c y4c | z4a y4a */
    y4  = _mm256_unpackhi_pd(y4, z4);        /* z4d y4d | z4b y4b */

    z4  = gmx_mm256_unpack128lo_pd(t13, y1); /* x2a z1a | y1a x1a */
    t13 = gmx_mm256_unpack128hi_pd(t13, y1); /* x2c z1c | y1c x1c */
    y1  = gmx_mm256_unpack128lo_pd(x1, z1);  /* x2b z1b | y1b x1b */
    x1  = gmx_mm256_unpack128hi_pd(x1, z1);  /* x2d z1d | y1d x1d */
    z1  = gmx_mm256_unpack128lo_pd(x2, z2);  /* y3a x3a | z2a y2a */
    x2  = gmx_mm256_unpack128hi_pd(x2, z2);  /* y3c x3c | z2c y2c */
    z2  = gmx_mm256_unpack128lo_pd(y2, x3);  /* y3b x3b | z2b y2b */
    y2  = gmx_mm256_unpack128hi_pd(y2, x3);  /* y3d x3d | z2d y2d */
    x3  = gmx_mm256_unpack128lo_pd(y3, x4);  /* z4a y4a | x4a z3a */
    y3  = gmx_mm256_unpack128hi_pd(y3, x4);  /* z4c y4c | x4c z3c */
    x4  = gmx_mm256_unpack128lo_pd(z3, y4);  /* z4b y4b | x4b z3b */
    z3  = gmx_mm256_unpack128hi_pd(z3, y4);  /* z4d y4d | x4d z3d */

    t1  = _mm256_sub_pd(t1, z4);
    t2  = _mm256_sub_pd(t2, y1);
    t3  = _mm256_sub_pd(t3, t13);
    t4  = _mm256_sub_pd(t4, x1);
    t5  = _mm256_sub_pd(t5, z1);
    t6  = _mm256_sub_pd(t6, z2);
    t7  = _mm256_sub_pd(t7, x2);
    t8  = _mm256_sub_pd(t8, y2);
    t9  = _mm256_sub_pd(t9, x3);
    t10 = _mm256_sub_pd(t10, x4);
    t11 = _mm256_sub_pd(t11, y3);
    t12 = _mm256_sub_pd(t12, z3);

    /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
    _mm256_storeu_pd(ptrA, t1);
    _mm256_storeu_pd(ptrB, t2);
    _mm256_storeu_pd(ptrC, t3);
    _mm256_storeu_pd(ptrD, t4);
    _mm256_storeu_pd(ptrA+4, t5);
    _mm256_storeu_pd(ptrB+4, t6);
    _mm256_storeu_pd(ptrC+4, t7);
    _mm256_storeu_pd(ptrD+4, t8);
    _mm256_storeu_pd(ptrA+8, t9);
    _mm256_storeu_pd(ptrB+8, t10);
    _mm256_storeu_pd(ptrC+8, t11);
    _mm256_storeu_pd(ptrD+8, t12);
}
#endif

static gmx_inline void
gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2;
    __m128d tA, tB;

    fix1 = _mm256_hadd_pd(fix1, fiy1);
    fiz1 = _mm256_hadd_pd(fiz1, _mm256_setzero_pd());

    /* Add across the two lanes */
    tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1), _mm256_extractf128_pd(fix1, 0x1));
    tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1), _mm256_extractf128_pd(fiz1, 0x1));

    fix1 = gmx_mm256_set_m128d(tB, tA); /* 0 fiz fiy fix */

    t1 = _mm256_loadu_pd(fptr);
    t2 = _mm256_loadu_pd(fshiftptr);

    t1 = _mm256_add_pd(t1, fix1);
    t2 = _mm256_add_pd(t2, fix1);

    _mm256_storeu_pd(fptr, t1);
    _mm256_storeu_pd(fshiftptr, t2);
}
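
/* Illustrative note: fix1/fiy1/fiz1 enter with four partial force sums each
 * (one per j-atom in the SIMD width). The hadd plus cross-lane add reduces
 * them to single totals, which are added both to the per-atom force array
 * (fptr) and to the shift-force array (fshiftptr). A hypothetical call site:
 *
 *     gmx_mm256_update_iforce_1atom_swizzle_pd(fix0, fiy0, fiz0,
 *                                              f+i_coord_offset,
 *                                              fshift+i_shift_offset);
 */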

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                                 fptr, fshiftptr) \
    { \
        __m256d _t1, _t2, _t3, _t4; \
        __m128d _tz3, _tA, _tB, _tC, _tD; \
        fix1 = _mm256_hadd_pd(fix1, fiy1); \
        fiz1 = _mm256_hadd_pd(fiz1, fix2); \
        fiy2 = _mm256_hadd_pd(fiy2, fiz2); \
        fix3 = _mm256_hadd_pd(fix3, fiy3); \
        fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); \
        _t1  = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
        _t2  = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
        _t1  = _mm256_add_pd(_t1, _t2); \
        _t3  = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
        _t4  = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
        _t3  = _mm256_add_pd(_t3, _t4); \
        _tz3 = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); \
        _t2  = _mm256_loadu_pd(fptr); \
        _t4  = _mm256_loadu_pd(fptr+4); \
        _tA  = _mm_load_sd(fptr+8); \
        _t2  = _mm256_add_pd(_t2, _t1); \
        _t4  = _mm256_add_pd(_t4, _t3); \
        _tA  = _mm_add_sd(_tA, _tz3); \
        _mm256_storeu_pd(fptr, _t2); \
        _mm256_storeu_pd(fptr+4, _t4); \
        _mm_store_sd(fptr+8, _tA); \
        _tB  = _mm256_extractf128_pd(_t1, 0x1); \
        _tC  = _mm256_extractf128_pd(_t3, 0x1); \
        _tz3 = _mm_add_sd(_tz3, _tB); \
        _tD  = _mm_permute_pd(_mm256_castpd256_pd128(_t3), _GMX_MM_PERMUTE128D(1, 1)); \
        _tz3 = _mm_add_sd(_tz3, _tD); \
        _tC  = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t1)); \
        _tD  = _mm_shuffle_pd(_tB, _mm256_castpd256_pd128(_t3), _MM_SHUFFLE2(0, 1)); \
        _tC  = _mm_add_pd(_tC, _tD); \
        _tA  = _mm_loadu_pd(fshiftptr); \
        _tB  = _mm_load_sd(fshiftptr+2); \
        _tA  = _mm_add_pd(_tA, _tC); \
        _tB  = _mm_add_sd(_tB, _tz3); \
        _mm_storeu_pd(fshiftptr, _tA); \
        _mm_store_sd(fshiftptr+2, _tB); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2, t3, t4;
    __m128d tz3, tA, tB, tC, tD;

    fix1 = _mm256_hadd_pd(fix1, fiy1);                /* Y1c-d X1c-d | Y1a-b X1a-b */
    fiz1 = _mm256_hadd_pd(fiz1, fix2);                /* X2c-d Z1c-d | X2a-b Z1a-b */
    fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /* Z2c-d Y2c-d | Z2a-b Y2a-b */
    fix3 = _mm256_hadd_pd(fix3, fiy3);                /* Y3c-d X3c-d | Y3a-b X3a-b */
    fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); /* 0     Z3c-d | 0     Z3a-b */

    /* Add across the two lanes by swapping and adding back */
    t1 = gmx_mm256_unpack128lo_pd(fix1, fiz1);        /* X2a-b Z1a-b | Y1a-b X1a-b */
    t2 = gmx_mm256_unpack128hi_pd(fix1, fiz1);        /* X2c-d Z1c-d | Y1c-d X1c-d */
    t1 = _mm256_add_pd(t1, t2);                       /* x2 z1 | y1 x1 */

    t3 = gmx_mm256_unpack128lo_pd(fiy2, fix3);        /* Y3a-b X3a-b | Z2a-b Y2a-b */
    t4 = gmx_mm256_unpack128hi_pd(fiy2, fix3);        /* Y3c-d X3c-d | Z2c-d Y2c-d */
    t3 = _mm256_add_pd(t3, t4);                       /* y3 x3 | z2 y2 */

    tz3 = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); /* 0 z3 */

    t2 = _mm256_loadu_pd(fptr);
    t4 = _mm256_loadu_pd(fptr+4);
    tA = _mm_load_sd(fptr+8);

    t2 = _mm256_add_pd(t2, t1);
    t4 = _mm256_add_pd(t4, t3);
    tA = _mm_add_sd(tA, tz3);

    _mm256_storeu_pd(fptr, t2);
    _mm256_storeu_pd(fptr+4, t4);
    _mm_store_sd(fptr+8, tA);

    /* Add up shift force */
    /* t1:   x2 z1 | y1 x1 */
    /* t3:   y3 x3 | z2 y2 */
    /* tz3:           0 z3 */

    tB  = _mm256_extractf128_pd(t1, 0x1);                                      /* x2 z1 */
    tC  = _mm256_extractf128_pd(t3, 0x1);                                      /* y3 x3 */
    tz3 = _mm_add_sd(tz3, tB);                                                 /* 0 z1+z3 */
    tD  = _mm_permute_pd(_mm256_castpd256_pd128(t3), _GMX_MM_PERMUTE128D(1, 1));
    tz3 = _mm_add_sd(tz3, tD);                                                 /* - z */

    tC  = _mm_add_pd(tC, _mm256_castpd256_pd128(t1));                          /* y1+y3 x1+x3 */

    tD  = _mm_shuffle_pd(tB, _mm256_castpd256_pd128(t3), _MM_SHUFFLE2(0, 1));  /* y2 x2 */
    tC  = _mm_add_pd(tC, tD);                                                  /* y x */

    tA  = _mm_loadu_pd(fshiftptr);
    tB  = _mm_load_sd(fshiftptr+2);
    tA  = _mm_add_pd(tA, tC);
    tB  = _mm_add_sd(tB, tz3);
    _mm_storeu_pd(fshiftptr, tA);
    _mm_store_sd(fshiftptr+2, tB);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                                 fptr, fshiftptr) \
    { \
        __m256d _t1, _t2, _t3, _t4, _t5, _t6; \
        __m128d _tA, _tB, _tC, _tD; \
        fix1 = _mm256_hadd_pd(fix1, fiy1); \
        fiz1 = _mm256_hadd_pd(fiz1, fix2); \
        fiy2 = _mm256_hadd_pd(fiy2, fiz2); \
        fix3 = _mm256_hadd_pd(fix3, fiy3); \
        fiz3 = _mm256_hadd_pd(fiz3, fix4); \
        fiy4 = _mm256_hadd_pd(fiy4, fiz4); \
        _t1  = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
        _t2  = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
        _t1  = _mm256_add_pd(_t1, _t2); \
        _t3  = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
        _t4  = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
        _t3  = _mm256_add_pd(_t3, _t4); \
        _t5  = gmx_mm256_unpack128lo_pd(fiz3, fiy4); \
        _t6  = gmx_mm256_unpack128hi_pd(fiz3, fiy4); \
        _t5  = _mm256_add_pd(_t5, _t6); \
        _t2  = _mm256_loadu_pd(fptr); \
        _t4  = _mm256_loadu_pd(fptr+4); \
        _t6  = _mm256_loadu_pd(fptr+8); \
        _t2  = _mm256_add_pd(_t2, _t1); \
        _t4  = _mm256_add_pd(_t4, _t3); \
        _t6  = _mm256_add_pd(_t6, _t5); \
        _mm256_storeu_pd(fptr, _t2); \
        _mm256_storeu_pd(fptr+4, _t4); \
        _mm256_storeu_pd(fptr+8, _t6); \
        _tA  = _mm256_extractf128_pd(_t1, 0x1); \
        _tB  = _mm256_extractf128_pd(_t3, 0x1); \
        _tC  = _mm256_extractf128_pd(_t5, 0x1); \
        _tB  = _mm_add_pd(_tB, _mm256_castpd256_pd128(_t1)); \
        _tA  = _mm_add_pd(_tA, _mm256_castpd256_pd128(_t5)); \
        _tC  = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t3)); \
        _tD  = _mm_shuffle_pd(_tA, _tC, _MM_SHUFFLE2(0, 1)); \
        _tB  = _mm_add_pd(_tB, _tD); \
        _tC  = _mm_permute_pd(_tC, _GMX_MM_PERMUTE128D(1, 1)); \
        _tC  = _mm_add_sd(_tC, _tA); \
        _tA  = _mm_loadu_pd(fshiftptr); \
        _tD  = _mm_load_sd(fshiftptr+2); \
        _tA  = _mm_add_pd(_tA, _tB); \
        _tD  = _mm_add_sd(_tD, _tC); \
        _mm_storeu_pd(fshiftptr, _tA); \
        _mm_store_sd(fshiftptr+2, _tD); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
                                         __m256d fix4, __m256d fiy4, __m256d fiz4,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2, t3, t4, t5, t6;
    __m128d tA, tB, tC, tD;

    fix1 = _mm256_hadd_pd(fix1, fiy1);                /* Y1c-d X1c-d | Y1a-b X1a-b */
    fiz1 = _mm256_hadd_pd(fiz1, fix2);                /* X2c-d Z1c-d | X2a-b Z1a-b */
    fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /* Z2c-d Y2c-d | Z2a-b Y2a-b */
    fix3 = _mm256_hadd_pd(fix3, fiy3);                /* Y3c-d X3c-d | Y3a-b X3a-b */
    fiz3 = _mm256_hadd_pd(fiz3, fix4);                /* X4c-d Z3c-d | X4a-b Z3a-b */
    fiy4 = _mm256_hadd_pd(fiy4, fiz4);                /* Z4c-d Y4c-d | Z4a-b Y4a-b */

    /* Add across the two lanes by swapping and adding back */
    t1 = gmx_mm256_unpack128lo_pd(fix1, fiz1);        /* X2a-b Z1a-b | Y1a-b X1a-b */
    t2 = gmx_mm256_unpack128hi_pd(fix1, fiz1);        /* X2c-d Z1c-d | Y1c-d X1c-d */
    t1 = _mm256_add_pd(t1, t2);                       /* x2 z1 | y1 x1 */

    t3 = gmx_mm256_unpack128lo_pd(fiy2, fix3);        /* Y3a-b X3a-b | Z2a-b Y2a-b */
    t4 = gmx_mm256_unpack128hi_pd(fiy2, fix3);        /* Y3c-d X3c-d | Z2c-d Y2c-d */
    t3 = _mm256_add_pd(t3, t4);                       /* y3 x3 | z2 y2 */

    t5 = gmx_mm256_unpack128lo_pd(fiz3, fiy4);        /* Z4a-b Y4a-b | X4a-b Z3a-b */
    t6 = gmx_mm256_unpack128hi_pd(fiz3, fiy4);        /* Z4c-d Y4c-d | X4c-d Z3c-d */
    t5 = _mm256_add_pd(t5, t6);                       /* z4 y4 | x4 z3 */

    t2 = _mm256_loadu_pd(fptr);
    t4 = _mm256_loadu_pd(fptr+4);
    t6 = _mm256_loadu_pd(fptr+8);

    t2 = _mm256_add_pd(t2, t1);
    t4 = _mm256_add_pd(t4, t3);
    t6 = _mm256_add_pd(t6, t5);

    _mm256_storeu_pd(fptr, t2);
    _mm256_storeu_pd(fptr+4, t4);
    _mm256_storeu_pd(fptr+8, t6);

    /* Add up shift force */
    /* t1:   x2. z1. | y1. x1. */
    /* t3:   y3. x3. | z2  y2  */
    /* t5:   z4  y4  | x4. z3. */

    tA = _mm256_extractf128_pd(t1, 0x1);              /* x2 z1 */
    tB = _mm256_extractf128_pd(t3, 0x1);              /* y3 x3 */
    tC = _mm256_extractf128_pd(t5, 0x1);              /* z4 y4 */

    tB = _mm_add_pd(tB, _mm256_castpd256_pd128(t1));  /* y1+y3 x1+x3 */
    tA = _mm_add_pd(tA, _mm256_castpd256_pd128(t5));  /* x2+x4 z1+z3 */
    tC = _mm_add_pd(tC, _mm256_castpd256_pd128(t3));  /* z4+z2 y4+y2 */

    tD = _mm_shuffle_pd(tA, tC, _MM_SHUFFLE2(0, 1));  /* y4+y2 x2+x4 */
    tB = _mm_add_pd(tB, tD);                          /* y x */
    tC = _mm_permute_pd(tC, _GMX_MM_PERMUTE128D(1, 1)); /* -  z4+z2 */
    tC = _mm_add_sd(tC, tA);                          /* -  z */

    tA = _mm_loadu_pd(fshiftptr);
    tD = _mm_load_sd(fshiftptr+2);
    tA = _mm_add_pd(tA, tB);
    tD = _mm_add_sd(tD, tC);
    _mm_storeu_pd(fshiftptr, tA);
    _mm_store_sd(fshiftptr+2, tD);
}
#endif

static gmx_inline void
gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
{
    __m128d t1;

    pot1 = _mm256_hadd_pd(pot1, pot1);

    t1 = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));

    _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
}

static gmx_inline void
gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
                         __m256d pot2, double * gmx_restrict ptrB)
{
    __m128d t1, t2;

    pot1 = _mm256_hadd_pd(pot1, pot2);

    t1 = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));

    t2 = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
    _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
    _mm_store_sd(ptrB, _mm_add_sd(_mm_load_sd(ptrB), t2));
}
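
/* Illustrative usage sketch: reducing the four-element potential accumulators
 * at the end of a kernel and adding them to the per-group energy terms. The
 * velecsum/vvdwsum accumulators and the energy pointers are hypothetical.
 *
 *     gmx_mm256_update_2pot_pd(velecsum, kernel_data->energygrp_elec+ggid,
 *                              vvdwsum,  kernel_data->energygrp_vdw+ggid);
 */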

#endif /* _kernelutil_x86_avx_256_double_h_ */