/*
 * This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifndef _kernelutil_x86_avx_256_double_h_
#define _kernelutil_x86_avx_256_double_h_

#include "gmx_x86_avx_256.h"
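
/* This header collects the SIMD helper routines used by the AVX-256
 * double-precision nonbonded kernels: loading interaction parameters,
 * gathering and scattering particle coordinates between the x/y/z layout
 * in memory and the four-lane layout of __m256d registers, and reducing
 * accumulated forces, shift forces, and energies back to memory.
 */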
static gmx_inline int
gmx_mm256_any_lt(__m256d a, __m256d b)
{
    return _mm256_movemask_pd(_mm256_cmp_pd(a,b,_CMP_LT_OQ));
}

static gmx_inline __m256d
gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
{
    return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx,dx), _mm256_mul_pd(dy,dy) ), _mm256_mul_pd(dz,dz) );
}

/* Normal sum of four ymm registers */
#define gmx_mm256_sum4_pd(t0,t1,t2,t3)  _mm256_add_pd(_mm256_add_pd(t0,t1),_mm256_add_pd(t2,t3))
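
/* Illustrative usage sketch (not part of the original kernels): with dx/dy/dz
 * holding the coordinate differences of four particle pairs, one per lane,
 *
 *     __m256d rsq = gmx_mm256_calc_rsq_pd(dx,dy,dz);
 *
 * gives dx*dx+dy*dy+dz*dz in each lane, and gmx_mm256_any_lt(rsq,rcut2)
 * (rcut2 being the squared cutoff, broadcast to all lanes) is nonzero if any
 * of the four pairs is inside the cutoff.
 */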
/* Load a single value from 1-4 places, merge into xmm register */
static gmx_inline __m256d
gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
}

static gmx_inline __m256d
gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                                const double * gmx_restrict ptrB)
{
    __m128d tA,tB;

    tA = _mm_load_sd(ptrA);
    tB = _mm_load_sd(ptrB);
    return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA,tB));
}

static gmx_inline __m256d
gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
{
    __m128d t1,t2;

    t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA),_mm_load_sd(ptrB));
    t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC),_mm_load_sd(ptrD));
    return gmx_mm256_set_m128(t2,t1);
}
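
/* Note: gmx_mm256_set_m128(hi,lo) places its second argument in the low
 * 128 bits, so the four values above end up in lane order ptrA,ptrB,ptrC,ptrD
 * with ptrA in the lowest element of the ymm register.
 */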
static gmx_inline void
gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
    _mm_store_sd(ptrA,_mm256_castpd256_pd128(xmm1));
}

static gmx_inline void
gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
    __m256d t2;

    t2 = _mm256_permute_pd(xmm1,_GMX_MM_PERMUTE256D(1,1,1,1));
    _mm_store_sd(ptrA,_mm256_castpd256_pd128(xmm1));
    _mm_store_sd(ptrB,_mm256_castpd256_pd128(t2));
}

static gmx_inline void
gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                 double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
    __m256d t2;
    __m128d t3,t4;

    t2 = _mm256_permute_pd(xmm1,_GMX_MM_PERMUTE256D(1,1,1,1));
    t3 = _mm256_extractf128_pd(xmm1,0x1);
    t4 = _mm_permute_pd(t3,_GMX_MM_PERMUTE128D(1,1));
    _mm_store_sd(ptrA,_mm256_castpd256_pd128(xmm1));
    _mm_store_sd(ptrB,_mm256_castpd256_pd128(t2));
    _mm_store_sd(ptrC,t3);
    _mm_store_sd(ptrD,t4);
}

static gmx_inline void
gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
    __m128d t1;

    t1 = _mm256_castpd256_pd128(xmm1);
    t1 = _mm_add_sd(t1,_mm_load_sd(ptrA));
    _mm_store_sd(ptrA,t1);
}

static gmx_inline void
gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
    __m128d t1,t2;

    t1 = _mm256_castpd256_pd128(xmm1);
    t2 = _mm_permute_pd(t1,_GMX_MM_PERMUTE128D(1,1));

    t1 = _mm_add_sd(t1,_mm_load_sd(ptrA));
    t2 = _mm_add_sd(t2,_mm_load_sd(ptrB));

    _mm_store_sd(ptrA,t1);
    _mm_store_sd(ptrB,t2);
}

static gmx_inline void
gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                     double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
    __m128d t1,t2,t3,t4;

    t1 = _mm256_castpd256_pd128(xmm1);
    t2 = _mm_permute_pd(t1,_GMX_MM_PERMUTE128D(1,1));
    t3 = _mm256_extractf128_pd(xmm1,0x1);
    t4 = _mm_permute_pd(t3,_GMX_MM_PERMUTE128D(1,1));

    t1 = _mm_add_sd(t1,_mm_load_sd(ptrA));
    t2 = _mm_add_sd(t2,_mm_load_sd(ptrB));
    t3 = _mm_add_sd(t3,_mm_load_sd(ptrC));
    t4 = _mm_add_sd(t4,_mm_load_sd(ptrD));

    _mm_store_sd(ptrA,t1);
    _mm_store_sd(ptrB,t2);
    _mm_store_sd(ptrC,t3);
    _mm_store_sd(ptrD,t4);
}

static gmx_inline void
gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
{
    *c6  = _mm256_castpd128_pd256(_mm_load_sd(p1));
    *c12 = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
}

static gmx_inline void
gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
{
    __m128d t1,t2;

    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1,t2));
    *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1,t2));
}

static gmx_inline void
gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
                                const double * gmx_restrict p3, const double * gmx_restrict p4,
                                __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
{
    __m256d t1,t2;

    t1   = gmx_mm256_set_m128(_mm_loadu_pd(p3),_mm_loadu_pd(p1)); /* c12c c6c | c12a c6a */
    t2   = gmx_mm256_set_m128(_mm_loadu_pd(p4),_mm_loadu_pd(p2)); /* c12d c6d | c12b c6b */

    *c6  = _mm256_unpacklo_pd(t1,t2);  /* c6d c6c | c6b c6a */
    *c12 = _mm256_unpackhi_pd(t1,t2);  /* c12d c12c | c12b c12a */
}
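
/* The (c6,c12) values loaded above are the Lennard-Jones parameters used by
 * the nonbonded kernels; they are stored as consecutive pairs in memory,
 * which is why each pointer is read with a two-double load and then
 * deinterleaved into separate c6 and c12 registers.
 */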
static gmx_inline void
gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1,
                                            __m256d * gmx_restrict y1,
                                            __m256d * gmx_restrict z1)
{
    __m128d mem_xy,mem_z,mem_sxy,mem_sz,tx,ty,tz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy,mem_sxy);
    mem_z   = _mm_add_pd(mem_z,mem_sz);

    tx  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(0,0));
    ty  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
    tz  = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));

    *x1 = gmx_mm256_set_m128(tx,tx);
    *y1 = gmx_mm256_set_m128(ty,ty);
    *z1 = gmx_mm256_set_m128(tz,tz);
}
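
/* The load_shift_and_*rvec_broadcast routines handle the i particle(s): the
 * periodic shift vector is added to the coordinates once, and each resulting
 * component is then replicated into all four lanes so it can be combined
 * with four j particles at a time.
 */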
static gmx_inline void
gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz,tx,ty,tz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
    syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));

    t1  = _mm_add_pd(t1,sxy);
    t2  = _mm_add_pd(t2,szx);
    t3  = _mm_add_pd(t3,syz);
    t4  = _mm_add_pd(t4,sxy);
    t5  = _mm_add_sd(t5,sz);

    tx  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
    ty  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
    tz  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
    *x1 = gmx_mm256_set_m128(tx,tx);
    *y1 = gmx_mm256_set_m128(ty,ty);
    *z1 = gmx_mm256_set_m128(tz,tz);
    tx  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
    ty  = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
    tz  = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
    *x2 = gmx_mm256_set_m128(tx,tx);
    *y2 = gmx_mm256_set_m128(ty,ty);
    *z2 = gmx_mm256_set_m128(tz,tz);
    tx  = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
    ty  = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
    tz  = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
    *x3 = gmx_mm256_set_m128(tx,tx);
    *y3 = gmx_mm256_set_m128(ty,ty);
    *z3 = gmx_mm256_set_m128(tz,tz);
}
static gmx_inline void
gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                            __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz,tx,ty,tz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
    syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));

    t1  = _mm_add_pd(t1,sxy);
    t2  = _mm_add_pd(t2,szx);
    t3  = _mm_add_pd(t3,syz);
    t4  = _mm_add_pd(t4,sxy);
    t5  = _mm_add_pd(t5,szx);
    t6  = _mm_add_pd(t6,syz);

    tx  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
    ty  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
    tz  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
    *x1 = gmx_mm256_set_m128(tx,tx);
    *y1 = gmx_mm256_set_m128(ty,ty);
    *z1 = gmx_mm256_set_m128(tz,tz);
    tx  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
    ty  = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
    tz  = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
    *x2 = gmx_mm256_set_m128(tx,tx);
    *y2 = gmx_mm256_set_m128(ty,ty);
    *z2 = gmx_mm256_set_m128(tz,tz);
    tx  = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
    ty  = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
    tz  = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
    *x3 = gmx_mm256_set_m128(tx,tx);
    *y3 = gmx_mm256_set_m128(ty,ty);
    *z3 = gmx_mm256_set_m128(tz,tz);
    tx  = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(1,1));
    ty  = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(0,0));
    tz  = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(1,1));
    *x4 = gmx_mm256_set_m128(tx,tx);
    *y4 = gmx_mm256_set_m128(ty,ty);
    *z4 = gmx_mm256_set_m128(tz,tz);
}
static gmx_inline void
gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
{
    __m256d t1;

    t1 = _mm256_loadu_pd(p1);

    *x = t1;
    *y = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
    *z = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
}

static gmx_inline void
gmx_mm256_load_2rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
{
    __m256d t1,t2,t3;

    t1  = _mm256_loadu_pd(p1);                        /* x2 z1 | y1 x1 */
    t2  = _mm256_castpd128_pd256(_mm_loadu_pd(p1+4)); /* - - | z2 y2 */

    t3  = gmx_mm256_unpack128hi_pd(t1,t1);

    *x1 = t1;
    *y2 = t2;
    *z1 = t3;
    *y1 = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
    *z2 = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
    *x2 = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
}

static gmx_inline void
gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m256d t1,t2,t3,t4;

    t1  = _mm256_loadu_pd(p1);
    t3  = _mm256_loadu_pd(p1+4);

    t2  = gmx_mm256_unpack128hi_pd(t1,t1);
    t4  = gmx_mm256_unpack128hi_pd(t3,t3);

    *x1 = t1;
    *y2 = t3;
    *z1 = t2;
    *x3 = t4;
    *y1 = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
    *z2 = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
    *x2 = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
    *y3 = _mm256_permute_pd(t4,_GMX_MM_PERMUTE256D(0,1,0,1));
    *z3 = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
}

static gmx_inline void
gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m256d t1,t2,t3,t4,t5,t6;

    t1  = _mm256_loadu_pd(p1);
    t2  = _mm256_loadu_pd(p1+4);
    t3  = _mm256_loadu_pd(p1+8);

    t4  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
    t5  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));
    t6  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));

    *x1 = t1;
    *y2 = t2;
    *z3 = t3;
    *z1 = t4;
    *x3 = t5;
    *y4 = t6;
    *y1 = _mm256_permute_pd(t1,_GMX_MM_PERMUTE256D(0,1,0,1));
    *z2 = _mm256_permute_pd(t2,_GMX_MM_PERMUTE256D(0,1,0,1));
    *x4 = _mm256_permute_pd(t3,_GMX_MM_PERMUTE256D(0,1,0,1));
    *x2 = _mm256_permute_pd(t4,_GMX_MM_PERMUTE256D(0,1,0,1));
    *y3 = _mm256_permute_pd(t5,_GMX_MM_PERMUTE256D(0,1,0,1));
    *z4 = _mm256_permute_pd(t6,_GMX_MM_PERMUTE256D(0,1,0,1));
}
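
/* For the *_1ptr_swizzle loaders above only the lowest lane of each output is
 * significant; they are typically used together with the matching 1-pointer
 * store/decrement routines when a kernel processes a single j particle.
 */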
static gmx_inline void
gmx_mm256_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
{
    __m256d tA,tB,tC;

    tA  = _mm256_loadu_pd(ptrA);     /* - z1 | y1 x1 */
    tB  = _mm256_loadu_pd(ptrB);     /* - z2 | y2 x2 */

    tC  = _mm256_unpacklo_pd(tA,tB); /* z2 z1 | x2 x1 */

    *x1 = tC;
    *y1 = _mm256_unpackhi_pd(tA,tB);
    *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(tC,0x1));
}

static gmx_inline void
gmx_mm256_load_2rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
{
    __m256d t1,t2,t3,t4,t5;

    t1  = _mm256_loadu_pd(ptrA);                        /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);                        /* x2b z1b | y1b x1b */
    t3  = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4)); /* - - | z2a y2a */
    t4  = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4)); /* - - | z2b y2b */

    t5  = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
    t1  = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
    *y2 = _mm256_unpacklo_pd(t3,t4); /* - - | y2b y2a */
    *z2 = _mm256_unpackhi_pd(t3,t4); /* - - | z2b z2a */

    *x1 = t5;
    *y1 = t1;
    *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));
    *x2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
}

static gmx_inline void
gmx_mm256_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m256d t1,t2,t3,t4,t5,t6,t7;

    t1  = _mm256_loadu_pd(ptrA);                       /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);                       /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrA+4);                     /* y3a x3a | z2a y2a */
    t4  = _mm256_loadu_pd(ptrB+4);                     /* y3b x3b | z2b y2b */
    t5  = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /* - - | - z3a */
    t6  = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /* - - | - z3b */

    t7  = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
    t1  = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */

    t2  = _mm256_unpacklo_pd(t3,t4); /* x3b x3a | y2b y2a */
    t3  = _mm256_unpackhi_pd(t3,t4); /* y3b y3a | z2b z2a */

    *z3 = _mm256_unpacklo_pd(t5,t6); /* - - | z3b z3a */

    *x1 = t7;
    *y1 = t1;
    *y2 = t2;
    *z2 = t3;
    *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));
    *x2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
    *x3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));
    *y3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
}

static gmx_inline void
gmx_mm256_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m256d t1,t2,t3,t4,t5,t6,t7;

    t1  = _mm256_loadu_pd(ptrA);   /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);   /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrA+4); /* y3a x3a | z2a y2a */
    t4  = _mm256_loadu_pd(ptrB+4); /* y3b x3b | z2b y2b */
    t5  = _mm256_loadu_pd(ptrA+8); /* z4a y4a | x4a z3a */
    t6  = _mm256_loadu_pd(ptrB+8); /* z4b y4b | x4b z3b */

    t7  = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
    t1  = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */

    t2  = _mm256_unpacklo_pd(t3,t4); /* x3b x3a | y2b y2a */
    t3  = _mm256_unpackhi_pd(t3,t4); /* y3b y3a | z2b z2a */

    t4  = _mm256_unpacklo_pd(t5,t6); /* y4b y4a | z3b z3a */
    t5  = _mm256_unpackhi_pd(t5,t6); /* z4b z4a | x4b x4a */

    *x1 = t7;
    *y1 = t1;
    *y2 = t2;
    *z2 = t3;
    *z3 = t4;
    *x4 = t5;
    *z1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t7,0x1));
    *x2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1,0x1));
    *x3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2,0x1));
    *y3 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3,0x1));
    *y4 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t4,0x1));
    *z4 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t5,0x1));
}
static gmx_inline void
gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
{
    __m256d t1,t2,t3,t4,t5,t6;

    t1  = _mm256_loadu_pd(ptrA); /* - z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB); /* - z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC); /* - z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD); /* - z1d | y1d x1d */

    t5  = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
    t6  = _mm256_unpackhi_pd(t1,t2); /* - - | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3,t4); /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3,t4); /* - - | y1d y1c */

    *x1 = gmx_mm256_unpack128lo_pd(t5,t1);
    *y1 = gmx_mm256_unpack128lo_pd(t6,t2);
    *z1 = gmx_mm256_unpack128hi_pd(t5,t1);
}
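
/* The *_4ptr_swizzle loaders gather the same atom from four different j
 * particles (ptrA..ptrD) and transpose the data so that, for example, x1
 * ends up holding the four x coordinates in lanes 0..3, ready for SIMD
 * arithmetic against the broadcast i-particle coordinates.
 */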
static gmx_inline void
gmx_mm256_load_2rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2)
{
    __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;

    t1  = _mm256_loadu_pd(ptrA);                        /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);                        /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);                        /* x2c z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);                        /* x2d z1d | y1d x1d */
    t5  = _mm256_castpd128_pd256(_mm_loadu_pd(ptrA+4)); /* - - | z2a y2a */
    t6  = _mm256_castpd128_pd256(_mm_loadu_pd(ptrB+4)); /* - - | z2b y2b */
    t7  = _mm256_castpd128_pd256(_mm_loadu_pd(ptrC+4)); /* - - | z2c y2c */
    t8  = _mm256_castpd128_pd256(_mm_loadu_pd(ptrD+4)); /* - - | z2d y2d */

    t9  = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
    t10 = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3,t4); /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3,t4); /* x2d x2c | y1d y1c */
    t3  = _mm256_unpacklo_pd(t5,t6); /* - - | y2b y2a */
    t4  = _mm256_unpackhi_pd(t5,t6); /* - - | z2b z2a */
    t5  = _mm256_unpacklo_pd(t7,t8); /* - - | y2d y2c */
    t6  = _mm256_unpackhi_pd(t7,t8); /* - - | z2d z2c */

    *x1 = gmx_mm256_unpack128lo_pd(t9,t1);
    *y1 = gmx_mm256_unpack128lo_pd(t10,t2);
    *z1 = gmx_mm256_unpack128hi_pd(t9,t1);

    *x2 = gmx_mm256_unpack128hi_pd(t10,t2);
    *y2 = gmx_mm256_unpack128lo_pd(t3,t5);
    *z2 = gmx_mm256_unpack128lo_pd(t4,t6);
}

static gmx_inline void
gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;

    t1  = _mm256_loadu_pd(ptrA);                       /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);                       /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);                       /* x2c z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);                       /* x2d z1d | y1d x1d */
    t5  = _mm256_loadu_pd(ptrA+4);                     /* y3a x3a | z2a y2a */
    t6  = _mm256_loadu_pd(ptrB+4);                     /* y3b x3b | z2b y2b */
    t7  = _mm256_loadu_pd(ptrC+4);                     /* y3c x3c | z2c y2c */
    t8  = _mm256_loadu_pd(ptrD+4);                     /* y3d x3d | z2d y2d */
    t9  = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /* - - | - z3a */
    t10 = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /* - - | - z3b */
    t11 = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8)); /* - - | - z3c */
    t12 = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8)); /* - - | - z3d */

    t13 = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
    t14 = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3,t4); /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3,t4); /* x2d x2c | y1d y1c */

    t3  = _mm256_unpacklo_pd(t5,t6); /* x3b x3a | y2b y2a */
    t4  = _mm256_unpackhi_pd(t5,t6); /* y3b y3a | z2b z2a */
    t5  = _mm256_unpacklo_pd(t7,t8); /* x3d x3c | y2d y2c */
    t6  = _mm256_unpackhi_pd(t7,t8); /* y3d y3c | z2d z2c */

    t9  = _mm256_unpacklo_pd(t9,t10);  /* - - | z3b z3a */
    t11 = _mm256_unpacklo_pd(t11,t12); /* - - | z3d z3c */

    *x1 = gmx_mm256_unpack128lo_pd(t13,t1);
    *y1 = gmx_mm256_unpack128lo_pd(t14,t2);
    *z1 = gmx_mm256_unpack128hi_pd(t13,t1);
    *x2 = gmx_mm256_unpack128hi_pd(t14,t2);
    *y2 = gmx_mm256_unpack128lo_pd(t3,t5);
    *z2 = gmx_mm256_unpack128lo_pd(t4,t6);
    *x3 = gmx_mm256_unpack128hi_pd(t3,t5);
    *y3 = gmx_mm256_unpack128hi_pd(t4,t6);
    *z3 = gmx_mm256_unpack128lo_pd(t9,t11);
}

static gmx_inline void
gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;

    t1  = _mm256_loadu_pd(ptrA);   /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);   /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);   /* x2c z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);   /* x2d z1d | y1d x1d */
    t5  = _mm256_loadu_pd(ptrA+4); /* y3a x3a | z2a y2a */
    t6  = _mm256_loadu_pd(ptrB+4); /* y3b x3b | z2b y2b */
    t7  = _mm256_loadu_pd(ptrC+4); /* y3c x3c | z2c y2c */
    t8  = _mm256_loadu_pd(ptrD+4); /* y3d x3d | z2d y2d */
    t9  = _mm256_loadu_pd(ptrA+8); /* z4a y4a | x4a z3a */
    t10 = _mm256_loadu_pd(ptrB+8); /* z4b y4b | x4b z3b */
    t11 = _mm256_loadu_pd(ptrC+8); /* z4c y4c | x4c z3c */
    t12 = _mm256_loadu_pd(ptrD+8); /* z4d y4d | x4d z3d */

    t13 = _mm256_unpacklo_pd(t1,t2); /* z1b z1a | x1b x1a */
    t14 = _mm256_unpackhi_pd(t1,t2); /* x2b x2a | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3,t4); /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3,t4); /* x2d x2c | y1d y1c */

    t3  = _mm256_unpacklo_pd(t5,t6); /* x3b x3a | y2b y2a */
    t4  = _mm256_unpackhi_pd(t5,t6); /* y3b y3a | z2b z2a */
    t5  = _mm256_unpacklo_pd(t7,t8); /* x3d x3c | y2d y2c */
    t6  = _mm256_unpackhi_pd(t7,t8); /* y3d y3c | z2d z2c */

    t7  = _mm256_unpacklo_pd(t9,t10);  /* y4b y4a | z3b z3a */
    t8  = _mm256_unpackhi_pd(t9,t10);  /* z4b z4a | x4b x4a */
    t9  = _mm256_unpacklo_pd(t11,t12); /* y4d y4c | z3d z3c */
    t10 = _mm256_unpackhi_pd(t11,t12); /* z4d z4c | x4d x4c */

    *x1 = gmx_mm256_unpack128lo_pd(t13,t1);
    *y1 = gmx_mm256_unpack128lo_pd(t14,t2);
    *z1 = gmx_mm256_unpack128hi_pd(t13,t1);
    *x2 = gmx_mm256_unpack128hi_pd(t14,t2);
    *y2 = gmx_mm256_unpack128lo_pd(t3,t5);
    *z2 = gmx_mm256_unpack128lo_pd(t4,t6);
    *x3 = gmx_mm256_unpack128hi_pd(t3,t5);
    *y3 = gmx_mm256_unpack128hi_pd(t4,t6);
    *z3 = gmx_mm256_unpack128lo_pd(t7,t9);
    *x4 = gmx_mm256_unpack128lo_pd(t8,t10);
    *y4 = gmx_mm256_unpack128hi_pd(t7,t9);
    *z4 = gmx_mm256_unpack128hi_pd(t8,t10);
}
/* Routines to decrement rvec in memory, typically used for j particle force updates */
static gmx_inline void
gmx_mm256_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA, __m256d xyz)
{
    __m256d t1,t2;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_blend_pd(_mm256_setzero_pd(),xyz,0x7);
    t1 = _mm256_sub_pd(t1,t2);
    /* OK to add zeros and store more values here, since we only do a single store that cannot overlap */
    _mm256_storeu_pd(ptrA,t1);
}
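
/* Illustrative sketch (not from the original kernels): if fj holds the force
 * on one j particle packed as [fx fy fz -] in its lowest three elements, and
 * f points at that particle's x component in a flat x,y,z force array, then
 *
 *     gmx_mm256_decrement_1rvec_1ptr_noswizzle_pd(f, fj);
 *
 * subtracts all three components from memory with a single blend/sub/store.
 */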
static gmx_inline void
gmx_mm256_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
                                            __m256d xyz1, __m256d xyz2, __m256d xyz3)
{
    __m256d t1,t2,tA,tB;
    __m128d tC;

    tA   = _mm256_loadu_pd(ptrA);
    tB   = _mm256_loadu_pd(ptrA+4);
    tC   = _mm_load_sd(ptrA+8);

    /* xyz1: - z1 | y1 x1 */
    /* xyz2: - z2 | y2 x2 */
    /* xyz3: - z3 | y3 x3 */

    xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z2 - | x2 y2 */
    t1   = _mm256_permute2f128_pd(xyz2,xyz2,0x21);               /* x2 y2 | z2 - */
    xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0));  /* x2 z1 | y1 x1 */
    xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0));  /* - - | z2 y2 */
    t2   = _mm256_permute2f128_pd(xyz3,xyz3,0x21);               /* y3 x3 | - z3 */
    xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0));  /* y3 x3 | z2 y2 */

    tA   = _mm256_sub_pd(tA,xyz1);
    tB   = _mm256_sub_pd(tB,xyz2);
    tC   = _mm_sub_sd(tC,_mm256_castpd256_pd128(t2));

    _mm256_storeu_pd(ptrA,tA);
    _mm256_storeu_pd(ptrA+4,tB);
    _mm_store_sd(ptrA+8,tC);
}

static gmx_inline void
gmx_mm256_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
                                            __m256d xyz1, __m256d xyz2, __m256d xyz3, __m256d xyz4)
{
    __m256d t1,t2,t3,tA,tB,tC;

    tA   = _mm256_loadu_pd(ptrA);
    tB   = _mm256_loadu_pd(ptrA+4);
    tC   = _mm256_loadu_pd(ptrA+8);

    /* xyz1: - z1 | y1 x1 */
    /* xyz2: - z2 | y2 x2 */
    /* xyz3: - z3 | y3 x3 */
    /* xyz4: - z4 | y4 x4 */

    xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z2 - | x2 y2 */
    t1   = _mm256_permute2f128_pd(xyz2,xyz2,0x21);               /* x2 y2 | z2 - */
    xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0));  /* x2 z1 | y1 x1 */
    xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0));  /* - - | z2 y2 */
    t2   = _mm256_permute2f128_pd(xyz3,xyz3,0x21);               /* y3 x3 | - z3 */
    xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0));  /* y3 x3 | z2 y2 */
    xyz4 = _mm256_permute_pd(xyz4,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z4 - | x4 y4 */
    t3   = _mm256_permute2f128_pd(xyz4,xyz4,0x21);               /* x4 y4 | z4 - */
    t3   = _mm256_blend_pd(t3,xyz4,_GMX_MM_BLEND256D(1,0,1,0));  /* z4 y4 | x4 - */
    xyz4 = _mm256_blend_pd(t3,t2,_GMX_MM_BLEND256D(0,0,0,1));    /* z4 y4 | x4 z3 */

    tA   = _mm256_sub_pd(tA,xyz1);
    tB   = _mm256_sub_pd(tB,xyz2);
    tC   = _mm256_sub_pd(tC,xyz4);

    _mm256_storeu_pd(ptrA,tA);
    _mm256_storeu_pd(ptrA+4,tB);
    _mm256_storeu_pd(ptrA+8,tC);
}
static gmx_inline void
gmx_mm256_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                          __m256d x1, __m256d y1, __m256d z1)
{
    __m128d t1,t2,t3;

    /* Decrement means memory minus force, so the loaded value is the minuend */
    t1 = _mm_sub_sd(_mm_load_sd(ptrA),_mm256_castpd256_pd128(x1));
    t2 = _mm_sub_sd(_mm_load_sd(ptrA+1),_mm256_castpd256_pd128(y1));
    t3 = _mm_sub_sd(_mm_load_sd(ptrA+2),_mm256_castpd256_pd128(z1));
    _mm_store_sd(ptrA,t1);
    _mm_store_sd(ptrA+1,t2);
    _mm_store_sd(ptrA+2,t3);
}

static gmx_inline void
gmx_mm256_decrement_2rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2)
{
    __m256d t1;
    __m128d tA;

    t1 = _mm256_loadu_pd(ptrA);
    tA = _mm_loadu_pd(ptrA+4);

    x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
    z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
    y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */

    x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */

    t1 = _mm256_sub_pd(t1,x1);
    tA = _mm_sub_pd(tA,_mm256_castpd256_pd128(y2));

    _mm256_storeu_pd(ptrA,t1);
    _mm_storeu_pd(ptrA+4,tA);
}

static gmx_inline void
gmx_mm256_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3)
{
    __m256d t1,t2;
    __m128d tA;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_loadu_pd(ptrA+4);
    tA = _mm_load_sd(ptrA+8);

    x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
    z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
    y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
    x3 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */

    x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
    y2 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
    t1 = _mm256_sub_pd(t1,x1);
    t2 = _mm256_sub_pd(t2,y2);
    tA = _mm_sub_sd(tA,_mm256_castpd256_pd128(z3));

    _mm256_storeu_pd(ptrA,t1);
    _mm256_storeu_pd(ptrA+4,t2);
    _mm_store_sd(ptrA+8,tA);
}

static gmx_inline void
gmx_mm256_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3,
                                          __m256d x4, __m256d y4, __m256d z4)
{
    __m256d t1,t2,t3;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_loadu_pd(ptrA+4);
    t3 = _mm256_loadu_pd(ptrA+8);

    x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
    z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
    y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
    x3 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
    z3 = _mm256_unpacklo_pd(z3,x4); /* - - | x4a z3a */
    y4 = _mm256_unpacklo_pd(y4,z4); /* - - | z4a y4a */

    x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
    y2 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
    z3 = gmx_mm256_unpack128lo_pd(z3,y4); /* z4a y4a | x4a z3a */

    t1 = _mm256_sub_pd(t1,x1);
    t2 = _mm256_sub_pd(t2,y2);
    t3 = _mm256_sub_pd(t3,z3);

    _mm256_storeu_pd(ptrA,t1);
    _mm256_storeu_pd(ptrA+4,t2);
    _mm256_storeu_pd(ptrA+8,t3);
}
static gmx_inline void
gmx_mm256_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA,
                                          double * gmx_restrict ptrB,
                                          __m256d x1, __m256d y1, __m256d z1)
{
    __m256d t1,t2,t3,t4;
    __m256i mask;

    t3 = _mm256_loadu_pd(ptrA);
    t4 = _mm256_loadu_pd(ptrB);

    t1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
    t2 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */

    t1 = gmx_mm256_unpack128lo_pd(t1,z1); /* - z1a | y1a x1a */
    z1 = _mm256_permute_pd(z1,_GMX_MM_PERMUTE256D(1,1,1,1));
    t2 = gmx_mm256_unpack128lo_pd(t2,z1); /* z1b z1b | y1b x1b */

    /* Construct a mask without executing any data loads */
    mask = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
                                               _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));

    t3 = _mm256_sub_pd(t3,t1);
    t4 = _mm256_sub_pd(t4,t2);

    /* Careful with potentially overlapping stores, need to be masked */
    _mm256_maskstore_pd(ptrA,mask,t3);
    _mm256_maskstore_pd(ptrB,mask,t4);
}
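
/* The mask built above selects only the three lowest elements, so the 256-bit
 * maskstore never writes the fourth double. This matters because ptrA and ptrB
 * point into the same force array, and a full-width store from one pointer
 * could overlap the neighbouring atom that the other pointer updates.
 */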
static gmx_inline void
gmx_mm256_decrement_2rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2)
{
    __m256d t1,t2,t5;
    __m128d t3,t4;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_loadu_pd(ptrB);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrB+4);

    t5 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
    x1 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */

    y1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
    z1 = _mm256_unpackhi_pd(z1,x2); /* - - | x2b z1b */

    x2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
    y2 = _mm256_unpackhi_pd(y2,z2); /* - - | z2b y2b */

    z2 = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
    y1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */

    t1 = _mm256_sub_pd(t1,z2);
    t2 = _mm256_sub_pd(t2,y1);
    t3 = _mm_sub_pd(t3,_mm256_castpd256_pd128(x2));
    t4 = _mm_sub_pd(t4,_mm256_castpd256_pd128(y2));

    /* The 256-bit and 128-bit stores together cover exactly the six values
     * of each particle, so no masking is needed here */
    _mm256_storeu_pd(ptrA,t1);
    _mm256_storeu_pd(ptrB,t2);
    _mm_storeu_pd(ptrA+4,t3);
    _mm_storeu_pd(ptrB+4,t4);
}

static gmx_inline void
gmx_mm256_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3)
{
    __m256d t1,t2,t3,t4,t5,t6;
    __m128d tA,tB;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_loadu_pd(ptrB);
    t3 = _mm256_loadu_pd(ptrA+4);
    t4 = _mm256_loadu_pd(ptrB+4);
    tA = _mm_load_sd(ptrA+8);
    tB = _mm_load_sd(ptrB+8);

    t5 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
    x1 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */

    y1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
    z1 = _mm256_unpackhi_pd(z1,x2); /* - - | x2b z1b */

    x2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
    y2 = _mm256_unpackhi_pd(y2,z2); /* - - | z2b y2b */

    z2 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
    x3 = _mm256_unpackhi_pd(x3,y3); /* - - | y3b x3b */

    t6 = _mm256_permute_pd(z3,_GMX_MM_PERMUTE256D(1,1,1,1)); /* - - | - z3b */

    y3 = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
    y1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */

    t5 = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
    x1 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */

    t1 = _mm256_sub_pd(t1,y3);
    t2 = _mm256_sub_pd(t2,y1);
    t3 = _mm256_sub_pd(t3,t5);
    t4 = _mm256_sub_pd(t4,x1);
    tA = _mm_sub_pd(tA,_mm256_castpd256_pd128(z3));
    tB = _mm_sub_pd(tB,_mm256_castpd256_pd128(t6));

    _mm256_storeu_pd(ptrA,t1);
    _mm256_storeu_pd(ptrB,t2);
    _mm256_storeu_pd(ptrA+4,t3);
    _mm256_storeu_pd(ptrB+4,t4);
    _mm_store_sd(ptrA+8,tA);
    _mm_store_sd(ptrB+8,tB);
}

static gmx_inline void
gmx_mm256_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3,
                                          __m256d x4, __m256d y4, __m256d z4)
{
    __m256d t1,t2,t3,t4,t5,t6,t7;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_loadu_pd(ptrB);
    t3 = _mm256_loadu_pd(ptrA+4);
    t4 = _mm256_loadu_pd(ptrB+4);
    t5 = _mm256_loadu_pd(ptrA+8);
    t6 = _mm256_loadu_pd(ptrB+8);

    t7 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
    x1 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */

    y1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
    z1 = _mm256_unpackhi_pd(z1,x2); /* - - | x2b z1b */

    x2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
    y2 = _mm256_unpackhi_pd(y2,z2); /* - - | z2b y2b */

    z2 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
    x3 = _mm256_unpackhi_pd(x3,y3); /* - - | y3b x3b */

    y3 = _mm256_unpacklo_pd(z3,x4); /* - - | x4a z3a */
    z3 = _mm256_unpackhi_pd(z3,x4); /* - - | x4b z3b */
    x4 = _mm256_unpacklo_pd(y4,z4); /* - - | z4a y4a */
    y4 = _mm256_unpackhi_pd(y4,z4); /* - - | z4b y4b */

    z4 = gmx_mm256_unpack128lo_pd(t7,y1); /* x2a z1a | y1a x1a */
    y1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */

    t7 = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
    x1 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */

    x2 = gmx_mm256_unpack128lo_pd(y3,x4); /* z4a y4a | x4a z3a */
    y2 = gmx_mm256_unpack128lo_pd(z3,y4); /* z4b y4b | x4b z3b */

    t1 = _mm256_sub_pd(t1,z4);
    t2 = _mm256_sub_pd(t2,y1);
    t3 = _mm256_sub_pd(t3,t7);
    t4 = _mm256_sub_pd(t4,x1);
    t5 = _mm256_sub_pd(t5,x2);
    t6 = _mm256_sub_pd(t6,y2);

    _mm256_storeu_pd(ptrA,t1);
    _mm256_storeu_pd(ptrB,t2);
    _mm256_storeu_pd(ptrA+4,t3);
    _mm256_storeu_pd(ptrB+4,t4);
    _mm256_storeu_pd(ptrA+8,t5);
    _mm256_storeu_pd(ptrB+8,t6);
}
static gmx_inline void
gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1)
{
    __m256d t1,t2,tA,tB,tC,tD;
    __m256i mask;

    t1 = _mm256_unpacklo_pd(x1,y1);       /* y1c x1c | y1a x1a */
    t2 = _mm256_unpackhi_pd(x1,y1);       /* y1d x1d | y1b x1b */
    x1 = gmx_mm256_unpack128lo_pd(t1,z1); /* - z1a | y1a x1a */
    y1 = gmx_mm256_unpack128hi_pd(t1,z1); /* - z1c | y1c x1c */
    z1 = _mm256_permute_pd(z1,_GMX_MM_PERMUTE256D(0,1,0,1));
    t1 = gmx_mm256_unpack128lo_pd(t2,z1); /* - z1b | y1b x1b */
    z1 = gmx_mm256_unpack128hi_pd(t2,z1); /* - z1d | y1d x1d */

    /* Construct a mask without executing any data loads */
    mask = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
                                               _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));

    tA = _mm256_loadu_pd(ptrA);
    tB = _mm256_loadu_pd(ptrB);
    tC = _mm256_loadu_pd(ptrC);
    tD = _mm256_loadu_pd(ptrD);

    tA = _mm256_sub_pd(tA,x1);
    tB = _mm256_sub_pd(tB,t1);
    tC = _mm256_sub_pd(tC,y1);
    tD = _mm256_sub_pd(tD,z1);

    _mm256_maskstore_pd(ptrA,mask,tA);
    _mm256_maskstore_pd(ptrB,mask,tB);
    _mm256_maskstore_pd(ptrC,mask,tC);
    _mm256_maskstore_pd(ptrD,mask,tD);
}

static gmx_inline void
gmx_mm256_decrement_2rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2)
{
    __m256d t1,t2,t3,t4,t5,t6;
    __m128d tA,tB,tC,tD,tE,tF;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_loadu_pd(ptrB);
    t3 = _mm256_loadu_pd(ptrC);
    t4 = _mm256_loadu_pd(ptrD);
    tA = _mm_loadu_pd(ptrA+4);
    tB = _mm_loadu_pd(ptrB+4);
    tC = _mm_loadu_pd(ptrC+4);
    tD = _mm_loadu_pd(ptrD+4);

    t5 = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
    x1 = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
    y1 = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
    z1 = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
    x2 = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
    y2 = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */

    t6 = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
    z2 = gmx_mm256_unpack128hi_pd(t5,y1); /* x2c z1c | y1c x1c */
    t5 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
    y1 = gmx_mm256_unpack128hi_pd(x1,z1); /* x2d z1d | y1d x1d */

    tE = _mm256_extractf128_pd(x2,0x1); /* z2c y2c */
    tF = _mm256_extractf128_pd(y2,0x1); /* z2d y2d */

    t1 = _mm256_sub_pd(t1,t6);
    t2 = _mm256_sub_pd(t2,t5);
    t3 = _mm256_sub_pd(t3,z2);
    t4 = _mm256_sub_pd(t4,y1);
    tA = _mm_sub_pd(tA,_mm256_castpd256_pd128(x2));
    tB = _mm_sub_pd(tB,_mm256_castpd256_pd128(y2));
    tC = _mm_sub_pd(tC,tE);
    tD = _mm_sub_pd(tD,tF);

    _mm256_storeu_pd(ptrA,t1);
    _mm256_storeu_pd(ptrB,t2);
    _mm256_storeu_pd(ptrC,t3);
    _mm256_storeu_pd(ptrD,t4);
    _mm_storeu_pd(ptrA+4,tA);
    _mm_storeu_pd(ptrB+4,tB);
    _mm_storeu_pd(ptrC+4,tC);
    _mm_storeu_pd(ptrD+4,tD);
}

static gmx_inline void
gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3)
{
    __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
    __m128d tA,tB,tC,tD,tE;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_loadu_pd(ptrB);
    t3 = _mm256_loadu_pd(ptrC);
    t4 = _mm256_loadu_pd(ptrD);
    t5 = _mm256_loadu_pd(ptrA+4);
    t6 = _mm256_loadu_pd(ptrB+4);
    t7 = _mm256_loadu_pd(ptrC+4);
    t8 = _mm256_loadu_pd(ptrD+4);
    tA = _mm_load_sd(ptrA+8);
    tB = _mm_load_sd(ptrB+8);
    tC = _mm_load_sd(ptrC+8);
    tD = _mm_load_sd(ptrD+8);

    t9  = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
    x1  = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */

    y1  = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
    z1  = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */

    x2  = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
    y2  = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */

    z2  = _mm256_unpacklo_pd(x3,y3); /* y3c x3c | y3a x3a */
    x3  = _mm256_unpackhi_pd(x3,y3); /* y3d x3d | y3b x3b */

    t10 = gmx_mm256_unpack128lo_pd(t9,y1); /* x2a z1a | y1a x1a */
    y3  = gmx_mm256_unpack128hi_pd(t9,y1); /* x2c z1c | y1c x1c */

    t9  = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
    y1  = gmx_mm256_unpack128hi_pd(x1,z1); /* x2d z1d | y1d x1d */

    x1  = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
    z1  = gmx_mm256_unpack128hi_pd(x2,z2); /* y3c x3c | z2c y2c */

    x2  = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
    z2  = gmx_mm256_unpack128hi_pd(y2,x3); /* y3d x3d | z2d y2d */

    t1  = _mm256_sub_pd(t1,t10);
    t2  = _mm256_sub_pd(t2,t9);
    t3  = _mm256_sub_pd(t3,y3);
    t4  = _mm256_sub_pd(t4,y1);
    t5  = _mm256_sub_pd(t5,x1);
    t6  = _mm256_sub_pd(t6,x2);
    t7  = _mm256_sub_pd(t7,z1);
    t8  = _mm256_sub_pd(t8,z2);

    tA  = _mm_sub_sd(tA,_mm256_castpd256_pd128(z3));
    tB  = _mm_sub_sd(tB,_mm_permute_pd(_mm256_castpd256_pd128(z3),_GMX_MM_PERMUTE128D(1,1)));
    tE  = _mm256_extractf128_pd(z3,0x1);
    tC  = _mm_sub_sd(tC,tE);
    tD  = _mm_sub_sd(tD,_mm_permute_pd(tE,_GMX_MM_PERMUTE128D(1,1)));

    /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
    _mm256_storeu_pd(ptrA,t1);
    _mm256_storeu_pd(ptrB,t2);
    _mm256_storeu_pd(ptrC,t3);
    _mm256_storeu_pd(ptrD,t4);
    _mm256_storeu_pd(ptrA+4,t5);
    _mm256_storeu_pd(ptrB+4,t6);
    _mm256_storeu_pd(ptrC+4,t7);
    _mm256_storeu_pd(ptrD+4,t8);
    _mm_store_sd(ptrA+8,tA);
    _mm_store_sd(ptrB+8,tB);
    _mm_store_sd(ptrC+8,tC);
    _mm_store_sd(ptrD+8,tD);
}
static gmx_inline void
gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3,
                                          __m256d x4, __m256d y4, __m256d z4)
{
    __m256d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;

    t1  = _mm256_loadu_pd(ptrA);
    t2  = _mm256_loadu_pd(ptrB);
    t3  = _mm256_loadu_pd(ptrC);
    t4  = _mm256_loadu_pd(ptrD);
    t5  = _mm256_loadu_pd(ptrA+4);
    t6  = _mm256_loadu_pd(ptrB+4);
    t7  = _mm256_loadu_pd(ptrC+4);
    t8  = _mm256_loadu_pd(ptrD+4);
    t9  = _mm256_loadu_pd(ptrA+8);
    t10 = _mm256_loadu_pd(ptrB+8);
    t11 = _mm256_loadu_pd(ptrC+8);
    t12 = _mm256_loadu_pd(ptrD+8);

    t13 = _mm256_unpacklo_pd(x1,y1); /* y1c x1c | y1a x1a */
    x1  = _mm256_unpackhi_pd(x1,y1); /* y1d x1d | y1b x1b */
    y1  = _mm256_unpacklo_pd(z1,x2); /* x2c z1c | x2a z1a */
    z1  = _mm256_unpackhi_pd(z1,x2); /* x2d z1d | x2b z1b */
    x2  = _mm256_unpacklo_pd(y2,z2); /* z2c y2c | z2a y2a */
    y2  = _mm256_unpackhi_pd(y2,z2); /* z2d y2d | z2b y2b */
    z2  = _mm256_unpacklo_pd(x3,y3); /* y3c x3c | y3a x3a */
    x3  = _mm256_unpackhi_pd(x3,y3); /* y3d x3d | y3b x3b */
    y3  = _mm256_unpacklo_pd(z3,x4); /* x4c z3c | x4a z3a */
    z3  = _mm256_unpackhi_pd(z3,x4); /* x4d z3d | x4b z3b */
    x4  = _mm256_unpacklo_pd(y4,z4); /* z4c y4c | z4a y4a */
    y4  = _mm256_unpackhi_pd(y4,z4); /* z4d y4d | z4b y4b */

    z4  = gmx_mm256_unpack128lo_pd(t13,y1); /* x2a z1a | y1a x1a */
    t13 = gmx_mm256_unpack128hi_pd(t13,y1); /* x2c z1c | y1c x1c */
    y1  = gmx_mm256_unpack128lo_pd(x1,z1);  /* x2b z1b | y1b x1b */
    x1  = gmx_mm256_unpack128hi_pd(x1,z1);  /* x2d z1d | y1d x1d */
    z1  = gmx_mm256_unpack128lo_pd(x2,z2);  /* y3a x3a | z2a y2a */
    x2  = gmx_mm256_unpack128hi_pd(x2,z2);  /* y3c x3c | z2c y2c */
    z2  = gmx_mm256_unpack128lo_pd(y2,x3);  /* y3b x3b | z2b y2b */
    y2  = gmx_mm256_unpack128hi_pd(y2,x3);  /* y3d x3d | z2d y2d */
    x3  = gmx_mm256_unpack128lo_pd(y3,x4);  /* z4a y4a | x4a z3a */
    y3  = gmx_mm256_unpack128hi_pd(y3,x4);  /* z4c y4c | x4c z3c */
    x4  = gmx_mm256_unpack128lo_pd(z3,y4);  /* z4b y4b | x4b z3b */
    z3  = gmx_mm256_unpack128hi_pd(z3,y4);  /* z4d y4d | x4d z3d */

    t1  = _mm256_sub_pd(t1,z4);
    t2  = _mm256_sub_pd(t2,y1);
    t3  = _mm256_sub_pd(t3,t13);
    t4  = _mm256_sub_pd(t4,x1);
    t5  = _mm256_sub_pd(t5,z1);
    t6  = _mm256_sub_pd(t6,z2);
    t7  = _mm256_sub_pd(t7,x2);
    t8  = _mm256_sub_pd(t8,y2);
    t9  = _mm256_sub_pd(t9,x3);
    t10 = _mm256_sub_pd(t10,x4);
    t11 = _mm256_sub_pd(t11,y3);
    t12 = _mm256_sub_pd(t12,z3);

    /* Here the twelve values per particle are covered by three full 256-bit stores; no overlap can happen */
    _mm256_storeu_pd(ptrA,t1);
    _mm256_storeu_pd(ptrB,t2);
    _mm256_storeu_pd(ptrC,t3);
    _mm256_storeu_pd(ptrD,t4);
    _mm256_storeu_pd(ptrA+4,t5);
    _mm256_storeu_pd(ptrB+4,t6);
    _mm256_storeu_pd(ptrC+4,t7);
    _mm256_storeu_pd(ptrD+4,t8);
    _mm256_storeu_pd(ptrA+8,t9);
    _mm256_storeu_pd(ptrB+8,t10);
    _mm256_storeu_pd(ptrC+8,t11);
    _mm256_storeu_pd(ptrD+8,t12);
}
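
/* The update_iforce routines below reduce the accumulated i-particle forces:
 * the four lanes are summed with horizontal adds, the result is added to the
 * force array at fptr, and the same values are added to the shift force at
 * fshiftptr (used for the virial).
 */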
static gmx_inline void
gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1,t2;
    __m128d tA,tB;

    fix1 = _mm256_hadd_pd(fix1,fiy1);
    fiz1 = _mm256_hadd_pd(fiz1,_mm256_setzero_pd());

    /* Add across the two lanes */
    tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1));
    tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1));

    fix1 = gmx_mm256_set_m128(tB,tA); /* 0 fiz fiy fix */

    t1   = _mm256_loadu_pd(fptr);
    t2   = _mm256_loadu_pd(fshiftptr);

    t1   = _mm256_add_pd(t1,fix1);
    t2   = _mm256_add_pd(t2,fix1);

    _mm256_storeu_pd(fptr,t1);
    _mm256_storeu_pd(fshiftptr,t2);
}

static gmx_inline void
gmx_mm256_update_iforce_2atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1,t2;
    __m128d tA,tB,tC,tD,tE;

    fix1 = _mm256_hadd_pd(fix1,fiy1);
    fiz1 = _mm256_hadd_pd(fiz1,fix2);
    fiy2 = _mm256_hadd_pd(fiy2,fiz2);

    /* Add across the two lanes by swapping and adding back */
    tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1),_mm256_extractf128_pd(fix1,0x1)); /* fiy1 fix1 */
    tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1),_mm256_extractf128_pd(fiz1,0x1)); /* fix2 fiz1 */
    tC   = _mm_add_pd(_mm256_castpd256_pd128(fiy2),_mm256_extractf128_pd(fiy2,0x1)); /* fiz2 fiy2 */

    t1   = gmx_mm256_set_m128(tB,tA); /* fix2 fiz1 | fiy1 fix1 */

    t2   = _mm256_loadu_pd(fptr);
    tD   = _mm_loadu_pd(fptr+4);

    t2   = _mm256_add_pd(t2,t1);
    tD   = _mm_add_pd(tD,tC);
    _mm256_storeu_pd(fptr,t2);
    _mm_storeu_pd(fptr+4,tD);

    /* Add up shift force */
    /* t1: fix2 fiz1 | fiy1 fix1 */
    tA   = _mm256_extractf128_pd(t1,0x1);                /* fix2 fiz1 */
    tB   = _mm_shuffle_pd(tA,tC,_MM_SHUFFLE2(0,1));      /* fiy2 fix2 */
    tC   = _mm_permute_pd(tC,_GMX_MM_PERMUTE128D(1,1));  /* - fiz2 */

    tB   = _mm_add_pd(tB,_mm256_castpd256_pd128(t1));
    tC   = _mm_add_sd(tC,tA);

    tD   = _mm_loadu_pd(fshiftptr);
    tE   = _mm_load_sd(fshiftptr+2);

    tD   = _mm_add_pd(tD,tB);
    tE   = _mm_add_pd(tE,tC);

    _mm_storeu_pd(fshiftptr,tD);
    _mm_store_sd(fshiftptr+2,tE);
}

static gmx_inline void
gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1,t2,t3,t4;
    __m128d tz3,tA,tB,tC,tD;

    fix1 = _mm256_hadd_pd(fix1,fiy1);                /* Y1c-d X1c-d | Y1a-b X1a-b */
    fiz1 = _mm256_hadd_pd(fiz1,fix2);                /* X2c-d Z1c-d | X2a-b Z1a-b */
    fiy2 = _mm256_hadd_pd(fiy2,fiz2);                /* Z2c-d Y2c-d | Z2a-b Y2a-b */
    fix3 = _mm256_hadd_pd(fix3,fiy3);                /* Y3c-d X3c-d | Y3a-b X3a-b */
    fiz3 = _mm256_hadd_pd(fiz3,_mm256_setzero_pd()); /* 0 Z3c-d | 0 Z3a-b */

    /* Add across the two lanes by swapping and adding back */
    t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
    t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
    t1   = _mm256_add_pd(t1,t2);                /* x2 z1 | y1 x1 */

    t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
    t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
    t3   = _mm256_add_pd(t3,t4);                /* y3 x3 | z2 y2 */

    tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3),_mm256_extractf128_pd(fiz3,0x1)); /* 0 z3 */

    t2   = _mm256_loadu_pd(fptr);
    t4   = _mm256_loadu_pd(fptr+4);
    tA   = _mm_load_sd(fptr+8);

    t2   = _mm256_add_pd(t2,t1);
    t4   = _mm256_add_pd(t4,t3);
    tA   = _mm_add_sd(tA,tz3);

    _mm256_storeu_pd(fptr,t2);
    _mm256_storeu_pd(fptr+4,t4);
    _mm_store_sd(fptr+8,tA);

    /* Add up shift force */
    /* t1: x2 z1 | y1 x1 */
    /* t3: y3 x3 | z2 y2 */
    tB   = _mm256_extractf128_pd(t1,0x1); /* x2 z1 */
    tC   = _mm256_extractf128_pd(t3,0x1); /* y3 x3 */
    tz3  = _mm_add_sd(tz3,tB);            /* 0 z1+z3 */
    tD   = _mm_permute_pd(_mm256_castpd256_pd128(t3),_GMX_MM_PERMUTE128D(1,1));
    tz3  = _mm_add_sd(tz3,tD);            /* - z */

    tC   = _mm_add_pd(tC,_mm256_castpd256_pd128(t1)); /* y1+y3 x1+x3 */

    tD   = _mm_shuffle_pd(tB,_mm256_castpd256_pd128(t3),_MM_SHUFFLE2(0,1)); /* y2 x2 */
    tC   = _mm_add_pd(tC,tD); /* y x */

    tA   = _mm_loadu_pd(fshiftptr);
    tB   = _mm_load_sd(fshiftptr+2);
    tA   = _mm_add_pd(tA,tC);
    tB   = _mm_add_sd(tB,tz3);
    _mm_storeu_pd(fshiftptr,tA);
    _mm_store_sd(fshiftptr+2,tB);
}

static gmx_inline void
gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
                                         __m256d fix4, __m256d fiy4, __m256d fiz4,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1,t2,t3,t4,t5,t6;
    __m128d tA,tB,tC,tD;

    fix1 = _mm256_hadd_pd(fix1,fiy1); /* Y1c-d X1c-d | Y1a-b X1a-b */
    fiz1 = _mm256_hadd_pd(fiz1,fix2); /* X2c-d Z1c-d | X2a-b Z1a-b */
    fiy2 = _mm256_hadd_pd(fiy2,fiz2); /* Z2c-d Y2c-d | Z2a-b Y2a-b */
    fix3 = _mm256_hadd_pd(fix3,fiy3); /* Y3c-d X3c-d | Y3a-b X3a-b */
    fiz3 = _mm256_hadd_pd(fiz3,fix4); /* X4c-d Z3c-d | X4a-b Z3a-b */
    fiy4 = _mm256_hadd_pd(fiy4,fiz4); /* Z4c-d Y4c-d | Z4a-b Y4a-b */

    /* Add across the two lanes by swapping and adding back */
    t1   = gmx_mm256_unpack128lo_pd(fix1,fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
    t2   = gmx_mm256_unpack128hi_pd(fix1,fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
    t1   = _mm256_add_pd(t1,t2);                /* x2 z1 | y1 x1 */

    t3   = gmx_mm256_unpack128lo_pd(fiy2,fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
    t4   = gmx_mm256_unpack128hi_pd(fiy2,fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
    t3   = _mm256_add_pd(t3,t4);                /* y3 x3 | z2 y2 */

    t5   = gmx_mm256_unpack128lo_pd(fiz3,fiy4); /* Z4a-b Y4a-b | X4a-b Z3a-b */
    t6   = gmx_mm256_unpack128hi_pd(fiz3,fiy4); /* Z4c-d Y4c-d | X4c-d Z3c-d */
    t5   = _mm256_add_pd(t5,t6);                /* z4 y4 | x4 z3 */

    t2   = _mm256_loadu_pd(fptr);
    t4   = _mm256_loadu_pd(fptr+4);
    t6   = _mm256_loadu_pd(fptr+8);

    t2   = _mm256_add_pd(t2,t1);
    t4   = _mm256_add_pd(t4,t3);
    t6   = _mm256_add_pd(t6,t5);

    _mm256_storeu_pd(fptr,t2);
    _mm256_storeu_pd(fptr+4,t4);
    _mm256_storeu_pd(fptr+8,t6);

    /* Add up shift force */
    /* t1: x2 z1 | y1 x1 */
    /* t3: y3 x3 | z2 y2 */
    /* t5: z4 y4 | x4 z3 */
    tA   = _mm256_extractf128_pd(t1,0x1); /* x2 z1 */
    tB   = _mm256_extractf128_pd(t3,0x1); /* y3 x3 */
    tC   = _mm256_extractf128_pd(t5,0x1); /* z4 y4 */

    tB   = _mm_add_pd(tB,_mm256_castpd256_pd128(t1)); /* y1+y3 x1+x3 */
    tA   = _mm_add_pd(tA,_mm256_castpd256_pd128(t5)); /* x2+x4 z1+z3 */
    tC   = _mm_add_pd(tC,_mm256_castpd256_pd128(t3)); /* z4+z2 y4+y2 */

    tD   = _mm_shuffle_pd(tA,tC,_MM_SHUFFLE2(0,1));      /* y4+y2 x2+x4 */
    tB   = _mm_add_pd(tB,tD);                            /* y x */
    tC   = _mm_permute_pd(tC,_GMX_MM_PERMUTE128D(1,1));  /* - z4+z2 */
    tC   = _mm_add_sd(tC,tA);                            /* - z */

    tA   = _mm_loadu_pd(fshiftptr);
    tD   = _mm_load_sd(fshiftptr+2);
    tA   = _mm_add_pd(tA,tB);
    tD   = _mm_add_sd(tD,tC);
    _mm_storeu_pd(fshiftptr,tA);
    _mm_store_sd(fshiftptr+2,tD);
}
static gmx_inline void
gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
{
    __m128d t1;

    pot1 = _mm256_hadd_pd(pot1,pot1);

    t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1),_mm256_extractf128_pd(pot1,0x1));

    _mm_store_sd(ptrA,_mm_add_sd(_mm_load_sd(ptrA),t1));
}
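
/* Illustrative sketch (not from the original kernels; names are placeholders):
 * at the end of an interaction loop the four per-lane energy accumulators can
 * be reduced and added to the output buffers with one call each, e.g.
 *
 *     gmx_mm256_update_1pot_pd(velecsum, pot_coul_ptr);
 *     gmx_mm256_update_1pot_pd(vvdwsum,  pot_vdw_ptr);
 *
 * Only the summed scalar is written back through each pointer.
 */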
static gmx_inline void
gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
                         __m256d pot2, double * gmx_restrict ptrB)
{
    __m128d t1,t2;

    pot1 = _mm256_hadd_pd(pot1,pot2);

    t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1),_mm256_extractf128_pd(pot1,0x1));

    t2   = _mm_permute_pd(t1,_GMX_MM_PERMUTE128D(1,1));
    _mm_store_sd(ptrA,_mm_add_sd(_mm_load_sd(ptrA),t1));
    _mm_store_sd(ptrB,_mm_add_sd(_mm_load_sd(ptrB),t2));
}

static gmx_inline void
gmx_mm256_update_4pot_pd(__m256d pot1, double * gmx_restrict ptrA,
                         __m256d pot2, double * gmx_restrict ptrB,
                         __m256d pot3, double * gmx_restrict ptrC,
                         __m256d pot4, double * gmx_restrict ptrD)
{
    __m256d t1,t2,t3,t4;
    __m128d tA,tB,tC,tD,tE,tF,tG,tH;

    tA   = _mm_load_sd(ptrA);
    tB   = _mm_load_sd(ptrB);
    tC   = _mm_load_sd(ptrC);
    tD   = _mm_load_sd(ptrD);

    /* do a transpose */
    t1   = _mm256_unpacklo_pd(pot1,pot2);      /* p2c p1c | p2a p1a */
    t2   = _mm256_unpackhi_pd(pot1,pot2);      /* p2d p1d | p2b p1b */
    t3   = _mm256_unpacklo_pd(pot3,pot4);      /* p4c p3c | p4a p3a */
    t4   = _mm256_unpackhi_pd(pot3,pot4);      /* p4d p3d | p4b p3b */
    pot1 = _mm256_permute2f128_pd(t1,t3,0x20); /* p4a p3a | p2a p1a */
    pot2 = _mm256_permute2f128_pd(t2,t4,0x20); /* p4b p3b | p2b p1b */
    pot3 = _mm256_permute2f128_pd(t1,t3,0x31); /* p4c p3c | p2c p1c */
    pot4 = _mm256_permute2f128_pd(t2,t4,0x31); /* p4d p3d | p2d p1d */

    pot1 = _mm256_add_pd(pot1,pot2);
    pot3 = _mm256_add_pd(pot3,pot4);
    pot1 = _mm256_add_pd(pot1,pot3); /* Sum in the four elements */

    tE   = _mm256_castpd256_pd128(pot1);
    tF   = _mm_permute_pd(tE,_GMX_MM_PERMUTE128D(1,1));
    tG   = _mm256_extractf128_pd(pot1,0x1);
    tH   = _mm_permute_pd(tG,_GMX_MM_PERMUTE128D(1,1));

    tA   = _mm_add_sd(tA,tE);
    tB   = _mm_add_sd(tB,tF);
    tC   = _mm_add_sd(tC,tG);
    tD   = _mm_add_sd(tD,tH);

    _mm_store_sd(ptrA,tA);
    _mm_store_sd(ptrB,tB);
    _mm_store_sd(ptrC,tC);
    _mm_store_sd(ptrD,tD);
}
#endif /* _kernelutil_x86_avx_256_double_h_ */