/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_256_double_h_
#define _kernelutil_x86_avx_256_double_h_

#include <math.h>

#include "gromacs/simd/general_x86_avx_256.h"

static gmx_inline int
gmx_mm256_any_lt(__m256d a, __m256d b)
{
    return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
}

static gmx_inline __m256d
gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
{
    return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
}

/* Normal sum of four ymm registers */
#define gmx_mm256_sum4_pd(t0, t1, t2, t3)  _mm256_add_pd(_mm256_add_pd(t0, t1), _mm256_add_pd(t2, t3))

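/* Usage sketch (illustrative only; vcoul0..vcoul3 are hypothetical partial
 * energy accumulators, not names used elsewhere in this file):
 *
 *     __m256d vtot = gmx_mm256_sum4_pd(vcoul0, vcoul1, vcoul2, vcoul3);
 *
 * The macro expands to two independent additions followed by a final one,
 * which keeps the dependency chain short.
 */
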
/* Load a single value from 1-4 places, merge into xmm register */
static gmx_inline __m256d
gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
}

static gmx_inline __m256d
gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                                const double * gmx_restrict ptrB)
{
    __m128d tA, tB;

    tA = _mm_load_sd(ptrA);
    tB = _mm_load_sd(ptrB);

    return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA, tB));
}

static gmx_inline __m256d
gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
{
    __m128d t1, t2;

    t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
    t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC), _mm_load_sd(ptrD));
    return gmx_mm256_set_m128d(t2, t1);
}

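/* Usage sketch (illustrative only; charge and jnrA..jnrD are hypothetical
 * kernel-local names): gather the charges of four j-atoms into one ymm
 * register, one value per 64-bit element:
 *
 *     __m256d jq = gmx_mm256_load_4real_swizzle_pd(charge+jnrA, charge+jnrB,
 *                                                  charge+jnrC, charge+jnrD);
 */
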
static gmx_inline void
gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
}

static gmx_inline void
gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
    __m256d t2;

    t2 = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
    _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
}

static gmx_inline void
gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                 double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
    __m256d t2;
    __m128d t3, t4;

    t2 = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
    t3 = _mm256_extractf128_pd(xmm1, 0x1);
    t4 = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
    _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
    _mm_store_sd(ptrC, t3);
    _mm_store_sd(ptrD, t4);
}

static gmx_inline void
gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
    __m128d t1;

    t1 = _mm256_castpd256_pd128(xmm1);
    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));

    _mm_store_sd(ptrA, t1);
}

static gmx_inline void
gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
    __m128d t1, t2;

    t1 = _mm256_castpd256_pd128(xmm1);
    t2 = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));

    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));
    t2 = _mm_add_sd(t2, _mm_load_sd(ptrB));

    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrB, t2);
}

static gmx_inline void
gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                     double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
    __m128d t1, t2, t3, t4;

    t1 = _mm256_castpd256_pd128(xmm1);
    t2 = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
    t3 = _mm256_extractf128_pd(xmm1, 0x1);
    t4 = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));

    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));
    t2 = _mm_add_sd(t2, _mm_load_sd(ptrB));
    t3 = _mm_add_sd(t3, _mm_load_sd(ptrC));
    t4 = _mm_add_sd(t4, _mm_load_sd(ptrD));

    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrB, t2);
    _mm_store_sd(ptrC, t3);
    _mm_store_sd(ptrD, t4);
}

static gmx_inline void
gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
{
    *c6  = _mm256_castpd128_pd256(_mm_load_sd(p1));
    *c12 = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
}

static gmx_inline void
gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
{
    __m128d t1, t2;

    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1, t2));
    *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1, t2));
}

static gmx_inline void
gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
                                const double * gmx_restrict p3, const double * gmx_restrict p4,
                                __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
{
    __m256d t1, t2;

    t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3), _mm_loadu_pd(p1)); /* c12c  c6c | c12a  c6a */
    t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4), _mm_loadu_pd(p2)); /* c12d  c6d | c12b  c6b */

    *c6  = _mm256_unpacklo_pd(t1, t2);                              /*  c6d  c6c |  c6b  c6a */
    *c12 = _mm256_unpackhi_pd(t1, t2);                              /* c12d c12c | c12b c12a */
}

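/* Usage sketch (illustrative only; vdwparam and vdwtypeA..vdwtypeD are
 * hypothetical names): the c6/c12 parameters of an atom pair are stored
 * adjacently, so one call fills both registers for four j-atoms:
 *
 *     __m256d c6, c12;
 *     gmx_mm256_load_4pair_swizzle_pd(vdwparam+vdwtypeA, vdwparam+vdwtypeB,
 *                                     vdwparam+vdwtypeC, vdwparam+vdwtypeD,
 *                                     &c6, &c12);
 */
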
static gmx_inline void
gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict      x1,
                                            __m256d * gmx_restrict      y1,
                                            __m256d * gmx_restrict      z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz, tx, ty, tz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    tx      = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    ty      = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    tz      = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));

    *x1     = gmx_mm256_set_m128d(tx, tx);
    *y1     = gmx_mm256_set_m128d(ty, ty);
    *z1     = gmx_mm256_set_m128d(tz, tz);
}

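/* Usage sketch (illustrative only; shiftvec, x and the offset variables are
 * hypothetical): broadcast the PBC-shifted coordinates of a single-site
 * i-particle once per outer-loop iteration:
 *
 *     __m256d ix1, iy1, iz1;
 *     gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,
 *                                                 x+i_coord_offset,
 *                                                 &ix1, &iy1, &iz1);
 */
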
static gmx_inline void
gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz, tx, ty, tz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    tx  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x1 = gmx_mm256_set_m128d(tx, tx);
    *y1 = gmx_mm256_set_m128d(ty, ty);
    *z1 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x2 = gmx_mm256_set_m128d(tx, tx);
    *y2 = gmx_mm256_set_m128d(ty, ty);
    *z2 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x3 = gmx_mm256_set_m128d(tx, tx);
    *y3 = gmx_mm256_set_m128d(ty, ty);
    *z3 = gmx_mm256_set_m128d(tz, tz);
}

static gmx_inline void
gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                            __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz, tx, ty, tz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    tx  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x1 = gmx_mm256_set_m128d(tx, tx);
    *y1 = gmx_mm256_set_m128d(ty, ty);
    *z1 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x2 = gmx_mm256_set_m128d(tx, tx);
    *y2 = gmx_mm256_set_m128d(ty, ty);
    *z2 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x3 = gmx_mm256_set_m128d(tx, tx);
    *y3 = gmx_mm256_set_m128d(ty, ty);
    *z3 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
    *x4 = gmx_mm256_set_m128d(tx, tx);
    *y4 = gmx_mm256_set_m128d(ty, ty);
    *z4 = gmx_mm256_set_m128d(tz, tz);
}

static gmx_inline void
gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
{
    __m256d t1;

    t1 = _mm256_loadu_pd(p1);

    *x = t1;
    *y = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
}

static gmx_inline void
gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m256d t1, t2, t3, t4;

    t1  = _mm256_loadu_pd(p1);
    t3  = _mm256_loadu_pd(p1+4);

    *x1 = t1;
    t2  = gmx_mm256_unpack128hi_pd(t1, t1);
    t4  = gmx_mm256_unpack128hi_pd(t3, t3);
    *z1 = t2;
    *y2 = t3;
    *y1 = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z2 = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x2 = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *y3 = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z3 = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
}

static gmx_inline void
gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m256d t1, t2, t3, t4, t5, t6;

    t1  = _mm256_loadu_pd(p1);
    t2  = _mm256_loadu_pd(p1+4);
    t3  = _mm256_loadu_pd(p1+8);

    t4  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
    t5  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2, 0x1));
    t6  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3, 0x1));

    *x1 = t1;
    *y2 = t2;
    *z3 = t3;
    *z1 = t4;
    *x3 = t5;
    *y4 = t6;

    *y1 = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z2 = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x4 = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x2 = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *y3 = _mm256_permute_pd(t5, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z4 = _mm256_permute_pd(t6, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
}

static gmx_inline void
gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
{
    __m256d t1, t2, t3, t4, t5, t6;

    t1  = _mm256_loadu_pd(ptrA);      /*   -  z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);      /*   -  z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);      /*   -  z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);      /*   -  z1d | y1d x1d */

    t5  = _mm256_unpacklo_pd(t1, t2); /* z1b z1a | x1b x1a */
    t6  = _mm256_unpackhi_pd(t1, t2); /*   -   - | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4); /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4); /*   -   - | y1d y1c */

    *x1 = gmx_mm256_unpack128lo_pd(t5, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t6, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t5, t1);
}

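/* Usage sketch (illustrative only; ix1/iy1/iz1 are hypothetical broadcast
 * i-coordinates and xA..xD hypothetical j-atom coordinate pointers): load
 * one site from four j-atoms and form all four squared distances at once:
 *
 *     __m256d jx1, jy1, jz1, dx, dy, dz, rsq;
 *     gmx_mm256_load_1rvec_4ptr_swizzle_pd(xA, xB, xC, xD, &jx1, &jy1, &jz1);
 *     dx  = _mm256_sub_pd(ix1, jx1);
 *     dy  = _mm256_sub_pd(iy1, jy1);
 *     dz  = _mm256_sub_pd(iz1, jz1);
 *     rsq = gmx_mm256_calc_rsq_pd(dx, dy, dz);
 */
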
static gmx_inline void
gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;

    t1  = _mm256_loadu_pd(ptrA);                       /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);                       /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);                       /* x2c z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);                       /* x2d z1d | y1d x1d */
    t5  = _mm256_loadu_pd(ptrA+4);                     /* y3a x3a | z2a y2a */
    t6  = _mm256_loadu_pd(ptrB+4);                     /* y3b x3b | z2b y2b */
    t7  = _mm256_loadu_pd(ptrC+4);                     /* y3c x3c | z2c y2c */
    t8  = _mm256_loadu_pd(ptrD+4);                     /* y3d x3d | z2d y2d */
    t9  = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /*   -   - |   - z3a */
    t10 = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /*   -   - |   - z3b */
    t11 = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8)); /*   -   - |   - z3c */
    t12 = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8)); /*   -   - |   - z3d */

    t13 = _mm256_unpacklo_pd(t1, t2);                  /* z1b z1a | x1b x1a */
    t14 = _mm256_unpackhi_pd(t1, t2);                  /* x2b x2a | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4);                  /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4);                  /* x2d x2c | y1d y1c */

    t3  = _mm256_unpacklo_pd(t5, t6);                  /* x3b x3a | y2b y2a */
    t4  = _mm256_unpackhi_pd(t5, t6);                  /* y3b y3a | z2b z2a */
    t5  = _mm256_unpacklo_pd(t7, t8);                  /* x3d x3c | y2d y2c */
    t6  = _mm256_unpackhi_pd(t7, t8);                  /* y3d y3c | z2d z2c */

    t9  = _mm256_unpacklo_pd(t9, t10);                 /*   -   - | z3b z3a */
    t11 = _mm256_unpacklo_pd(t11, t12);                /*   -   - | z3d z3c */

    *x1 = gmx_mm256_unpack128lo_pd(t13, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t14, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t13, t1);
    *x2 = gmx_mm256_unpack128hi_pd(t14, t2);
    *y2 = gmx_mm256_unpack128lo_pd(t3, t5);
    *z2 = gmx_mm256_unpack128lo_pd(t4, t6);
    *x3 = gmx_mm256_unpack128hi_pd(t3, t5);
    *y3 = gmx_mm256_unpack128hi_pd(t4, t6);
    *z3 = gmx_mm256_unpack128lo_pd(t9, t11);
}

static gmx_inline void
gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;

    t1  = _mm256_loadu_pd(ptrA);        /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);        /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);        /* x2c z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);        /* x2d z1d | y1d x1d */
    t5  = _mm256_loadu_pd(ptrA+4);      /* y3a x3a | z2a y2a */
    t6  = _mm256_loadu_pd(ptrB+4);      /* y3b x3b | z2b y2b */
    t7  = _mm256_loadu_pd(ptrC+4);      /* y3c x3c | z2c y2c */
    t8  = _mm256_loadu_pd(ptrD+4);      /* y3d x3d | z2d y2d */
    t9  = _mm256_loadu_pd(ptrA+8);      /* z4a y4a | x4a z3a */
    t10 = _mm256_loadu_pd(ptrB+8);      /* z4b y4b | x4b z3b */
    t11 = _mm256_loadu_pd(ptrC+8);      /* z4c y4c | x4c z3c */
    t12 = _mm256_loadu_pd(ptrD+8);      /* z4d y4d | x4d z3d */

    t13 = _mm256_unpacklo_pd(t1, t2);   /* z1b z1a | x1b x1a */
    t14 = _mm256_unpackhi_pd(t1, t2);   /* x2b x2a | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4);   /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4);   /* x2d x2c | y1d y1c */

    t3  = _mm256_unpacklo_pd(t5, t6);   /* x3b x3a | y2b y2a */
    t4  = _mm256_unpackhi_pd(t5, t6);   /* y3b y3a | z2b z2a */
    t5  = _mm256_unpacklo_pd(t7, t8);   /* x3d x3c | y2d y2c */
    t6  = _mm256_unpackhi_pd(t7, t8);   /* y3d y3c | z2d z2c */

    t7  = _mm256_unpacklo_pd(t9, t10);  /* y4b y4a | z3b z3a */
    t8  = _mm256_unpackhi_pd(t9, t10);  /* z4b z4a | x4b x4a */
    t9  = _mm256_unpacklo_pd(t11, t12); /* y4d y4c | z3d z3c */
    t10 = _mm256_unpackhi_pd(t11, t12); /* z4d z4c | x4d x4c */

    *x1 = gmx_mm256_unpack128lo_pd(t13, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t14, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t13, t1);
    *x2 = gmx_mm256_unpack128hi_pd(t14, t2);
    *y2 = gmx_mm256_unpack128lo_pd(t3, t5);
    *z2 = gmx_mm256_unpack128lo_pd(t4, t6);
    *x3 = gmx_mm256_unpack128hi_pd(t3, t5);
    *y3 = gmx_mm256_unpack128hi_pd(t4, t6);
    *z3 = gmx_mm256_unpack128lo_pd(t7, t9);
    *x4 = gmx_mm256_unpack128lo_pd(t8, t10);
    *y4 = gmx_mm256_unpack128hi_pd(t7, t9);
    *z4 = gmx_mm256_unpack128hi_pd(t8, t10);
}

static gmx_inline void
gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1)
{
    __m256d t1, t2, tA, tB, tC, tD;
    __m256i mask;

    t1 = _mm256_unpacklo_pd(x1, y1);       /* y1c x1c | y1a x1a */
    t2 = _mm256_unpackhi_pd(x1, y1);       /* y1d x1d | y1b x1b */
    x1 = gmx_mm256_unpack128lo_pd(t1, z1); /*  -  z1a | y1a x1a */
    y1 = gmx_mm256_unpack128hi_pd(t1, z1); /*  -  z1c | y1c x1c */
    z1 = _mm256_permute_pd(z1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    t1 = gmx_mm256_unpack128lo_pd(t2, z1); /*  -  z1b | y1b x1b */
    z1 = gmx_mm256_unpack128hi_pd(t2, z1); /*  -  z1d | y1d x1d */

    /* Construct a mask without executing any data loads */
    mask = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
                                               _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ), 0x7));

    tA = _mm256_loadu_pd(ptrA);
    tB = _mm256_loadu_pd(ptrB);
    tC = _mm256_loadu_pd(ptrC);
    tD = _mm256_loadu_pd(ptrD);

    tA = _mm256_sub_pd(tA, x1);
    tB = _mm256_sub_pd(tB, t1);
    tC = _mm256_sub_pd(tC, y1);
    tD = _mm256_sub_pd(tD, z1);

    _mm256_maskstore_pd(ptrA, mask, tA);
    _mm256_maskstore_pd(ptrB, mask, tB);
    _mm256_maskstore_pd(ptrC, mask, tC);
    _mm256_maskstore_pd(ptrD, mask, tD);
}

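/* Usage sketch (illustrative only; f, the j_coord_offset* variables and the
 * fjx1/fjy1/fjz1 accumulators are hypothetical): subtract the accumulated
 * j-forces for one site, where elements 0..3 of each accumulator belong to
 * ptrA..ptrD. The 0x7 mask above is what keeps the fourth element of each
 * ymm row from touching memory past x/y/z:
 *
 *     gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(f+j_coord_offsetA, f+j_coord_offsetB,
 *                                               f+j_coord_offsetC, f+j_coord_offsetD,
 *                                               fjx1, fjy1, fjz1);
 */
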
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
                                                  _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
        __m128d _tA, _tB, _tC, _tD, _tE; \
        _t1  = _mm256_loadu_pd(ptrA); \
        _t2  = _mm256_loadu_pd(ptrB); \
        _t3  = _mm256_loadu_pd(ptrC); \
        _t4  = _mm256_loadu_pd(ptrD); \
        _t5  = _mm256_loadu_pd(ptrA+4); \
        _t6  = _mm256_loadu_pd(ptrB+4); \
        _t7  = _mm256_loadu_pd(ptrC+4); \
        _t8  = _mm256_loadu_pd(ptrD+4); \
        _tA  = _mm_load_sd(ptrA+8); \
        _tB  = _mm_load_sd(ptrB+8); \
        _tC  = _mm_load_sd(ptrC+8); \
        _tD  = _mm_load_sd(ptrD+8); \
        _t9  = _mm256_unpacklo_pd(_x1, _y1); \
        _x1  = _mm256_unpackhi_pd(_x1, _y1); \
        _y1  = _mm256_unpacklo_pd(_z1, _x2); \
        _z1  = _mm256_unpackhi_pd(_z1, _x2); \
        _x2  = _mm256_unpacklo_pd(_y2, _z2); \
        _y2  = _mm256_unpackhi_pd(_y2, _z2); \
        _z2  = _mm256_unpacklo_pd(_x3, _y3); \
        _x3  = _mm256_unpackhi_pd(_x3, _y3); \
        _t10 = gmx_mm256_unpack128lo_pd(_t9, _y1); \
        _y3  = gmx_mm256_unpack128hi_pd(_t9, _y1); \
        _t9  = gmx_mm256_unpack128lo_pd(_x1, _z1); \
        _y1  = gmx_mm256_unpack128hi_pd(_x1, _z1); \
        _x1  = gmx_mm256_unpack128lo_pd(_x2, _z2); \
        _z1  = gmx_mm256_unpack128hi_pd(_x2, _z2); \
        _x2  = gmx_mm256_unpack128lo_pd(_y2, _x3); \
        _z2  = gmx_mm256_unpack128hi_pd(_y2, _x3); \
        _t1  = _mm256_sub_pd(_t1, _t10); \
        _t2  = _mm256_sub_pd(_t2, _t9); \
        _t3  = _mm256_sub_pd(_t3, _y3); \
        _t4  = _mm256_sub_pd(_t4, _y1); \
        _t5  = _mm256_sub_pd(_t5, _x1); \
        _t6  = _mm256_sub_pd(_t6, _x2); \
        _t7  = _mm256_sub_pd(_t7, _z1); \
        _t8  = _mm256_sub_pd(_t8, _z2); \
        _tA  = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3)); \
        _tB  = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3), _GMX_MM_PERMUTE128D(1, 1))); \
        _tE  = _mm256_extractf128_pd(_z3, 0x1); \
        _tC  = _mm_sub_sd(_tC, _tE); \
        _tD  = _mm_sub_sd(_tD, _mm_permute_pd(_tE, _GMX_MM_PERMUTE128D(1, 1))); \
        _mm256_storeu_pd(ptrA, _t1); \
        _mm256_storeu_pd(ptrB, _t2); \
        _mm256_storeu_pd(ptrC, _t3); \
        _mm256_storeu_pd(ptrD, _t4); \
        _mm256_storeu_pd(ptrA+4, _t5); \
        _mm256_storeu_pd(ptrB+4, _t6); \
        _mm256_storeu_pd(ptrC+4, _t7); \
        _mm256_storeu_pd(ptrD+4, _t8); \
        _mm_store_sd(ptrA+8, _tA); \
        _mm_store_sd(ptrB+8, _tB); \
        _mm_store_sd(ptrC+8, _tC); \
        _mm_store_sd(ptrD+8, _tD); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE;

    t1  = _mm256_loadu_pd(ptrA);
    t2  = _mm256_loadu_pd(ptrB);
    t3  = _mm256_loadu_pd(ptrC);
    t4  = _mm256_loadu_pd(ptrD);
    t5  = _mm256_loadu_pd(ptrA+4);
    t6  = _mm256_loadu_pd(ptrB+4);
    t7  = _mm256_loadu_pd(ptrC+4);
    t8  = _mm256_loadu_pd(ptrD+4);
    tA  = _mm_load_sd(ptrA+8);
    tB  = _mm_load_sd(ptrB+8);
    tC  = _mm_load_sd(ptrC+8);
    tD  = _mm_load_sd(ptrD+8);

    t9  = _mm256_unpacklo_pd(x1, y1);       /* y1c x1c | y1a x1a */
    x1  = _mm256_unpackhi_pd(x1, y1);       /* y1d x1d | y1b x1b */

    y1  = _mm256_unpacklo_pd(z1, x2);       /* x2c z1c | x2a z1a */
    z1  = _mm256_unpackhi_pd(z1, x2);       /* x2d z1d | x2b z1b */

    x2  = _mm256_unpacklo_pd(y2, z2);       /* z2c y2c | z2a y2a */
    y2  = _mm256_unpackhi_pd(y2, z2);       /* z2d y2d | z2b y2b */

    z2  = _mm256_unpacklo_pd(x3, y3);       /* y3c x3c | y3a x3a */
    x3  = _mm256_unpackhi_pd(x3, y3);       /* y3d x3d | y3b x3b */

    t10 = gmx_mm256_unpack128lo_pd(t9, y1); /* x2a z1a | y1a x1a */
    y3  = gmx_mm256_unpack128hi_pd(t9, y1); /* x2c z1c | y1c x1c */

    t9  = gmx_mm256_unpack128lo_pd(x1, z1); /* x2b z1b | y1b x1b */
    y1  = gmx_mm256_unpack128hi_pd(x1, z1); /* x2d z1d | y1d x1d */

    x1  = gmx_mm256_unpack128lo_pd(x2, z2); /* y3a x3a | z2a y2a */
    z1  = gmx_mm256_unpack128hi_pd(x2, z2); /* y3c x3c | z2c y2c */

    x2  = gmx_mm256_unpack128lo_pd(y2, x3); /* y3b x3b | z2b y2b */
    z2  = gmx_mm256_unpack128hi_pd(y2, x3); /* y3d x3d | z2d y2d */

    t1  = _mm256_sub_pd(t1, t10);
    t2  = _mm256_sub_pd(t2, t9);
    t3  = _mm256_sub_pd(t3, y3);
    t4  = _mm256_sub_pd(t4, y1);
    t5  = _mm256_sub_pd(t5, x1);
    t6  = _mm256_sub_pd(t6, x2);
    t7  = _mm256_sub_pd(t7, z1);
    t8  = _mm256_sub_pd(t8, z2);

    tA  = _mm_sub_sd(tA, _mm256_castpd256_pd128(z3));
    tB  = _mm_sub_sd(tB, _mm_permute_pd(_mm256_castpd256_pd128(z3), _GMX_MM_PERMUTE128D(1, 1)));
    tE  = _mm256_extractf128_pd(z3, 0x1);
    tC  = _mm_sub_sd(tC, tE);
    tD  = _mm_sub_sd(tD, _mm_permute_pd(tE, _GMX_MM_PERMUTE128D(1, 1)));

    /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
    _mm256_storeu_pd(ptrA, t1);
    _mm256_storeu_pd(ptrB, t2);
    _mm256_storeu_pd(ptrC, t3);
    _mm256_storeu_pd(ptrD, t4);
    _mm256_storeu_pd(ptrA+4, t5);
    _mm256_storeu_pd(ptrB+4, t6);
    _mm256_storeu_pd(ptrC+4, t7);
    _mm256_storeu_pd(ptrD+4, t8);
    _mm_store_sd(ptrA+8, tA);
    _mm_store_sd(ptrB+8, tB);
    _mm_store_sd(ptrC+8, tC);
    _mm_store_sd(ptrD+8, tD);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
                                                  _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12, _t13, _t14; \
        __m128d _tA, _tB, _tC, _tD, _tE; \
        _t1  = _mm256_loadu_pd(ptrA); \
        _t2  = _mm256_loadu_pd(ptrB); \
        _t3  = _mm256_loadu_pd(ptrC); \
        _t4  = _mm256_loadu_pd(ptrD); \
        _t5  = _mm256_loadu_pd(ptrA+4); \
        _t6  = _mm256_loadu_pd(ptrB+4); \
        _t7  = _mm256_loadu_pd(ptrC+4); \
        _t8  = _mm256_loadu_pd(ptrD+4); \
        _t9  = _mm256_loadu_pd(ptrA+8); \
        _t10 = _mm256_loadu_pd(ptrB+8); \
        _t11 = _mm256_loadu_pd(ptrC+8); \
        _t12 = _mm256_loadu_pd(ptrD+8); \
        _t13 = _mm256_unpacklo_pd(_x1, _y1); \
        _x1  = _mm256_unpackhi_pd(_x1, _y1); \
        _y1  = _mm256_unpacklo_pd(_z1, _x2); \
        _z1  = _mm256_unpackhi_pd(_z1, _x2); \
        _x2  = _mm256_unpacklo_pd(_y2, _z2); \
        _y2  = _mm256_unpackhi_pd(_y2, _z2); \
        _z2  = _mm256_unpacklo_pd(_x3, _y3); \
        _x3  = _mm256_unpackhi_pd(_x3, _y3); \
        _y3  = _mm256_unpacklo_pd(_z3, _x4); \
        _z3  = _mm256_unpackhi_pd(_z3, _x4); \
        _x4  = _mm256_unpacklo_pd(_y4, _z4); \
        _y4  = _mm256_unpackhi_pd(_y4, _z4); \
        _z4  = gmx_mm256_unpack128lo_pd(_t13, _y1); \
        _t13 = gmx_mm256_unpack128hi_pd(_t13, _y1); \
        _y1  = gmx_mm256_unpack128lo_pd(_x1, _z1); \
        _x1  = gmx_mm256_unpack128hi_pd(_x1, _z1); \
        _z1  = gmx_mm256_unpack128lo_pd(_x2, _z2); \
        _x2  = gmx_mm256_unpack128hi_pd(_x2, _z2); \
        _z2  = gmx_mm256_unpack128lo_pd(_y2, _x3); \
        _y2  = gmx_mm256_unpack128hi_pd(_y2, _x3); \
        _x3  = gmx_mm256_unpack128lo_pd(_y3, _x4); \
        _y3  = gmx_mm256_unpack128hi_pd(_y3, _x4); \
        _x4  = gmx_mm256_unpack128lo_pd(_z3, _y4); \
        _z3  = gmx_mm256_unpack128hi_pd(_z3, _y4); \
        _t1  = _mm256_sub_pd(_t1, _z4); \
        _t2  = _mm256_sub_pd(_t2, _y1); \
        _t3  = _mm256_sub_pd(_t3, _t13); \
        _t4  = _mm256_sub_pd(_t4, _x1); \
        _t5  = _mm256_sub_pd(_t5, _z1); \
        _t6  = _mm256_sub_pd(_t6, _z2); \
        _t7  = _mm256_sub_pd(_t7, _x2); \
        _t8  = _mm256_sub_pd(_t8, _y2); \
        _t9  = _mm256_sub_pd(_t9, _x3); \
        _t10 = _mm256_sub_pd(_t10, _x4); \
        _t11 = _mm256_sub_pd(_t11, _y3); \
        _t12 = _mm256_sub_pd(_t12, _z3); \
        _mm256_storeu_pd(ptrA, _t1); \
        _mm256_storeu_pd(ptrB, _t2); \
        _mm256_storeu_pd(ptrC, _t3); \
        _mm256_storeu_pd(ptrD, _t4); \
        _mm256_storeu_pd(ptrA+4, _t5); \
        _mm256_storeu_pd(ptrB+4, _t6); \
        _mm256_storeu_pd(ptrC+4, _t7); \
        _mm256_storeu_pd(ptrD+4, _t8); \
        _mm256_storeu_pd(ptrA+8, _t9); \
        _mm256_storeu_pd(ptrB+8, _t10); \
        _mm256_storeu_pd(ptrC+8, _t11); \
        _mm256_storeu_pd(ptrD+8, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3,
                                          __m256d x4, __m256d y4, __m256d z4)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
    __m128d tA, tB, tC, tD, tE;

    t1  = _mm256_loadu_pd(ptrA);
    t2  = _mm256_loadu_pd(ptrB);
    t3  = _mm256_loadu_pd(ptrC);
    t4  = _mm256_loadu_pd(ptrD);
    t5  = _mm256_loadu_pd(ptrA+4);
    t6  = _mm256_loadu_pd(ptrB+4);
    t7  = _mm256_loadu_pd(ptrC+4);
    t8  = _mm256_loadu_pd(ptrD+4);
    t9  = _mm256_loadu_pd(ptrA+8);
    t10 = _mm256_loadu_pd(ptrB+8);
    t11 = _mm256_loadu_pd(ptrC+8);
    t12 = _mm256_loadu_pd(ptrD+8);

    t13 = _mm256_unpacklo_pd(x1, y1);        /* y1c x1c | y1a x1a */
    x1  = _mm256_unpackhi_pd(x1, y1);        /* y1d x1d | y1b x1b */
    y1  = _mm256_unpacklo_pd(z1, x2);        /* x2c z1c | x2a z1a */
    z1  = _mm256_unpackhi_pd(z1, x2);        /* x2d z1d | x2b z1b */
    x2  = _mm256_unpacklo_pd(y2, z2);        /* z2c y2c | z2a y2a */
    y2  = _mm256_unpackhi_pd(y2, z2);        /* z2d y2d | z2b y2b */
    z2  = _mm256_unpacklo_pd(x3, y3);        /* y3c x3c | y3a x3a */
    x3  = _mm256_unpackhi_pd(x3, y3);        /* y3d x3d | y3b x3b */
    y3  = _mm256_unpacklo_pd(z3, x4);        /* x4c z3c | x4a z3a */
    z3  = _mm256_unpackhi_pd(z3, x4);        /* x4d z3d | x4b z3b */
    x4  = _mm256_unpacklo_pd(y4, z4);        /* z4c y4c | z4a y4a */
    y4  = _mm256_unpackhi_pd(y4, z4);        /* z4d y4d | z4b y4b */

    z4  = gmx_mm256_unpack128lo_pd(t13, y1); /* x2a z1a | y1a x1a */
    t13 = gmx_mm256_unpack128hi_pd(t13, y1); /* x2c z1c | y1c x1c */
    y1  = gmx_mm256_unpack128lo_pd(x1, z1);  /* x2b z1b | y1b x1b */
    x1  = gmx_mm256_unpack128hi_pd(x1, z1);  /* x2d z1d | y1d x1d */
    z1  = gmx_mm256_unpack128lo_pd(x2, z2);  /* y3a x3a | z2a y2a */
    x2  = gmx_mm256_unpack128hi_pd(x2, z2);  /* y3c x3c | z2c y2c */
    z2  = gmx_mm256_unpack128lo_pd(y2, x3);  /* y3b x3b | z2b y2b */
    y2  = gmx_mm256_unpack128hi_pd(y2, x3);  /* y3d x3d | z2d y2d */
    x3  = gmx_mm256_unpack128lo_pd(y3, x4);  /* z4a y4a | x4a z3a */
    y3  = gmx_mm256_unpack128hi_pd(y3, x4);  /* z4c y4c | x4c z3c */
    x4  = gmx_mm256_unpack128lo_pd(z3, y4);  /* z4b y4b | x4b z3b */
    z3  = gmx_mm256_unpack128hi_pd(z3, y4);  /* z4d y4d | x4d z3d */

    t1  = _mm256_sub_pd(t1, z4);
    t2  = _mm256_sub_pd(t2, y1);
    t3  = _mm256_sub_pd(t3, t13);
    t4  = _mm256_sub_pd(t4, x1);
    t5  = _mm256_sub_pd(t5, z1);
    t6  = _mm256_sub_pd(t6, z2);
    t7  = _mm256_sub_pd(t7, x2);
    t8  = _mm256_sub_pd(t8, y2);
    t9  = _mm256_sub_pd(t9, x3);
    t10 = _mm256_sub_pd(t10, x4);
    t11 = _mm256_sub_pd(t11, y3);
    t12 = _mm256_sub_pd(t12, z3);

    /* Here we store three full 256-bit values per pointer; no overlap can happen */
    _mm256_storeu_pd(ptrA, t1);
    _mm256_storeu_pd(ptrB, t2);
    _mm256_storeu_pd(ptrC, t3);
    _mm256_storeu_pd(ptrD, t4);
    _mm256_storeu_pd(ptrA+4, t5);
    _mm256_storeu_pd(ptrB+4, t6);
    _mm256_storeu_pd(ptrC+4, t7);
    _mm256_storeu_pd(ptrD+4, t8);
    _mm256_storeu_pd(ptrA+8, t9);
    _mm256_storeu_pd(ptrB+8, t10);
    _mm256_storeu_pd(ptrC+8, t11);
    _mm256_storeu_pd(ptrD+8, t12);
}
#endif

static gmx_inline void
gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2;
    __m128d tA, tB;

    fix1 = _mm256_hadd_pd(fix1, fiy1);
    fiz1 = _mm256_hadd_pd(fiz1, _mm256_setzero_pd());

    /* Add across the two lanes */
    tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1), _mm256_extractf128_pd(fix1, 0x1));
    tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1), _mm256_extractf128_pd(fiz1, 0x1));

    fix1 = gmx_mm256_set_m128d(tB, tA); /* 0 fiz fiy fix */

    t1   = _mm256_loadu_pd(fptr);
    t2   = _mm256_loadu_pd(fshiftptr);

    t1   = _mm256_add_pd(t1, fix1);
    t2   = _mm256_add_pd(t2, fix1);

    _mm256_storeu_pd(fptr, t1);
    _mm256_storeu_pd(fshiftptr, t2);
}

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                                 fptr, fshiftptr) \
    { \
        __m256d _t1, _t2, _t3, _t4; \
        __m128d _tz3, _tA, _tB, _tC, _tD; \
        fix1 = _mm256_hadd_pd(fix1, fiy1); \
        fiz1 = _mm256_hadd_pd(fiz1, fix2); \
        fiy2 = _mm256_hadd_pd(fiy2, fiz2); \
        fix3 = _mm256_hadd_pd(fix3, fiy3); \
        fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); \
        _t1  = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
        _t2  = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
        _t1  = _mm256_add_pd(_t1, _t2); \
        _t3  = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
        _t4  = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
        _t3  = _mm256_add_pd(_t3, _t4); \
        _tz3 = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); \
        _t2  = _mm256_loadu_pd(fptr); \
        _t4  = _mm256_loadu_pd(fptr+4); \
        _tA  = _mm_load_sd(fptr+8); \
        _t2  = _mm256_add_pd(_t2, _t1); \
        _t4  = _mm256_add_pd(_t4, _t3); \
        _tA  = _mm_add_sd(_tA, _tz3); \
        _mm256_storeu_pd(fptr, _t2); \
        _mm256_storeu_pd(fptr+4, _t4); \
        _mm_store_sd(fptr+8, _tA); \
        _tB  = _mm256_extractf128_pd(_t1, 0x1); \
        _tC  = _mm256_extractf128_pd(_t3, 0x1); \
        _tz3 = _mm_add_sd(_tz3, _tB); \
        _tD  = _mm_permute_pd(_mm256_castpd256_pd128(_t3), _GMX_MM_PERMUTE128D(1, 1)); \
        _tz3 = _mm_add_sd(_tz3, _tD); \
        _tC  = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t1)); \
        _tD  = _mm_shuffle_pd(_tB, _mm256_castpd256_pd128(_t3), _MM_SHUFFLE2(0, 1)); \
        _tC  = _mm_add_pd(_tC, _tD); \
        _tA  = _mm_loadu_pd(fshiftptr); \
        _tB  = _mm_load_sd(fshiftptr+2); \
        _tA  = _mm_add_pd(_tA, _tC); \
        _tB  = _mm_add_sd(_tB, _tz3); \
        _mm_storeu_pd(fshiftptr, _tA); \
        _mm_store_sd(fshiftptr+2, _tB); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2, t3, t4;
    __m128d tz3, tA, tB, tC, tD;

    fix1 = _mm256_hadd_pd(fix1, fiy1);                /* Y1c-d X1c-d | Y1a-b X1a-b */
    fiz1 = _mm256_hadd_pd(fiz1, fix2);                /* X2c-d Z1c-d | X2a-b Z1a-b */
    fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /* Z2c-d Y2c-d | Z2a-b Y2a-b */
    fix3 = _mm256_hadd_pd(fix3, fiy3);                /* Y3c-d X3c-d | Y3a-b X3a-b */
    fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); /*     0 Z3c-d |     0 Z3a-b */

    /* Add across the two lanes by swapping and adding back */
    t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1);      /* X2a-b Z1a-b | Y1a-b X1a-b */
    t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1);      /* X2c-d Z1c-d | Y1c-d X1c-d */
    t1   = _mm256_add_pd(t1, t2);                     /* x2 z1 | y1 x1 */

    t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3);      /* Y3a-b X3a-b | Z2a-b Y2a-b */
    t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3);      /* Y3c-d X3c-d | Z2c-d Y2c-d */
    t3   = _mm256_add_pd(t3, t4);                     /* y3 x3 | z2 y2 */

    tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); /* 0 z3 */

    t2   = _mm256_loadu_pd(fptr);
    t4   = _mm256_loadu_pd(fptr+4);
    tA   = _mm_load_sd(fptr+8);

    t2   = _mm256_add_pd(t2, t1);
    t4   = _mm256_add_pd(t4, t3);
    tA   = _mm_add_sd(tA, tz3);

    _mm256_storeu_pd(fptr, t2);
    _mm256_storeu_pd(fptr+4, t4);
    _mm_store_sd(fptr+8, tA);

    /* Add up shift force */
    /* t1:   x2 z1 | y1 x1 */
    /* t3:   y3 x3 | z2 y2 */
    /* tz3:           0 z3 */

    tB   = _mm256_extractf128_pd(t1, 0x1);            /* x2 z1 */
    tC   = _mm256_extractf128_pd(t3, 0x1);            /* y3 x3 */
    tz3  = _mm_add_sd(tz3, tB);                       /* 0 z1+z3 */
    tD   = _mm_permute_pd(_mm256_castpd256_pd128(t3), _GMX_MM_PERMUTE128D(1, 1));
    tz3  = _mm_add_sd(tz3, tD);                       /* - z */

    tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t1));                         /* y1+y3 x1+x3 */

    tD   = _mm_shuffle_pd(tB, _mm256_castpd256_pd128(t3), _MM_SHUFFLE2(0, 1)); /* y2 x2 */
    tC   = _mm_add_pd(tC, tD);                        /* y x */

    tA   = _mm_loadu_pd(fshiftptr);
    tB   = _mm_load_sd(fshiftptr+2);
    tA   = _mm_add_pd(tA, tC);
    tB   = _mm_add_sd(tB, tz3);
    _mm_storeu_pd(fshiftptr, tA);
    _mm_store_sd(fshiftptr+2, tB);
}
#endif

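/* Usage sketch (illustrative only; the fix1..fiz3 accumulators, f, fshift
 * and the offset variables are hypothetical): flush the i-forces of a
 * three-site molecule after the inner loop, updating the shift force for
 * the virial as a side effect:
 *
 *     gmx_mm256_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1,
 *                                              fix2, fiy2, fiz2,
 *                                              fix3, fiy3, fiz3,
 *                                              f+i_coord_offset,
 *                                              fshift+i_shift_offset);
 */
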
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                                 fptr, fshiftptr) \
    { \
        __m256d _t1, _t2, _t3, _t4, _t5, _t6; \
        __m128d _tA, _tB, _tC, _tD; \
        fix1 = _mm256_hadd_pd(fix1, fiy1); \
        fiz1 = _mm256_hadd_pd(fiz1, fix2); \
        fiy2 = _mm256_hadd_pd(fiy2, fiz2); \
        fix3 = _mm256_hadd_pd(fix3, fiy3); \
        fiz3 = _mm256_hadd_pd(fiz3, fix4); \
        fiy4 = _mm256_hadd_pd(fiy4, fiz4); \
        _t1  = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
        _t2  = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
        _t1  = _mm256_add_pd(_t1, _t2); \
        _t3  = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
        _t4  = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
        _t3  = _mm256_add_pd(_t3, _t4); \
        _t5  = gmx_mm256_unpack128lo_pd(fiz3, fiy4); \
        _t6  = gmx_mm256_unpack128hi_pd(fiz3, fiy4); \
        _t5  = _mm256_add_pd(_t5, _t6); \
        _t2  = _mm256_loadu_pd(fptr); \
        _t4  = _mm256_loadu_pd(fptr+4); \
        _t6  = _mm256_loadu_pd(fptr+8); \
        _t2  = _mm256_add_pd(_t2, _t1); \
        _t4  = _mm256_add_pd(_t4, _t3); \
        _t6  = _mm256_add_pd(_t6, _t5); \
        _mm256_storeu_pd(fptr, _t2); \
        _mm256_storeu_pd(fptr+4, _t4); \
        _mm256_storeu_pd(fptr+8, _t6); \
        _tA  = _mm256_extractf128_pd(_t1, 0x1); \
        _tB  = _mm256_extractf128_pd(_t3, 0x1); \
        _tC  = _mm256_extractf128_pd(_t5, 0x1); \
        _tB  = _mm_add_pd(_tB, _mm256_castpd256_pd128(_t1)); \
        _tA  = _mm_add_pd(_tA, _mm256_castpd256_pd128(_t5)); \
        _tC  = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t3)); \
        _tD  = _mm_shuffle_pd(_tA, _tC, _MM_SHUFFLE2(0, 1)); \
        _tB  = _mm_add_pd(_tB, _tD); \
        _tC  = _mm_permute_pd(_tC, _GMX_MM_PERMUTE128D(1, 1)); \
        _tC  = _mm_add_sd(_tC, _tA); \
        _tA  = _mm_loadu_pd(fshiftptr); \
        _tD  = _mm_load_sd(fshiftptr+2); \
        _tA  = _mm_add_pd(_tA, _tB); \
        _tD  = _mm_add_sd(_tD, _tC); \
        _mm_storeu_pd(fshiftptr, _tA); \
        _mm_store_sd(fshiftptr+2, _tD); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
                                         __m256d fix4, __m256d fiy4, __m256d fiz4,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2, t3, t4, t5, t6;
    __m128d tA, tB, tC, tD;

    fix1 = _mm256_hadd_pd(fix1, fiy1);                /* Y1c-d X1c-d | Y1a-b X1a-b */
    fiz1 = _mm256_hadd_pd(fiz1, fix2);                /* X2c-d Z1c-d | X2a-b Z1a-b */
    fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /* Z2c-d Y2c-d | Z2a-b Y2a-b */
    fix3 = _mm256_hadd_pd(fix3, fiy3);                /* Y3c-d X3c-d | Y3a-b X3a-b */
    fiz3 = _mm256_hadd_pd(fiz3, fix4);                /* X4c-d Z3c-d | X4a-b Z3a-b */
    fiy4 = _mm256_hadd_pd(fiy4, fiz4);                /* Z4c-d Y4c-d | Z4a-b Y4a-b */

    /* Add across the two lanes by swapping and adding back */
    t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1);      /* X2a-b Z1a-b | Y1a-b X1a-b */
    t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1);      /* X2c-d Z1c-d | Y1c-d X1c-d */
    t1   = _mm256_add_pd(t1, t2);                     /* x2 z1 | y1 x1 */

    t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3);      /* Y3a-b X3a-b | Z2a-b Y2a-b */
    t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3);      /* Y3c-d X3c-d | Z2c-d Y2c-d */
    t3   = _mm256_add_pd(t3, t4);                     /* y3 x3 | z2 y2 */

    t5   = gmx_mm256_unpack128lo_pd(fiz3, fiy4);      /* Z4a-b Y4a-b | X4a-b Z3a-b */
    t6   = gmx_mm256_unpack128hi_pd(fiz3, fiy4);      /* Z4c-d Y4c-d | X4c-d Z3c-d */
    t5   = _mm256_add_pd(t5, t6);                     /* z4 y4 | x4 z3 */

    t2   = _mm256_loadu_pd(fptr);
    t4   = _mm256_loadu_pd(fptr+4);
    t6   = _mm256_loadu_pd(fptr+8);

    t2   = _mm256_add_pd(t2, t1);
    t4   = _mm256_add_pd(t4, t3);
    t6   = _mm256_add_pd(t6, t5);

    _mm256_storeu_pd(fptr, t2);
    _mm256_storeu_pd(fptr+4, t4);
    _mm256_storeu_pd(fptr+8, t6);

    /* Add up shift force */
    /* t1:   x2 z1 | y1 x1 */
    /* t3:   y3 x3 | z2 y2 */
    /* t5:   z4 y4 | x4 z3 */

    tA   = _mm256_extractf128_pd(t1, 0x1);                /* x2 z1 */
    tB   = _mm256_extractf128_pd(t3, 0x1);                /* y3 x3 */
    tC   = _mm256_extractf128_pd(t5, 0x1);                /* z4 y4 */

    tB   = _mm_add_pd(tB, _mm256_castpd256_pd128(t1));    /* y1+y3 x1+x3 */
    tA   = _mm_add_pd(tA, _mm256_castpd256_pd128(t5));    /* x2+x4 z1+z3 */
    tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t3));    /* z4+z2 y4+y2 */

    tD   = _mm_shuffle_pd(tA, tC, _MM_SHUFFLE2(0, 1));    /* y4+y2 x2+x4 */
    tB   = _mm_add_pd(tB, tD);                            /* y x */
    tC   = _mm_permute_pd(tC, _GMX_MM_PERMUTE128D(1, 1)); /* - z4+z2 */
    tC   = _mm_add_sd(tC, tA);                            /* - z */

    tA   = _mm_loadu_pd(fshiftptr);
    tD   = _mm_load_sd(fshiftptr+2);
    tA   = _mm_add_pd(tA, tB);
    tD   = _mm_add_sd(tD, tC);
    _mm_storeu_pd(fshiftptr, tA);
    _mm_store_sd(fshiftptr+2, tD);
}
#endif

static gmx_inline void
gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
{
    __m128d t1;

    pot1 = _mm256_hadd_pd(pot1, pot1);

    t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));

    _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
}

static gmx_inline void
gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
                         __m256d pot2, double * gmx_restrict ptrB)
{
    __m128d t1, t2;

    pot1 = _mm256_hadd_pd(pot1, pot2);

    t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));

    t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
    _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
    _mm_store_sd(ptrB, _mm_add_sd(_mm_load_sd(ptrB), t2));
}

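/* Usage sketch (illustrative only; velecsum, vvdwsum and the output
 * expressions are hypothetical): reduce and add the Coulomb and VdW energy
 * accumulators of a kernel to their respective bins in one call:
 *
 *     gmx_mm256_update_2pot_pd(velecsum, kernel_data->energygrp_elec+ggid,
 *                              vvdwsum,  kernel_data->energygrp_vdw+ggid);
 */
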
#endif /* _kernelutil_x86_avx_256_double_h_ */