/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

#ifndef _kernelutil_x86_avx_256_double_h_
#define _kernelutil_x86_avx_256_double_h_
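
/* The intrinsics used below are AVX; in the GROMACS build they are normally made
 * available through the x86 SIMD headers included before this file. The include below
 * is added as an assumption so the header is self-contained; it is harmless if the
 * compiler has already seen it. */
#include <immintrin.h>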

#define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)

#define _GMX_MM_BLEND256D(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
#define _GMX_MM_PERMUTE(fp3, fp2, fp1, fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define _GMX_MM_PERMUTE128D(fp1, fp0) (((fp1) << 1) | ((fp0)))
#define _GMX_MM_PERMUTE256D(fp3, fp2, fp1, fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
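
/* The helper macros above only build immediate operands for the AVX blend/permute/shuffle
 * intrinsics: each argument selects one element (or one bit) and is packed into the control
 * byte. For example, _GMX_MM_PERMUTE256D(0, 1, 0, 1) evaluates to 0b0101, which as the
 * control for _mm256_permute_pd() swaps the two doubles inside each 128-bit lane. */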

#define GMX_MM256_FULLTRANSPOSE4_PD(row0, row1, row2, row3)  \
    {                                                         \
        __m256d _t0, _t1, _t2, _t3;                           \
        _t0  = _mm256_unpacklo_pd((row0), (row1));            \
        _t1  = _mm256_unpackhi_pd((row0), (row1));            \
        _t2  = _mm256_unpacklo_pd((row2), (row3));            \
        _t3  = _mm256_unpackhi_pd((row2), (row3));            \
        row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20);        \
        row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20);        \
        row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31);        \
        row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31);        \
    }
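
/* GMX_MM256_FULLTRANSPOSE4_PD performs an in-place 4x4 transpose of doubles: afterwards
 * row0 holds element 0 of each input row, row1 element 1, and so on. A minimal usage
 * sketch (hypothetical variable names):
 *
 *     __m256d r0 = _mm256_loadu_pd(m);       // m[0..3]
 *     __m256d r1 = _mm256_loadu_pd(m + 4);
 *     __m256d r2 = _mm256_loadu_pd(m + 8);
 *     __m256d r3 = _mm256_loadu_pd(m + 12);
 *     GMX_MM256_FULLTRANSPOSE4_PD(r0, r1, r2, r3);  // r0 = {m[0], m[4], m[8], m[12]}, etc.
 */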

#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))

static gmx_inline __m256d gmx_simdcall
gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
{
    return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
}

static gmx_inline __m256d gmx_simdcall
gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
{
    return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
}

static gmx_inline __m256d gmx_simdcall
gmx_mm256_set_m128d(__m128d hi, __m128d lo)
{
    return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
}

static gmx_inline __m256 gmx_simdcall
gmx_mm256_set_m128(__m128 hi, __m128 lo)
{
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
}
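
/* Note the argument order of the two helpers above: the high 128-bit half comes first,
 * mirroring _mm256_set_pd(). For example, gmx_mm256_set_m128d(hi, lo) returns a register
 * whose elements, from lowest to highest, are { lo[0], lo[1], hi[0], hi[1] }. */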

static gmx_inline int gmx_simdcall
gmx_mm256_any_lt(__m256d a, __m256d b)
{
    return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
}

static gmx_inline __m256d gmx_simdcall
gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
{
    return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
}

/* Normal sum of four ymm registers */
#define gmx_mm256_sum4_pd(t0, t1, t2, t3) _mm256_add_pd(_mm256_add_pd(t0, t1), _mm256_add_pd(t2, t3))
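
/* The pairwise form (t0+t1) + (t2+t3) keeps the two inner additions independent, so they
 * can issue in parallel instead of forming a single serial dependency chain. */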

/* Load a single value from 1-4 places, merge into xmm register */
static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
}

static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                                const double * gmx_restrict ptrB)
{
    __m128d tA, tB;

    tA = _mm_load_sd(ptrA);
    tB = _mm_load_sd(ptrB);

    return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA, tB));
}

static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
{
    __m128d t1, t2;

    t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
    t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC), _mm_load_sd(ptrD));
    return gmx_mm256_set_m128d(t2, t1);
}
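
/* Illustrative use only (hypothetical variable names): gathering one charge per j atom,
 *
 *     __m256d jq = gmx_mm256_load_4real_swizzle_pd(q+jnrA, q+jnrB, q+jnrC, q+jnrD);
 *
 * leaves the values loaded from ptrA/B/C/D in elements 0/1/2/3 of the result, respectively. */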

static gmx_inline void gmx_simdcall
gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
}

static gmx_inline void gmx_simdcall
gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
    __m256d t2;

    t2 = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
    _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
}

static gmx_inline void gmx_simdcall
gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                 double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
    __m256d t2;
    __m128d t3, t4;

    t2 = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
    t3 = _mm256_extractf128_pd(xmm1, 0x1);
    t4 = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
    _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
    _mm_store_sd(ptrC, t3);
    _mm_store_sd(ptrD, t4);
}

static gmx_inline void gmx_simdcall
gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
    __m128d t1;

    t1 = _mm256_castpd256_pd128(xmm1);
    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));

    _mm_store_sd(ptrA, t1);
}

static gmx_inline void gmx_simdcall
gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
    __m128d t1, t2;

    t1 = _mm256_castpd256_pd128(xmm1);
    t2 = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));

    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));
    t2 = _mm_add_sd(t2, _mm_load_sd(ptrB));

    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrB, t2);
}

static gmx_inline void gmx_simdcall
gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                     double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
    __m128d t1, t2, t3, t4;

    t1 = _mm256_castpd256_pd128(xmm1);
    t2 = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
    t3 = _mm256_extractf128_pd(xmm1, 0x1);
    t4 = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));

    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));
    t2 = _mm_add_sd(t2, _mm_load_sd(ptrB));
    t3 = _mm_add_sd(t3, _mm_load_sd(ptrC));
    t4 = _mm_add_sd(t4, _mm_load_sd(ptrD));

    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrB, t2);
    _mm_store_sd(ptrC, t3);
    _mm_store_sd(ptrD, t4);
}
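
/* The store/increment helpers above undo the load swizzle: element i of the ymm register
 * is written back to (or added into) pointer i, so per-pointer scalars can be scattered
 * again after a SIMD pass over four pointers. */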

static gmx_inline void gmx_simdcall
gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
{
    *c6  = _mm256_castpd128_pd256(_mm_load_sd(p1));
    *c12 = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
}

static gmx_inline void gmx_simdcall
gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
{
    __m128d t1, t2;

    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1, t2));
    *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1, t2));
}

static gmx_inline void gmx_simdcall
gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
                                const double * gmx_restrict p3, const double * gmx_restrict p4,
                                __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
{
    __m256d t1, t2;

    t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3), _mm_loadu_pd(p1)); /* c12c c6c | c12a c6a */
    t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4), _mm_loadu_pd(p2)); /* c12d c6d | c12b c6b */

    *c6  = _mm256_unpacklo_pd(t1, t2);  /* c6d  c6c  | c6b  c6a  */
    *c12 = _mm256_unpackhi_pd(t1, t2);  /* c12d c12c | c12b c12a */
}
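
/* Each pointer passed to the pair loaders is expected to point at a (c6, c12) pair of
 * Lennard-Jones parameters stored contiguously; the swizzle separates them so that *c6
 * and *c12 each hold one value per pointer, matching the real-number gathers above. */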

static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1,
                                            __m256d * gmx_restrict y1,
                                            __m256d * gmx_restrict z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz, tx, ty, tz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    tx  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));

    *x1 = gmx_mm256_set_m128d(tx, tx);
    *y1 = gmx_mm256_set_m128d(ty, ty);
    *z1 = gmx_mm256_set_m128d(tz, tz);
}
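
/* The load_shift_and_*rvec_broadcast helpers add the periodic-shift vector to the
 * i-particle coordinates once and then broadcast each resulting component to all four
 * elements of a ymm register, so the same shifted i atom can be paired with four j atoms
 * at a time. */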

static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz, tx, ty, tz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    tx  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x1 = gmx_mm256_set_m128d(tx, tx);
    *y1 = gmx_mm256_set_m128d(ty, ty);
    *z1 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x2 = gmx_mm256_set_m128d(tx, tx);
    *y2 = gmx_mm256_set_m128d(ty, ty);
    *z2 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x3 = gmx_mm256_set_m128d(tx, tx);
    *y3 = gmx_mm256_set_m128d(ty, ty);
    *z3 = gmx_mm256_set_m128d(tz, tz);
}

static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                            __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz, tx, ty, tz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    tx  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x1 = gmx_mm256_set_m128d(tx, tx);
    *y1 = gmx_mm256_set_m128d(ty, ty);
    *z1 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x2 = gmx_mm256_set_m128d(tx, tx);
    *y2 = gmx_mm256_set_m128d(ty, ty);
    *z2 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x3 = gmx_mm256_set_m128d(tx, tx);
    *y3 = gmx_mm256_set_m128d(ty, ty);
    *z3 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
    *x4 = gmx_mm256_set_m128d(tx, tx);
    *y4 = gmx_mm256_set_m128d(ty, ty);
    *z4 = gmx_mm256_set_m128d(tz, tz);
}
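
/* The *_1ptr_* loaders below read coordinates for a single particle. They are used when
 * fewer than four j atoms remain; judging from the swizzle they produce, the calling
 * kernels consume only the lowest element of each returned register, so the upper
 * elements are left as don't-care values rather than broadcast. */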

static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
{
    __m256d t1;

    t1 = _mm256_loadu_pd(p1);

    *x = t1;
    *y = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
}

static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m256d t1, t2, t3, t4;

    t1  = _mm256_loadu_pd(p1);
    t3  = _mm256_loadu_pd(p1+4);

    *x1 = t1;
    *y2 = t3;

    t2  = gmx_mm256_unpack128hi_pd(t1, t1);
    t4  = gmx_mm256_unpack128hi_pd(t3, t3);

    *z1 = t2;
    *x3 = t4;

    *y1 = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z2 = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x2 = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *y3 = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z3 = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
}

static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m256d t1, t2, t3, t4, t5, t6;

    t1  = _mm256_loadu_pd(p1);
    t2  = _mm256_loadu_pd(p1+4);
    t3  = _mm256_loadu_pd(p1+8);

    t4  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
    t5  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2, 0x1));
    t6  = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3, 0x1));

    *x1 = t1;
    *y2 = t2;
    *z3 = t3;
    *z1 = t4;
    *x3 = t5;
    *y4 = t6;

    *y1 = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z2 = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x4 = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x2 = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *y3 = _mm256_permute_pd(t5, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z4 = _mm256_permute_pd(t6, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
}

static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
{
    __m256d t1, t2, t3, t4, t5, t6;

    t1  = _mm256_loadu_pd(ptrA);      /*  -  z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);      /*  -  z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);      /*  -  z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);      /*  -  z1d | y1d x1d */

    t5  = _mm256_unpacklo_pd(t1, t2); /* z1b z1a | x1b x1a */
    t6  = _mm256_unpackhi_pd(t1, t2); /*  -   -  | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4); /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4); /*  -   -  | y1d y1c */

    *x1 = gmx_mm256_unpack128lo_pd(t5, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t6, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t5, t1);
}
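
/* The *_Nrvec_4ptr_* loaders gather coordinates for four different particles and transpose
 * them from array-of-structures (x, y, z per atom in memory) to structure-of-arrays form:
 * after the in-lane unpacklo/unpackhi steps and the 128-bit lane unpacks, element i of
 * every output register belongs to pointer i, matching the layout of the gathers above. */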

static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;

    t1  = _mm256_loadu_pd(ptrA);                       /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);                       /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);                       /* x2c z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);                       /* x2d z1d | y1d x1d */
    t5  = _mm256_loadu_pd(ptrA+4);                     /* y3a x3a | z2a y2a */
    t6  = _mm256_loadu_pd(ptrB+4);                     /* y3b x3b | z2b y2b */
    t7  = _mm256_loadu_pd(ptrC+4);                     /* y3c x3c | z2c y2c */
    t8  = _mm256_loadu_pd(ptrD+4);                     /* y3d x3d | z2d y2d */
    t9  = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8)); /*  -   -  |  -  z3a */
    t10 = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8)); /*  -   -  |  -  z3b */
    t11 = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8)); /*  -   -  |  -  z3c */
    t12 = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8)); /*  -   -  |  -  z3d */

    t13 = _mm256_unpacklo_pd(t1, t2);                  /* z1b z1a | x1b x1a */
    t14 = _mm256_unpackhi_pd(t1, t2);                  /* x2b x2a | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4);                  /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4);                  /* x2d x2c | y1d y1c */

    t3  = _mm256_unpacklo_pd(t5, t6);                  /* x3b x3a | y2b y2a */
    t4  = _mm256_unpackhi_pd(t5, t6);                  /* y3b y3a | z2b z2a */
    t5  = _mm256_unpacklo_pd(t7, t8);                  /* x3d x3c | y2d y2c */
    t6  = _mm256_unpackhi_pd(t7, t8);                  /* y3d y3c | z2d z2c */

    t9  = _mm256_unpacklo_pd(t9, t10);                 /*  -   -  | z3b z3a */
    t11 = _mm256_unpacklo_pd(t11, t12);                /*  -   -  | z3d z3c */

    *x1 = gmx_mm256_unpack128lo_pd(t13, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t14, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t13, t1);
    *x2 = gmx_mm256_unpack128hi_pd(t14, t2);
    *y2 = gmx_mm256_unpack128lo_pd(t3, t5);
    *z2 = gmx_mm256_unpack128lo_pd(t4, t6);
    *x3 = gmx_mm256_unpack128hi_pd(t3, t5);
    *y3 = gmx_mm256_unpack128hi_pd(t4, t6);
    *z3 = gmx_mm256_unpack128lo_pd(t9, t11);
}

static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;

    t1  = _mm256_loadu_pd(ptrA);        /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);        /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);        /* x2c z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);        /* x2d z1d | y1d x1d */
    t5  = _mm256_loadu_pd(ptrA+4);      /* y3a x3a | z2a y2a */
    t6  = _mm256_loadu_pd(ptrB+4);      /* y3b x3b | z2b y2b */
    t7  = _mm256_loadu_pd(ptrC+4);      /* y3c x3c | z2c y2c */
    t8  = _mm256_loadu_pd(ptrD+4);      /* y3d x3d | z2d y2d */
    t9  = _mm256_loadu_pd(ptrA+8);      /* z4a y4a | x4a z3a */
    t10 = _mm256_loadu_pd(ptrB+8);      /* z4b y4b | x4b z3b */
    t11 = _mm256_loadu_pd(ptrC+8);      /* z4c y4c | x4c z3c */
    t12 = _mm256_loadu_pd(ptrD+8);      /* z4d y4d | x4d z3d */

    t13 = _mm256_unpacklo_pd(t1, t2);   /* z1b z1a | x1b x1a */
    t14 = _mm256_unpackhi_pd(t1, t2);   /* x2b x2a | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4);   /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4);   /* x2d x2c | y1d y1c */

    t3  = _mm256_unpacklo_pd(t5, t6);   /* x3b x3a | y2b y2a */
    t4  = _mm256_unpackhi_pd(t5, t6);   /* y3b y3a | z2b z2a */
    t5  = _mm256_unpacklo_pd(t7, t8);   /* x3d x3c | y2d y2c */
    t6  = _mm256_unpackhi_pd(t7, t8);   /* y3d y3c | z2d z2c */

    t7  = _mm256_unpacklo_pd(t9, t10);  /* y4b y4a | z3b z3a */
    t8  = _mm256_unpackhi_pd(t9, t10);  /* z4b z4a | x4b x4a */
    t9  = _mm256_unpacklo_pd(t11, t12); /* y4d y4c | z3d z3c */
    t10 = _mm256_unpackhi_pd(t11, t12); /* z4d z4c | x4d x4c */

    *x1 = gmx_mm256_unpack128lo_pd(t13, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t14, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t13, t1);
    *x2 = gmx_mm256_unpack128hi_pd(t14, t2);
    *y2 = gmx_mm256_unpack128lo_pd(t3, t5);
    *z2 = gmx_mm256_unpack128lo_pd(t4, t6);
    *x3 = gmx_mm256_unpack128hi_pd(t3, t5);
    *y3 = gmx_mm256_unpack128hi_pd(t4, t6);
    *z3 = gmx_mm256_unpack128lo_pd(t7, t9);
    *x4 = gmx_mm256_unpack128lo_pd(t8, t10);
    *y4 = gmx_mm256_unpack128hi_pd(t7, t9);
    *z4 = gmx_mm256_unpack128hi_pd(t8, t10);
}

static gmx_inline void gmx_simdcall
gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1)
{
    __m256d t1, t2, tA, tB, tC, tD;
    __m256i mask;

    t1 = _mm256_unpacklo_pd(x1, y1);       /* y1c x1c | y1a x1a */
    t2 = _mm256_unpackhi_pd(x1, y1);       /* y1d x1d | y1b x1b */
    x1 = gmx_mm256_unpack128lo_pd(t1, z1); /*  -  z1a | y1a x1a */
    y1 = gmx_mm256_unpack128hi_pd(t1, z1); /*  -  z1c | y1c x1c */
    z1 = _mm256_permute_pd(z1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    t1 = gmx_mm256_unpack128lo_pd(t2, z1); /*  -  z1b | y1b x1b */
    z1 = gmx_mm256_unpack128hi_pd(t2, z1); /*  -  z1d | y1d x1d */

    /* Construct a mask without executing any data loads */
    mask = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
                                               _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ), 0x7));
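
    /* The compare of zero with itself yields all-ones in every element; blending with 0x7
     * keeps those ones in the three lowest doubles and zero in the highest, so the
     * maskstores below update only the x, y and z slots and leave the fourth double
     * (the next atom's x) untouched. */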

    tA = _mm256_loadu_pd(ptrA);
    tB = _mm256_loadu_pd(ptrB);
    tC = _mm256_loadu_pd(ptrC);
    tD = _mm256_loadu_pd(ptrD);

    tA = _mm256_sub_pd(tA, x1);
    tB = _mm256_sub_pd(tB, t1);
    tC = _mm256_sub_pd(tC, y1);
    tD = _mm256_sub_pd(tD, z1);

    _mm256_maskstore_pd(ptrA, mask, tA);
    _mm256_maskstore_pd(ptrB, mask, tB);
    _mm256_maskstore_pd(ptrC, mask, tC);
    _mm256_maskstore_pd(ptrD, mask, tD);
}

static gmx_inline void gmx_simdcall
gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_loadu_pd(ptrB);
    t3 = _mm256_loadu_pd(ptrC);
    t4 = _mm256_loadu_pd(ptrD);
    t5 = _mm256_loadu_pd(ptrA+4);
    t6 = _mm256_loadu_pd(ptrB+4);
    t7 = _mm256_loadu_pd(ptrC+4);
    t8 = _mm256_loadu_pd(ptrD+4);
    tA = _mm_load_sd(ptrA+8);
    tB = _mm_load_sd(ptrB+8);
    tC = _mm_load_sd(ptrC+8);
    tD = _mm_load_sd(ptrD+8);

    t9 = _mm256_unpacklo_pd(x1, y1);        /* y1c x1c | y1a x1a */
    x1 = _mm256_unpackhi_pd(x1, y1);        /* y1d x1d | y1b x1b */

    y1 = _mm256_unpacklo_pd(z1, x2);        /* x2c z1c | x2a z1a */
    z1 = _mm256_unpackhi_pd(z1, x2);        /* x2d z1d | x2b z1b */

    x2 = _mm256_unpacklo_pd(y2, z2);        /* z2c y2c | z2a y2a */
    y2 = _mm256_unpackhi_pd(y2, z2);        /* z2d y2d | z2b y2b */

    z2 = _mm256_unpacklo_pd(x3, y3);        /* y3c x3c | y3a x3a */
    x3 = _mm256_unpackhi_pd(x3, y3);        /* y3d x3d | y3b x3b */

    t10 = gmx_mm256_unpack128lo_pd(t9, y1); /* x2a z1a | y1a x1a */
    y3  = gmx_mm256_unpack128hi_pd(t9, y1); /* x2c z1c | y1c x1c */

    t9  = gmx_mm256_unpack128lo_pd(x1, z1); /* x2b z1b | y1b x1b */
    y1  = gmx_mm256_unpack128hi_pd(x1, z1); /* x2d z1d | y1d x1d */

    x1  = gmx_mm256_unpack128lo_pd(x2, z2); /* y3a x3a | z2a y2a */
    z1  = gmx_mm256_unpack128hi_pd(x2, z2); /* y3c x3c | z2c y2c */

    x2  = gmx_mm256_unpack128lo_pd(y2, x3); /* y3b x3b | z2b y2b */
    z2  = gmx_mm256_unpack128hi_pd(y2, x3); /* y3d x3d | z2d y2d */

    t1 = _mm256_sub_pd(t1, t10);
    t2 = _mm256_sub_pd(t2, t9);
    t3 = _mm256_sub_pd(t3, y3);
    t4 = _mm256_sub_pd(t4, y1);
    t5 = _mm256_sub_pd(t5, x1);
    t6 = _mm256_sub_pd(t6, x2);
    t7 = _mm256_sub_pd(t7, z1);
    t8 = _mm256_sub_pd(t8, z2);

    tA = _mm_sub_sd(tA, _mm256_castpd256_pd128(z3));
    tB = _mm_sub_sd(tB, _mm_permute_pd(_mm256_castpd256_pd128(z3), _GMX_MM_PERMUTE128D(1, 1)));
    tE = _mm256_extractf128_pd(z3, 0x1);
    tC = _mm_sub_sd(tC, tE);
    tD = _mm_sub_sd(tD, _mm_permute_pd(tE, _GMX_MM_PERMUTE128D(1, 1)));

    /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
    _mm256_storeu_pd(ptrA, t1);
    _mm256_storeu_pd(ptrB, t2);
    _mm256_storeu_pd(ptrC, t3);
    _mm256_storeu_pd(ptrD, t4);
    _mm256_storeu_pd(ptrA+4, t5);
    _mm256_storeu_pd(ptrB+4, t6);
    _mm256_storeu_pd(ptrC+4, t7);
    _mm256_storeu_pd(ptrD+4, t8);
    _mm_store_sd(ptrA+8, tA);
    _mm_store_sd(ptrB+8, tB);
    _mm_store_sd(ptrC+8, tC);
    _mm_store_sd(ptrD+8, tD);
}

static gmx_inline void gmx_simdcall
gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3,
                                          __m256d x4, __m256d y4, __m256d z4)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
    __m128d tA, tB, tC, tD, tE;

    t1  = _mm256_loadu_pd(ptrA);
    t2  = _mm256_loadu_pd(ptrB);
    t3  = _mm256_loadu_pd(ptrC);
    t4  = _mm256_loadu_pd(ptrD);
    t5  = _mm256_loadu_pd(ptrA+4);
    t6  = _mm256_loadu_pd(ptrB+4);
    t7  = _mm256_loadu_pd(ptrC+4);
    t8  = _mm256_loadu_pd(ptrD+4);
    t9  = _mm256_loadu_pd(ptrA+8);
    t10 = _mm256_loadu_pd(ptrB+8);
    t11 = _mm256_loadu_pd(ptrC+8);
    t12 = _mm256_loadu_pd(ptrD+8);

    t13 = _mm256_unpacklo_pd(x1, y1);        /* y1c x1c | y1a x1a */
    x1  = _mm256_unpackhi_pd(x1, y1);        /* y1d x1d | y1b x1b */
    y1  = _mm256_unpacklo_pd(z1, x2);        /* x2c z1c | x2a z1a */
    z1  = _mm256_unpackhi_pd(z1, x2);        /* x2d z1d | x2b z1b */
    x2  = _mm256_unpacklo_pd(y2, z2);        /* z2c y2c | z2a y2a */
    y2  = _mm256_unpackhi_pd(y2, z2);        /* z2d y2d | z2b y2b */
    z2  = _mm256_unpacklo_pd(x3, y3);        /* y3c x3c | y3a x3a */
    x3  = _mm256_unpackhi_pd(x3, y3);        /* y3d x3d | y3b x3b */
    y3  = _mm256_unpacklo_pd(z3, x4);        /* x4c z3c | x4a z3a */
    z3  = _mm256_unpackhi_pd(z3, x4);        /* x4d z3d | x4b z3b */
    x4  = _mm256_unpacklo_pd(y4, z4);        /* z4c y4c | z4a y4a */
    y4  = _mm256_unpackhi_pd(y4, z4);        /* z4d y4d | z4b y4b */

    z4  = gmx_mm256_unpack128lo_pd(t13, y1); /* x2a z1a | y1a x1a */
    t13 = gmx_mm256_unpack128hi_pd(t13, y1); /* x2c z1c | y1c x1c */
    y1  = gmx_mm256_unpack128lo_pd(x1, z1);  /* x2b z1b | y1b x1b */
    x1  = gmx_mm256_unpack128hi_pd(x1, z1);  /* x2d z1d | y1d x1d */
    z1  = gmx_mm256_unpack128lo_pd(x2, z2);  /* y3a x3a | z2a y2a */
    x2  = gmx_mm256_unpack128hi_pd(x2, z2);  /* y3c x3c | z2c y2c */
    z2  = gmx_mm256_unpack128lo_pd(y2, x3);  /* y3b x3b | z2b y2b */
    y2  = gmx_mm256_unpack128hi_pd(y2, x3);  /* y3d x3d | z2d y2d */
    x3  = gmx_mm256_unpack128lo_pd(y3, x4);  /* z4a y4a | x4a z3a */
    y3  = gmx_mm256_unpack128hi_pd(y3, x4);  /* z4c y4c | x4c z3c */
    x4  = gmx_mm256_unpack128lo_pd(z3, y4);  /* z4b y4b | x4b z3b */
    z3  = gmx_mm256_unpack128hi_pd(z3, y4);  /* z4d y4d | x4d z3d */

    t1  = _mm256_sub_pd(t1, z4);
    t2  = _mm256_sub_pd(t2, y1);
    t3  = _mm256_sub_pd(t3, t13);
    t4  = _mm256_sub_pd(t4, x1);
    t5  = _mm256_sub_pd(t5, z1);
    t6  = _mm256_sub_pd(t6, z2);
    t7  = _mm256_sub_pd(t7, x2);
    t8  = _mm256_sub_pd(t8, y2);
    t9  = _mm256_sub_pd(t9, x3);
    t10 = _mm256_sub_pd(t10, x4);
    t11 = _mm256_sub_pd(t11, y3);
    t12 = _mm256_sub_pd(t12, z3);

    /* Here we can store full 256-bit values throughout: 4 rvecs are exactly 12 doubles per pointer */
    _mm256_storeu_pd(ptrA, t1);
    _mm256_storeu_pd(ptrB, t2);
    _mm256_storeu_pd(ptrC, t3);
    _mm256_storeu_pd(ptrD, t4);
    _mm256_storeu_pd(ptrA+4, t5);
    _mm256_storeu_pd(ptrB+4, t6);
    _mm256_storeu_pd(ptrC+4, t7);
    _mm256_storeu_pd(ptrD+4, t8);
    _mm256_storeu_pd(ptrA+8, t9);
    _mm256_storeu_pd(ptrB+8, t10);
    _mm256_storeu_pd(ptrC+8, t11);
    _mm256_storeu_pd(ptrD+8, t12);
}

static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2;
    __m128d tA, tB;

    fix1 = _mm256_hadd_pd(fix1, fiy1);
    fiz1 = _mm256_hadd_pd(fiz1, _mm256_setzero_pd());

    /* Add across the two lanes */
    tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1), _mm256_extractf128_pd(fix1, 0x1));
    tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1), _mm256_extractf128_pd(fiz1, 0x1));

    fix1 = gmx_mm256_set_m128d(tB, tA); /* 0 fiz fiy fix */

    t1   = _mm256_loadu_pd(fptr);
    t2   = _mm256_loadu_pd(fshiftptr);

    t1   = _mm256_add_pd(t1, fix1);
    t2   = _mm256_add_pd(t2, fix1);

    _mm256_storeu_pd(fptr, t1);
    _mm256_storeu_pd(fshiftptr, t2);
}
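
/* In the update_iforce_*atom_swizzle helpers each fi* register holds the force
 * contributions from the four j atoms for one i-atom component. _mm256_hadd_pd sums
 * adjacent pairs within each 128-bit lane, the two lane halves are then added together,
 * and the totals are accumulated into both the force array (fptr) and the shift-force
 * array (fshiftptr). */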

static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2, t3, t4;
    __m128d tz3, tA, tB, tC, tD;

    fix1 = _mm256_hadd_pd(fix1, fiy1);                /* Y1c-d X1c-d | Y1a-b X1a-b */
    fiz1 = _mm256_hadd_pd(fiz1, fix2);                /* X2c-d Z1c-d | X2a-b Z1a-b */
    fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /* Z2c-d Y2c-d | Z2a-b Y2a-b */
    fix3 = _mm256_hadd_pd(fix3, fiy3);                /* Y3c-d X3c-d | Y3a-b X3a-b */
    fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); /*  0    Z3c-d |  0    Z3a-b */

    /* Add across the two lanes by swapping and adding back */
    t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1);      /* X2a-b Z1a-b | Y1a-b X1a-b */
    t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1);      /* X2c-d Z1c-d | Y1c-d X1c-d */
    t1   = _mm256_add_pd(t1, t2);                     /* x2 z1 | y1 x1 */

    t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3);      /* Y3a-b X3a-b | Z2a-b Y2a-b */
    t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3);      /* Y3c-d X3c-d | Z2c-d Y2c-d */
    t3   = _mm256_add_pd(t3, t4);                     /* y3 x3 | z2 y2 */

    tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); /* 0 z3 */

    t2   = _mm256_loadu_pd(fptr);
    t4   = _mm256_loadu_pd(fptr+4);
    tA   = _mm_load_sd(fptr+8);

    t2   = _mm256_add_pd(t2, t1);
    t4   = _mm256_add_pd(t4, t3);
    tA   = _mm_add_sd(tA, tz3);

    _mm256_storeu_pd(fptr, t2);
    _mm256_storeu_pd(fptr+4, t4);
    _mm_store_sd(fptr+8, tA);

    /* Add up shift force */
    /* t1: x2 z1 | y1 x1 */
    /* t3: y3 x3 | z2 y2 */
    tB  = _mm256_extractf128_pd(t1, 0x1);                                     /* x2 z1 */
    tC  = _mm256_extractf128_pd(t3, 0x1);                                     /* y3 x3 */
    tz3 = _mm_add_sd(tz3, tB);                                                /* 0 z1+z3 */
    tD  = _mm_permute_pd(_mm256_castpd256_pd128(t3), _GMX_MM_PERMUTE128D(1, 1));
    tz3 = _mm_add_sd(tz3, tD);                                                /* - z */

    tC  = _mm_add_pd(tC, _mm256_castpd256_pd128(t1));                         /* y1+y3 x1+x3 */

    tD  = _mm_shuffle_pd(tB, _mm256_castpd256_pd128(t3), _MM_SHUFFLE2(0, 1)); /* y2 x2 */
    tC  = _mm_add_pd(tC, tD);                                                 /* y x */

    tA  = _mm_loadu_pd(fshiftptr);
    tB  = _mm_load_sd(fshiftptr+2);
    tA  = _mm_add_pd(tA, tC);
    tB  = _mm_add_sd(tB, tz3);
    _mm_storeu_pd(fshiftptr, tA);
    _mm_store_sd(fshiftptr+2, tB);
}

static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
                                         __m256d fix4, __m256d fiy4, __m256d fiz4,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2, t3, t4, t5, t6;
    __m128d tA, tB, tC, tD;

    fix1 = _mm256_hadd_pd(fix1, fiy1); /* Y1c-d X1c-d | Y1a-b X1a-b */
    fiz1 = _mm256_hadd_pd(fiz1, fix2); /* X2c-d Z1c-d | X2a-b Z1a-b */
    fiy2 = _mm256_hadd_pd(fiy2, fiz2); /* Z2c-d Y2c-d | Z2a-b Y2a-b */
    fix3 = _mm256_hadd_pd(fix3, fiy3); /* Y3c-d X3c-d | Y3a-b X3a-b */
    fiz3 = _mm256_hadd_pd(fiz3, fix4); /* X4c-d Z3c-d | X4a-b Z3a-b */
    fiy4 = _mm256_hadd_pd(fiy4, fiz4); /* Z4c-d Y4c-d | Z4a-b Y4a-b */

    /* Add across the two lanes by swapping and adding back */
    t1 = gmx_mm256_unpack128lo_pd(fix1, fiz1); /* X2a-b Z1a-b | Y1a-b X1a-b */
    t2 = gmx_mm256_unpack128hi_pd(fix1, fiz1); /* X2c-d Z1c-d | Y1c-d X1c-d */
    t1 = _mm256_add_pd(t1, t2);                /* x2 z1 | y1 x1 */

    t3 = gmx_mm256_unpack128lo_pd(fiy2, fix3); /* Y3a-b X3a-b | Z2a-b Y2a-b */
    t4 = gmx_mm256_unpack128hi_pd(fiy2, fix3); /* Y3c-d X3c-d | Z2c-d Y2c-d */
    t3 = _mm256_add_pd(t3, t4);                /* y3 x3 | z2 y2 */

    t5 = gmx_mm256_unpack128lo_pd(fiz3, fiy4); /* Z4a-b Y4a-b | X4a-b Z3a-b */
    t6 = gmx_mm256_unpack128hi_pd(fiz3, fiy4); /* Z4c-d Y4c-d | X4c-d Z3c-d */
    t5 = _mm256_add_pd(t5, t6);                /* z4 y4 | x4 z3 */

    t2 = _mm256_loadu_pd(fptr);
    t4 = _mm256_loadu_pd(fptr+4);
    t6 = _mm256_loadu_pd(fptr+8);

    t2 = _mm256_add_pd(t2, t1);
    t4 = _mm256_add_pd(t4, t3);
    t6 = _mm256_add_pd(t6, t5);

    _mm256_storeu_pd(fptr, t2);
    _mm256_storeu_pd(fptr+4, t4);
    _mm256_storeu_pd(fptr+8, t6);

    /* Add up shift force */
    /* t1: x2 z1 | y1 x1 */
    /* t3: y3 x3 | z2 y2 */
    /* t5: z4 y4 | x4 z3 */
    tA = _mm256_extractf128_pd(t1, 0x1);                /* x2 z1 */
    tB = _mm256_extractf128_pd(t3, 0x1);                /* y3 x3 */
    tC = _mm256_extractf128_pd(t5, 0x1);                /* z4 y4 */

    tB = _mm_add_pd(tB, _mm256_castpd256_pd128(t1));    /* y1+y3 x1+x3 */
    tA = _mm_add_pd(tA, _mm256_castpd256_pd128(t5));    /* x2+x4 z1+z3 */
    tC = _mm_add_pd(tC, _mm256_castpd256_pd128(t3));    /* z4+z2 y4+y2 */

    tD = _mm_shuffle_pd(tA, tC, _MM_SHUFFLE2(0, 1));    /* y4+y2 x2+x4 */
    tB = _mm_add_pd(tB, tD);                            /* y x */
    tC = _mm_permute_pd(tC, _GMX_MM_PERMUTE128D(1, 1)); /* - z4+z2 */
    tC = _mm_add_sd(tC, tA);                            /* - z */

    tA = _mm_loadu_pd(fshiftptr);
    tD = _mm_load_sd(fshiftptr+2);
    tA = _mm_add_pd(tA, tB);
    tD = _mm_add_sd(tD, tC);
    _mm_storeu_pd(fshiftptr, tA);
    _mm_store_sd(fshiftptr+2, tD);
}

static gmx_inline void gmx_simdcall
gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
{
    __m128d t1;

    pot1 = _mm256_hadd_pd(pot1, pot1);

    t1 = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));

    _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
}
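
/* After _mm256_hadd_pd(pot1, pot1) the register holds { a+b, a+b, c+d, c+d }; adding the
 * low and high 128-bit halves therefore leaves the full sum a+b+c+d in the lowest element,
 * which is then accumulated into *ptrA. The two-potential variant below uses the same
 * reduction with pot2 in the other hadd slot, so element 0 holds the pot1 sum and
 * element 1 the pot2 sum. */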

static gmx_inline void gmx_simdcall
gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
                         __m256d pot2, double * gmx_restrict ptrB)
{
    __m128d t1, t2;

    pot1 = _mm256_hadd_pd(pot1, pot2);

    t1 = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));

    t2 = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
    _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
    _mm_store_sd(ptrB, _mm_add_sd(_mm_load_sd(ptrB), t2));
}

#endif /* _kernelutil_x86_avx_256_double_h_ */