/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_256_double_h_
#define _kernelutil_x86_avx_256_double_h_

#include <immintrin.h> /* AVX intrinsics; this may also be pulled in indirectly by other GROMACS headers */
#define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)

#define _GMX_MM_BLEND256D(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
#define _GMX_MM_PERMUTE(fp3, fp2, fp1, fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define _GMX_MM_PERMUTE128D(fp1, fp0) (((fp1) << 1) | ((fp0)))
#define _GMX_MM_PERMUTE256D(fp3, fp2, fp1, fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
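
/* These helpers assemble the 8-bit immediates expected by the SSE/AVX blend,
 * permute and shuffle intrinsics: the rightmost argument controls the lowest
 * element, and each argument contributes one bit (two bits for _GMX_MM_PERMUTE).
 * For example, _GMX_MM_PERMUTE256D(0, 1, 0, 1) evaluates to binary 0101, which
 * with _mm256_permute_pd swaps the two doubles within each 128-bit lane.
 */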
#define GMX_MM256_FULLTRANSPOSE4_PD(row0, row1, row2, row3)  \
    {                                                        \
        __m256d _t0, _t1, _t2, _t3;                          \
        _t0  = _mm256_unpacklo_pd((row0), (row1));           \
        _t1  = _mm256_unpackhi_pd((row0), (row1));           \
        _t2  = _mm256_unpacklo_pd((row2), (row3));           \
        _t3  = _mm256_unpackhi_pd((row2), (row3));           \
        row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20);       \
        row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20);       \
        row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31);       \
        row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31);       \
    }
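
/* Illustrative use of the 4x4 transpose macro (a sketch, not part of the kernels;
 * the names r0..r3 and m are hypothetical):
 *
 *     __m256d r0 = _mm256_loadu_pd(m);      // m points at a 4x4 row-major matrix of doubles
 *     __m256d r1 = _mm256_loadu_pd(m + 4);
 *     __m256d r2 = _mm256_loadu_pd(m + 8);
 *     __m256d r3 = _mm256_loadu_pd(m + 12);
 *     GMX_MM256_FULLTRANSPOSE4_PD(r0, r1, r2, r3);
 *     // r0 now holds column 0 of the original matrix, r1 column 1, and so on.
 */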
#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
static gmx_inline __m256d gmx_simdcall
gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
{
    return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
}
static gmx_inline __m256d gmx_simdcall
gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
{
    return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
}
static gmx_inline __m256d gmx_simdcall
gmx_mm256_set_m128d(__m128d hi, __m128d lo)
{
    return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
}
static gmx_inline __m256 gmx_simdcall
gmx_mm256_set_m128(__m128 hi, __m128 lo)
{
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
}
static gmx_inline int gmx_simdcall
gmx_mm256_any_lt(__m256d a, __m256d b)
{
    return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
}
static gmx_inline __m256d gmx_simdcall
gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
{
    return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
}
/* Normal sum of four ymm registers */
#define gmx_mm256_sum4_pd(t0, t1, t2, t3) _mm256_add_pd(_mm256_add_pd(t0, t1), _mm256_add_pd(t2, t3))
/* Load a single value from 1-4 places, merge into xmm register */

static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
}
static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                                const double * gmx_restrict ptrB)
{
    __m128d tA, tB;

    tA = _mm_load_sd(ptrA);
    tB = _mm_load_sd(ptrB);

    return _mm256_castpd128_pd256(_mm_unpacklo_pd(tA, tB));
}
static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
{
    __m128d t1, t2;

    t1 = _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
    t2 = _mm_unpacklo_pd(_mm_load_sd(ptrC), _mm_load_sd(ptrD));
    return gmx_mm256_set_m128d(t2, t1);
}
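
/* The load_*real_swizzle functions above gather one scalar per pointer into
 * consecutive elements of a ymm register (ptrA -> element 0, ..., ptrD ->
 * element 3). Elements without a corresponding pointer are left undefined,
 * which is acceptable as long as callers only consume the populated lanes.
 */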
static gmx_inline void gmx_simdcall
gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
}
static gmx_inline void gmx_simdcall
gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
    __m256d t2;

    t2 = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
    _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
}
static gmx_inline void gmx_simdcall
gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                 double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
    __m256d t2;
    __m128d t3, t4;

    t2 = _mm256_permute_pd(xmm1, _GMX_MM_PERMUTE256D(1, 1, 1, 1));
    t3 = _mm256_extractf128_pd(xmm1, 0x1);
    t4 = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));
    _mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
    _mm_store_sd(ptrB, _mm256_castpd256_pd128(t2));
    _mm_store_sd(ptrC, t3);
    _mm_store_sd(ptrD, t4);
}
static gmx_inline void gmx_simdcall
gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
    __m128d t1;

    t1 = _mm256_castpd256_pd128(xmm1);
    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));
    _mm_store_sd(ptrA, t1);
}
static gmx_inline void gmx_simdcall
gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
    __m128d t1, t2;

    t1 = _mm256_castpd256_pd128(xmm1);
    t2 = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));

    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));
    t2 = _mm_add_sd(t2, _mm_load_sd(ptrB));

    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrB, t2);
}
static gmx_inline void gmx_simdcall
gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                     double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
    __m128d t1, t2, t3, t4;

    t1 = _mm256_castpd256_pd128(xmm1);
    t2 = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));
    t3 = _mm256_extractf128_pd(xmm1, 0x1);
    t4 = _mm_permute_pd(t3, _GMX_MM_PERMUTE128D(1, 1));

    t1 = _mm_add_sd(t1, _mm_load_sd(ptrA));
    t2 = _mm_add_sd(t2, _mm_load_sd(ptrB));
    t3 = _mm_add_sd(t3, _mm_load_sd(ptrC));
    t4 = _mm_add_sd(t4, _mm_load_sd(ptrD));

    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrB, t2);
    _mm_store_sd(ptrC, t3);
    _mm_store_sd(ptrD, t4);
}
static gmx_inline void gmx_simdcall
gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
{
    *c6  = _mm256_castpd128_pd256(_mm_load_sd(p1));
    *c12 = _mm256_castpd128_pd256(_mm_load_sd(p1+1));
}
static gmx_inline void gmx_simdcall
gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
{
    __m128d t1, t2;

    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm256_castpd128_pd256(_mm_unpacklo_pd(t1, t2));
    *c12 = _mm256_castpd128_pd256(_mm_unpackhi_pd(t1, t2));
}
static gmx_inline void gmx_simdcall
gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
                                const double * gmx_restrict p3, const double * gmx_restrict p4,
                                __m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
{
    __m256d t1, t2;

    t1   = gmx_mm256_set_m128d(_mm_loadu_pd(p3), _mm_loadu_pd(p1)); /* c12c c6c | c12a c6a */
    t2   = gmx_mm256_set_m128d(_mm_loadu_pd(p4), _mm_loadu_pd(p2)); /* c12d c6d | c12b c6b */

    *c6  = _mm256_unpacklo_pd(t1, t2);                              /* c6d  c6c  | c6b  c6a  */
    *c12 = _mm256_unpackhi_pd(t1, t2);                              /* c12d c12c | c12b c12a */
}
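
/* The load_*pair functions above read Lennard-Jones parameters stored as
 * consecutive (c6, c12) pairs: p1[0] is c6 and p1[1] is c12 for the first
 * pair, and so on. The swizzled results place the c6 values of pairs a-d in
 * elements 0-3 of *c6, and the c12 values likewise in *c12.
 */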
static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1,
                                            __m256d * gmx_restrict y1,
                                            __m256d * gmx_restrict z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz, tx, ty, tz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    tx  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));

    *x1 = gmx_mm256_set_m128d(tx, tx);
    *y1 = gmx_mm256_set_m128d(ty, ty);
    *z1 = gmx_mm256_set_m128d(tz, tz);
}
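
/* The load_shift_and_*rvec_broadcast family (this function and the two below)
 * reads the coordinates of an i-particle group, stored as consecutive x,y,z
 * triplets, adds the periodic shift vector from xyz_shift, and broadcasts each
 * resulting component to all four elements of a ymm register so that it can
 * interact with four j-particles at once.
 */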
static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz, tx, ty, tz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    tx  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x1 = gmx_mm256_set_m128d(tx, tx);
    *y1 = gmx_mm256_set_m128d(ty, ty);
    *z1 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x2 = gmx_mm256_set_m128d(tx, tx);
    *y2 = gmx_mm256_set_m128d(ty, ty);
    *z2 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x3 = gmx_mm256_set_m128d(tx, tx);
    *y3 = gmx_mm256_set_m128d(ty, ty);
    *z3 = gmx_mm256_set_m128d(tz, tz);
}
static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                            const double * gmx_restrict xyz,
                                            __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                            __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                            __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                            __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz, tx, ty, tz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    tx  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x1 = gmx_mm256_set_m128d(tx, tx);
    *y1 = gmx_mm256_set_m128d(ty, ty);
    *z1 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x2 = gmx_mm256_set_m128d(tx, tx);
    *y2 = gmx_mm256_set_m128d(ty, ty);
    *z2 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    ty  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    tz  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x3 = gmx_mm256_set_m128d(tx, tx);
    *y3 = gmx_mm256_set_m128d(ty, ty);
    *z3 = gmx_mm256_set_m128d(tz, tz);
    tx  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    ty  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    tz  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
    *x4 = gmx_mm256_set_m128d(tx, tx);
    *y4 = gmx_mm256_set_m128d(ty, ty);
    *z4 = gmx_mm256_set_m128d(tz, tz);
}
static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
{
    __m256d t1;

    t1 = _mm256_loadu_pd(p1);

    *x = t1;
    *y = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
}
static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m256d t1, t2, t3, t4;

    t1  = _mm256_loadu_pd(p1);
    t3  = _mm256_loadu_pd(p1+4);
    /* Components already sitting in the lowest element of a register */
    *x1 = t1;
    *y2 = t3;
    t2  = gmx_mm256_unpack128hi_pd(t1, t1);
    t4  = gmx_mm256_unpack128hi_pd(t3, t3);
    *z1 = t2;
    *x3 = t4;
    /* Remaining components need a swap of the two doubles within each lane */
    *y1 = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z2 = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x2 = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *y3 = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z3 = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
}
static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m256d t1, t2, t3, t4, t5, t6;

    t1 = _mm256_loadu_pd(p1);
    t2 = _mm256_loadu_pd(p1+4);
    t3 = _mm256_loadu_pd(p1+8);

    t4 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t1, 0x1));
    t5 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t2, 0x1));
    t6 = _mm256_castpd128_pd256(_mm256_extractf128_pd(t3, 0x1));

    /* Components already sitting in the lowest element of a register */
    *x1 = t1;
    *y2 = t2;
    *z3 = t3;
    *z1 = t4;
    *x3 = t5;
    *y4 = t6;

    /* Remaining components need a swap of the two doubles within each lane */
    *y1 = _mm256_permute_pd(t1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z2 = _mm256_permute_pd(t2, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x4 = _mm256_permute_pd(t3, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *x2 = _mm256_permute_pd(t4, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *y3 = _mm256_permute_pd(t5, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    *z4 = _mm256_permute_pd(t6, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
}
static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
{
    __m256d t1, t2, t3, t4, t5, t6;

    t1  = _mm256_loadu_pd(ptrA);        /*  -  z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);        /*  -  z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);        /*  -  z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);        /*  -  z1d | y1d x1d */

    t5  = _mm256_unpacklo_pd(t1, t2);   /* z1b z1a | x1b x1a */
    t6  = _mm256_unpackhi_pd(t1, t2);   /*  -   -  | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4);   /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4);   /*  -   -  | y1d y1c */

    *x1 = gmx_mm256_unpack128lo_pd(t5, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t6, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t5, t1);
}
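
/* The load_*rvec_4ptr_swizzle functions transpose coordinates from four
 * array-of-structures pointers (consecutive x,y,z triplets per particle) into
 * structure-of-arrays ymm registers, e.g. *x1 = { x1a, x1b, x1c, x1d }.
 * Note that in this 1rvec variant the 256-bit loads read one double beyond
 * the z coordinate, so that memory is assumed to be readable.
 */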
static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;

    t1  = _mm256_loadu_pd(ptrA);                        /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);                        /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);                        /* x2c z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);                        /* x2d z1d | y1d x1d */
    t5  = _mm256_loadu_pd(ptrA+4);                      /* y3a x3a | z2a y2a */
    t6  = _mm256_loadu_pd(ptrB+4);                      /* y3b x3b | z2b y2b */
    t7  = _mm256_loadu_pd(ptrC+4);                      /* y3c x3c | z2c y2c */
    t8  = _mm256_loadu_pd(ptrD+4);                      /* y3d x3d | z2d y2d */
    t9  = _mm256_castpd128_pd256(_mm_load_sd(ptrA+8));  /*  -   -  |  -  z3a */
    t10 = _mm256_castpd128_pd256(_mm_load_sd(ptrB+8));  /*  -   -  |  -  z3b */
    t11 = _mm256_castpd128_pd256(_mm_load_sd(ptrC+8));  /*  -   -  |  -  z3c */
    t12 = _mm256_castpd128_pd256(_mm_load_sd(ptrD+8));  /*  -   -  |  -  z3d */

    t13 = _mm256_unpacklo_pd(t1, t2);                   /* z1b z1a | x1b x1a */
    t14 = _mm256_unpackhi_pd(t1, t2);                   /* x2b x2a | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4);                   /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4);                   /* x2d x2c | y1d y1c */

    t3  = _mm256_unpacklo_pd(t5, t6);                   /* x3b x3a | y2b y2a */
    t4  = _mm256_unpackhi_pd(t5, t6);                   /* y3b y3a | z2b z2a */
    t5  = _mm256_unpacklo_pd(t7, t8);                   /* x3d x3c | y2d y2c */
    t6  = _mm256_unpackhi_pd(t7, t8);                   /* y3d y3c | z2d z2c */

    t9  = _mm256_unpacklo_pd(t9, t10);                  /*  -   -  | z3b z3a */
    t11 = _mm256_unpacklo_pd(t11, t12);                 /*  -   -  | z3d z3c */

    *x1 = gmx_mm256_unpack128lo_pd(t13, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t14, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t13, t1);
    *x2 = gmx_mm256_unpack128hi_pd(t14, t2);
    *y2 = gmx_mm256_unpack128lo_pd(t3, t5);
    *z2 = gmx_mm256_unpack128lo_pd(t4, t6);
    *x3 = gmx_mm256_unpack128hi_pd(t3, t5);
    *y3 = gmx_mm256_unpack128hi_pd(t4, t6);
    *z3 = gmx_mm256_unpack128lo_pd(t9, t11);
}
static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                     const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
                                     __m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
                                     __m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
                                     __m256d * gmx_restrict x3, __m256d * gmx_restrict y3, __m256d * gmx_restrict z3,
                                     __m256d * gmx_restrict x4, __m256d * gmx_restrict y4, __m256d * gmx_restrict z4)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;

    t1  = _mm256_loadu_pd(ptrA);        /* x2a z1a | y1a x1a */
    t2  = _mm256_loadu_pd(ptrB);        /* x2b z1b | y1b x1b */
    t3  = _mm256_loadu_pd(ptrC);        /* x2c z1c | y1c x1c */
    t4  = _mm256_loadu_pd(ptrD);        /* x2d z1d | y1d x1d */
    t5  = _mm256_loadu_pd(ptrA+4);      /* y3a x3a | z2a y2a */
    t6  = _mm256_loadu_pd(ptrB+4);      /* y3b x3b | z2b y2b */
    t7  = _mm256_loadu_pd(ptrC+4);      /* y3c x3c | z2c y2c */
    t8  = _mm256_loadu_pd(ptrD+4);      /* y3d x3d | z2d y2d */
    t9  = _mm256_loadu_pd(ptrA+8);      /* z4a y4a | x4a z3a */
    t10 = _mm256_loadu_pd(ptrB+8);      /* z4b y4b | x4b z3b */
    t11 = _mm256_loadu_pd(ptrC+8);      /* z4c y4c | x4c z3c */
    t12 = _mm256_loadu_pd(ptrD+8);      /* z4d y4d | x4d z3d */

    t13 = _mm256_unpacklo_pd(t1, t2);   /* z1b z1a | x1b x1a */
    t14 = _mm256_unpackhi_pd(t1, t2);   /* x2b x2a | y1b y1a */
    t1  = _mm256_unpacklo_pd(t3, t4);   /* z1d z1c | x1d x1c */
    t2  = _mm256_unpackhi_pd(t3, t4);   /* x2d x2c | y1d y1c */

    t3  = _mm256_unpacklo_pd(t5, t6);   /* x3b x3a | y2b y2a */
    t4  = _mm256_unpackhi_pd(t5, t6);   /* y3b y3a | z2b z2a */
    t5  = _mm256_unpacklo_pd(t7, t8);   /* x3d x3c | y2d y2c */
    t6  = _mm256_unpackhi_pd(t7, t8);   /* y3d y3c | z2d z2c */

    t7  = _mm256_unpacklo_pd(t9, t10);  /* y4b y4a | z3b z3a */
    t8  = _mm256_unpackhi_pd(t9, t10);  /* z4b z4a | x4b x4a */
    t9  = _mm256_unpacklo_pd(t11, t12); /* y4d y4c | z3d z3c */
    t10 = _mm256_unpackhi_pd(t11, t12); /* z4d z4c | x4d x4c */

    *x1 = gmx_mm256_unpack128lo_pd(t13, t1);
    *y1 = gmx_mm256_unpack128lo_pd(t14, t2);
    *z1 = gmx_mm256_unpack128hi_pd(t13, t1);
    *x2 = gmx_mm256_unpack128hi_pd(t14, t2);
    *y2 = gmx_mm256_unpack128lo_pd(t3, t5);
    *z2 = gmx_mm256_unpack128lo_pd(t4, t6);
    *x3 = gmx_mm256_unpack128hi_pd(t3, t5);
    *y3 = gmx_mm256_unpack128hi_pd(t4, t6);
    *z3 = gmx_mm256_unpack128lo_pd(t7, t9);
    *x4 = gmx_mm256_unpack128lo_pd(t8, t10);
    *y4 = gmx_mm256_unpack128hi_pd(t7, t9);
    *z4 = gmx_mm256_unpack128hi_pd(t8, t10);
}
static gmx_inline void gmx_simdcall
gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1)
{
    __m256d t1, t2, tA, tB, tC, tD;
    __m256i mask;

    t1 = _mm256_unpacklo_pd(x1, y1);        /* y1c x1c | y1a x1a */
    t2 = _mm256_unpackhi_pd(x1, y1);        /* y1d x1d | y1b x1b */
    x1 = gmx_mm256_unpack128lo_pd(t1, z1);  /*  -  z1a | y1a x1a */
    y1 = gmx_mm256_unpack128hi_pd(t1, z1);  /*  -  z1c | y1c x1c */
    z1 = _mm256_permute_pd(z1, _GMX_MM_PERMUTE256D(0, 1, 0, 1));
    t1 = gmx_mm256_unpack128lo_pd(t2, z1);  /*  -  z1b | y1b x1b */
    z1 = gmx_mm256_unpack128hi_pd(t2, z1);  /*  -  z1d | y1d x1d */

    /* Construct a mask without executing any data loads */
    mask = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
                                               _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ), 0x7));

    tA = _mm256_loadu_pd(ptrA);
    tB = _mm256_loadu_pd(ptrB);
    tC = _mm256_loadu_pd(ptrC);
    tD = _mm256_loadu_pd(ptrD);

    tA = _mm256_sub_pd(tA, x1);
    tB = _mm256_sub_pd(tB, t1);
    tC = _mm256_sub_pd(tC, y1);
    tD = _mm256_sub_pd(tD, z1);

    _mm256_maskstore_pd(ptrA, mask, tA);
    _mm256_maskstore_pd(ptrB, mask, tB);
    _mm256_maskstore_pd(ptrC, mask, tC);
    _mm256_maskstore_pd(ptrD, mask, tD);
}
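
/* The decrement routines subtract the accumulated forces from the values
 * stored at ptrA..ptrD. In the 1rvec case the fourth element of each 256-bit
 * row does not belong to this rvec, so a maskstore with mask 0x7 (only the
 * low three elements enabled) writes back x, y and z without touching the
 * following double in memory.
 */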
static gmx_inline void gmx_simdcall
gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_loadu_pd(ptrB);
    t3 = _mm256_loadu_pd(ptrC);
    t4 = _mm256_loadu_pd(ptrD);
    t5 = _mm256_loadu_pd(ptrA+4);
    t6 = _mm256_loadu_pd(ptrB+4);
    t7 = _mm256_loadu_pd(ptrC+4);
    t8 = _mm256_loadu_pd(ptrD+4);
    tA = _mm_load_sd(ptrA+8);
    tB = _mm_load_sd(ptrB+8);
    tC = _mm_load_sd(ptrC+8);
    tD = _mm_load_sd(ptrD+8);

    t9 = _mm256_unpacklo_pd(x1, y1);          /* y1c x1c | y1a x1a */
    x1 = _mm256_unpackhi_pd(x1, y1);          /* y1d x1d | y1b x1b */

    y1 = _mm256_unpacklo_pd(z1, x2);          /* x2c z1c | x2a z1a */
    z1 = _mm256_unpackhi_pd(z1, x2);          /* x2d z1d | x2b z1b */

    x2 = _mm256_unpacklo_pd(y2, z2);          /* z2c y2c | z2a y2a */
    y2 = _mm256_unpackhi_pd(y2, z2);          /* z2d y2d | z2b y2b */

    z2 = _mm256_unpacklo_pd(x3, y3);          /* y3c x3c | y3a x3a */
    x3 = _mm256_unpackhi_pd(x3, y3);          /* y3d x3d | y3b x3b */

    t10 = gmx_mm256_unpack128lo_pd(t9, y1);   /* x2a z1a | y1a x1a */
    y3  = gmx_mm256_unpack128hi_pd(t9, y1);   /* x2c z1c | y1c x1c */

    t9  = gmx_mm256_unpack128lo_pd(x1, z1);   /* x2b z1b | y1b x1b */
    y1  = gmx_mm256_unpack128hi_pd(x1, z1);   /* x2d z1d | y1d x1d */

    x1  = gmx_mm256_unpack128lo_pd(x2, z2);   /* y3a x3a | z2a y2a */
    z1  = gmx_mm256_unpack128hi_pd(x2, z2);   /* y3c x3c | z2c y2c */

    x2  = gmx_mm256_unpack128lo_pd(y2, x3);   /* y3b x3b | z2b y2b */
    z2  = gmx_mm256_unpack128hi_pd(y2, x3);   /* y3d x3d | z2d y2d */

    t1 = _mm256_sub_pd(t1, t10);
    t2 = _mm256_sub_pd(t2, t9);
    t3 = _mm256_sub_pd(t3, y3);
    t4 = _mm256_sub_pd(t4, y1);
    t5 = _mm256_sub_pd(t5, x1);
    t6 = _mm256_sub_pd(t6, x2);
    t7 = _mm256_sub_pd(t7, z1);
    t8 = _mm256_sub_pd(t8, z2);

    tA = _mm_sub_sd(tA, _mm256_castpd256_pd128(z3));
    tB = _mm_sub_sd(tB, _mm_permute_pd(_mm256_castpd256_pd128(z3), _GMX_MM_PERMUTE128D(1, 1)));
    tE = _mm256_extractf128_pd(z3, 0x1);
    tC = _mm_sub_sd(tC, tE);
    tD = _mm_sub_sd(tD, _mm_permute_pd(tE, _GMX_MM_PERMUTE128D(1, 1)));

    /* Here we store a full 256-bit value and a separate 64-bit one; no overlap can happen */
    _mm256_storeu_pd(ptrA, t1);
    _mm256_storeu_pd(ptrB, t2);
    _mm256_storeu_pd(ptrC, t3);
    _mm256_storeu_pd(ptrD, t4);
    _mm256_storeu_pd(ptrA+4, t5);
    _mm256_storeu_pd(ptrB+4, t6);
    _mm256_storeu_pd(ptrC+4, t7);
    _mm256_storeu_pd(ptrD+4, t8);
    _mm_store_sd(ptrA+8, tA);
    _mm_store_sd(ptrB+8, tB);
    _mm_store_sd(ptrC+8, tC);
    _mm_store_sd(ptrD+8, tD);
}
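
/* For the 3rvec case the nine doubles per particle row do not fill three full
 * ymm registers, so the last component (z3) is handled with scalar loads,
 * subtractions and stores at offset 8 rather than with a masked 256-bit store.
 */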
static gmx_inline void gmx_simdcall
gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          double * gmx_restrict ptrC, double * gmx_restrict ptrD,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3,
                                          __m256d x4, __m256d y4, __m256d z4)
{
    __m256d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;

    t1  = _mm256_loadu_pd(ptrA);
    t2  = _mm256_loadu_pd(ptrB);
    t3  = _mm256_loadu_pd(ptrC);
    t4  = _mm256_loadu_pd(ptrD);
    t5  = _mm256_loadu_pd(ptrA+4);
    t6  = _mm256_loadu_pd(ptrB+4);
    t7  = _mm256_loadu_pd(ptrC+4);
    t8  = _mm256_loadu_pd(ptrD+4);
    t9  = _mm256_loadu_pd(ptrA+8);
    t10 = _mm256_loadu_pd(ptrB+8);
    t11 = _mm256_loadu_pd(ptrC+8);
    t12 = _mm256_loadu_pd(ptrD+8);

    t13 = _mm256_unpacklo_pd(x1, y1);          /* y1c x1c | y1a x1a */
    x1  = _mm256_unpackhi_pd(x1, y1);          /* y1d x1d | y1b x1b */
    y1  = _mm256_unpacklo_pd(z1, x2);          /* x2c z1c | x2a z1a */
    z1  = _mm256_unpackhi_pd(z1, x2);          /* x2d z1d | x2b z1b */
    x2  = _mm256_unpacklo_pd(y2, z2);          /* z2c y2c | z2a y2a */
    y2  = _mm256_unpackhi_pd(y2, z2);          /* z2d y2d | z2b y2b */
    z2  = _mm256_unpacklo_pd(x3, y3);          /* y3c x3c | y3a x3a */
    x3  = _mm256_unpackhi_pd(x3, y3);          /* y3d x3d | y3b x3b */
    y3  = _mm256_unpacklo_pd(z3, x4);          /* x4c z3c | x4a z3a */
    z3  = _mm256_unpackhi_pd(z3, x4);          /* x4d z3d | x4b z3b */
    x4  = _mm256_unpacklo_pd(y4, z4);          /* z4c y4c | z4a y4a */
    y4  = _mm256_unpackhi_pd(y4, z4);          /* z4d y4d | z4b y4b */

    z4  = gmx_mm256_unpack128lo_pd(t13, y1);   /* x2a z1a | y1a x1a */
    t13 = gmx_mm256_unpack128hi_pd(t13, y1);   /* x2c z1c | y1c x1c */
    y1  = gmx_mm256_unpack128lo_pd(x1, z1);    /* x2b z1b | y1b x1b */
    x1  = gmx_mm256_unpack128hi_pd(x1, z1);    /* x2d z1d | y1d x1d */
    z1  = gmx_mm256_unpack128lo_pd(x2, z2);    /* y3a x3a | z2a y2a */
    x2  = gmx_mm256_unpack128hi_pd(x2, z2);    /* y3c x3c | z2c y2c */
    z2  = gmx_mm256_unpack128lo_pd(y2, x3);    /* y3b x3b | z2b y2b */
    y2  = gmx_mm256_unpack128hi_pd(y2, x3);    /* y3d x3d | z2d y2d */
    x3  = gmx_mm256_unpack128lo_pd(y3, x4);    /* z4a y4a | x4a z3a */
    y3  = gmx_mm256_unpack128hi_pd(y3, x4);    /* z4c y4c | x4c z3c */
    x4  = gmx_mm256_unpack128lo_pd(z3, y4);    /* z4b y4b | x4b z3b */
    z3  = gmx_mm256_unpack128hi_pd(z3, y4);    /* z4d y4d | x4d z3d */

    t1  = _mm256_sub_pd(t1, z4);
    t2  = _mm256_sub_pd(t2, y1);
    t3  = _mm256_sub_pd(t3, t13);
    t4  = _mm256_sub_pd(t4, x1);
    t5  = _mm256_sub_pd(t5, z1);
    t6  = _mm256_sub_pd(t6, z2);
    t7  = _mm256_sub_pd(t7, x2);
    t8  = _mm256_sub_pd(t8, y2);
    t9  = _mm256_sub_pd(t9, x3);
    t10 = _mm256_sub_pd(t10, x4);
    t11 = _mm256_sub_pd(t11, y3);
    t12 = _mm256_sub_pd(t12, z3);

    /* Twelve doubles per particle row fill three full 256-bit values exactly,
     * so plain unaligned stores suffice and no masking is needed.
     */
    _mm256_storeu_pd(ptrA, t1);
    _mm256_storeu_pd(ptrB, t2);
    _mm256_storeu_pd(ptrC, t3);
    _mm256_storeu_pd(ptrD, t4);
    _mm256_storeu_pd(ptrA+4, t5);
    _mm256_storeu_pd(ptrB+4, t6);
    _mm256_storeu_pd(ptrC+4, t7);
    _mm256_storeu_pd(ptrD+4, t8);
    _mm256_storeu_pd(ptrA+8, t9);
    _mm256_storeu_pd(ptrB+8, t10);
    _mm256_storeu_pd(ptrC+8, t11);
    _mm256_storeu_pd(ptrD+8, t12);
}
static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2;
    __m128d tA, tB;

    fix1 = _mm256_hadd_pd(fix1, fiy1);
    fiz1 = _mm256_hadd_pd(fiz1, _mm256_setzero_pd());

    /* Add across the two lanes */
    tA   = _mm_add_pd(_mm256_castpd256_pd128(fix1), _mm256_extractf128_pd(fix1, 0x1));
    tB   = _mm_add_pd(_mm256_castpd256_pd128(fiz1), _mm256_extractf128_pd(fiz1, 0x1));

    fix1 = gmx_mm256_set_m128d(tB, tA); /* 0 fiz fiy fix */

    t1   = _mm256_loadu_pd(fptr);
    t2   = _mm256_loadu_pd(fshiftptr);

    t1   = _mm256_add_pd(t1, fix1);
    t2   = _mm256_add_pd(t2, fix1);

    _mm256_storeu_pd(fptr, t1);
    _mm256_storeu_pd(fshiftptr, t2);
}
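
/* The update_iforce routines reduce the per-lane force accumulators of an
 * i-particle: the horizontal adds collapse pairs within each 128-bit lane and
 * the cross-lane add then yields the totals over all four j-lanes, leaving a
 * register laid out as { fix, fiy, fiz, 0 } that is added to both the force
 * array and the shift-force array. Note that the 256-bit load/store pair
 * touches a fourth double at fptr[3] and fshiftptr[3], which only has zero
 * added to it and is therefore effectively unchanged.
 */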
static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2, t3, t4;
    __m128d tz3, tA, tB, tC, tD;

    fix1 = _mm256_hadd_pd(fix1, fiy1);                /* Y1c-d X1c-d | Y1a-b X1a-b */
    fiz1 = _mm256_hadd_pd(fiz1, fix2);                /* X2c-d Z1c-d | X2a-b Z1a-b */
    fiy2 = _mm256_hadd_pd(fiy2, fiz2);                /* Z2c-d Y2c-d | Z2a-b Y2a-b */
    fix3 = _mm256_hadd_pd(fix3, fiy3);                /* Y3c-d X3c-d | Y3a-b X3a-b */
    fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); /*     0 Z3c-d |     0 Z3a-b */

    /* Add across the two lanes by swapping and adding back */
    t1   = gmx_mm256_unpack128lo_pd(fix1, fiz1);      /* X2a-b Z1a-b | Y1a-b X1a-b */
    t2   = gmx_mm256_unpack128hi_pd(fix1, fiz1);      /* X2c-d Z1c-d | Y1c-d X1c-d */
    t1   = _mm256_add_pd(t1, t2);                     /* x2 z1 | y1 x1 */

    t3   = gmx_mm256_unpack128lo_pd(fiy2, fix3);      /* Y3a-b X3a-b | Z2a-b Y2a-b */
    t4   = gmx_mm256_unpack128hi_pd(fiy2, fix3);      /* Y3c-d X3c-d | Z2c-d Y2c-d */
    t3   = _mm256_add_pd(t3, t4);                     /* y3 x3 | z2 y2 */

    tz3  = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); /* 0 z3 */

    t2   = _mm256_loadu_pd(fptr);
    t4   = _mm256_loadu_pd(fptr+4);
    tA   = _mm_load_sd(fptr+8);

    t2   = _mm256_add_pd(t2, t1);
    t4   = _mm256_add_pd(t4, t3);
    tA   = _mm_add_sd(tA, tz3);

    _mm256_storeu_pd(fptr, t2);
    _mm256_storeu_pd(fptr+4, t4);
    _mm_store_sd(fptr+8, tA);

    /* Add up shift force */
    /* t1:   x2 z1 | y1 x1 */
    /* t3:   y3 x3 | z2 y2 */

    tB   = _mm256_extractf128_pd(t1, 0x1);                                     /* x2 z1 */
    tC   = _mm256_extractf128_pd(t3, 0x1);                                     /* y3 x3 */
    tz3  = _mm_add_sd(tz3, tB);                                                /* 0 z1+z3 */
    tD   = _mm_permute_pd(_mm256_castpd256_pd128(t3), _GMX_MM_PERMUTE128D(1, 1));
    tz3  = _mm_add_sd(tz3, tD);                                                /* - z */

    tC   = _mm_add_pd(tC, _mm256_castpd256_pd128(t1));                         /* y1+y3 x1+x3 */

    tD   = _mm_shuffle_pd(tB, _mm256_castpd256_pd128(t3), _MM_SHUFFLE2(0, 1)); /* y2 x2 */
    tC   = _mm_add_pd(tC, tD);                                                 /* y x */

    tA   = _mm_loadu_pd(fshiftptr);
    tB   = _mm_load_sd(fshiftptr+2);
    tA   = _mm_add_pd(tA, tC);
    tB   = _mm_add_sd(tB, tz3);
    _mm_storeu_pd(fshiftptr, tA);
    _mm_store_sd(fshiftptr+2, tB);
}
static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
                                         __m256d fix2, __m256d fiy2, __m256d fiz2,
                                         __m256d fix3, __m256d fiy3, __m256d fiz3,
                                         __m256d fix4, __m256d fiy4, __m256d fiz4,
                                         double * gmx_restrict fptr,
                                         double * gmx_restrict fshiftptr)
{
    __m256d t1, t2, t3, t4, t5, t6;
    __m128d tA, tB, tC, tD;

    fix1 = _mm256_hadd_pd(fix1, fiy1);            /* Y1c-d X1c-d | Y1a-b X1a-b */
    fiz1 = _mm256_hadd_pd(fiz1, fix2);            /* X2c-d Z1c-d | X2a-b Z1a-b */
    fiy2 = _mm256_hadd_pd(fiy2, fiz2);            /* Z2c-d Y2c-d | Z2a-b Y2a-b */
    fix3 = _mm256_hadd_pd(fix3, fiy3);            /* Y3c-d X3c-d | Y3a-b X3a-b */
    fiz3 = _mm256_hadd_pd(fiz3, fix4);            /* X4c-d Z3c-d | X4a-b Z3a-b */
    fiy4 = _mm256_hadd_pd(fiy4, fiz4);            /* Z4c-d Y4c-d | Z4a-b Y4a-b */

    /* Add across the two lanes by swapping and adding back */
    t1 = gmx_mm256_unpack128lo_pd(fix1, fiz1);    /* X2a-b Z1a-b | Y1a-b X1a-b */
    t2 = gmx_mm256_unpack128hi_pd(fix1, fiz1);    /* X2c-d Z1c-d | Y1c-d X1c-d */
    t1 = _mm256_add_pd(t1, t2);                   /* x2 z1 | y1 x1 */

    t3 = gmx_mm256_unpack128lo_pd(fiy2, fix3);    /* Y3a-b X3a-b | Z2a-b Y2a-b */
    t4 = gmx_mm256_unpack128hi_pd(fiy2, fix3);    /* Y3c-d X3c-d | Z2c-d Y2c-d */
    t3 = _mm256_add_pd(t3, t4);                   /* y3 x3 | z2 y2 */

    t5 = gmx_mm256_unpack128lo_pd(fiz3, fiy4);    /* Z4a-b Y4a-b | X4a-b Z3a-b */
    t6 = gmx_mm256_unpack128hi_pd(fiz3, fiy4);    /* Z4c-d Y4c-d | X4c-d Z3c-d */
    t5 = _mm256_add_pd(t5, t6);                   /* z4 y4 | x4 z3 */

    t2 = _mm256_loadu_pd(fptr);
    t4 = _mm256_loadu_pd(fptr+4);
    t6 = _mm256_loadu_pd(fptr+8);

    t2 = _mm256_add_pd(t2, t1);
    t4 = _mm256_add_pd(t4, t3);
    t6 = _mm256_add_pd(t6, t5);

    _mm256_storeu_pd(fptr, t2);
    _mm256_storeu_pd(fptr+4, t4);
    _mm256_storeu_pd(fptr+8, t6);

    /* Add up shift force */
    /* t1:   x2 z1 | y1 x1 */
    /* t3:   y3 x3 | z2 y2 */
    /* t5:   z4 y4 | x4 z3 */

    tA = _mm256_extractf128_pd(t1, 0x1);                 /* x2 z1 */
    tB = _mm256_extractf128_pd(t3, 0x1);                 /* y3 x3 */
    tC = _mm256_extractf128_pd(t5, 0x1);                 /* z4 y4 */

    tB = _mm_add_pd(tB, _mm256_castpd256_pd128(t1));     /* y1+y3 x1+x3 */
    tA = _mm_add_pd(tA, _mm256_castpd256_pd128(t5));     /* x2+x4 z1+z3 */
    tC = _mm_add_pd(tC, _mm256_castpd256_pd128(t3));     /* z4+z2 y4+y2 */

    tD = _mm_shuffle_pd(tA, tC, _MM_SHUFFLE2(0, 1));     /* y4+y2 x2+x4 */
    tB = _mm_add_pd(tB, tD);                             /* y x */
    tC = _mm_permute_pd(tC, _GMX_MM_PERMUTE128D(1, 1));  /* - z4+z2 */
    tC = _mm_add_sd(tC, tA);                             /* - z */

    tA = _mm_loadu_pd(fshiftptr);
    tD = _mm_load_sd(fshiftptr+2);
    tA = _mm_add_pd(tA, tB);
    tD = _mm_add_sd(tD, tC);
    _mm_storeu_pd(fshiftptr, tA);
    _mm_store_sd(fshiftptr+2, tD);
}
static gmx_inline void gmx_simdcall
gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
{
    __m128d t1;

    pot1 = _mm256_hadd_pd(pot1, pot1);
    t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
    _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
}
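
/* The update_*pot functions reduce the four partial energies accumulated in a
 * ymm register to a single scalar and add it to the value stored at the given
 * pointer; the two-pointer variant below interleaves two reductions so that a
 * single horizontal add and one cross-lane add serve both outputs.
 */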
static gmx_inline void gmx_simdcall
gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
                         __m256d pot2, double * gmx_restrict ptrB)
{
    __m128d t1, t2;

    pot1 = _mm256_hadd_pd(pot1, pot2);

    t1   = _mm_add_pd(_mm256_castpd256_pd128(pot1), _mm256_extractf128_pd(pot1, 0x1));
    t2   = _mm_permute_pd(t1, _GMX_MM_PERMUTE128D(1, 1));

    _mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
    _mm_store_sd(ptrB, _mm_add_sd(_mm_load_sd(ptrB), t2));
}
#endif /* _kernelutil_x86_avx_256_double_h_ */