/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_128_fma_double_h_
#define _kernelutil_x86_avx_128_fma_double_h_

#include <immintrin.h>
#ifndef _MSC_VER
#    include <x86intrin.h>    /* FMA4 intrinsics such as _mm_macc_pd on GCC/Clang */
#endif
#define gmx_mm_castsi128_pd   _mm_castsi128_pd
#define gmx_mm_extract_epi32  _mm_extract_epi32
#define GMX_MM_TRANSPOSE2_PD(row0, row1) {                 \
        __m128d __gmx_t1 = row0;                           \
        row0 = _mm_unpacklo_pd(row0, row1);                \
        row1 = _mm_unpackhi_pd(__gmx_t1, row1);            \
}
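/* Illustrative sketch (added for clarity, not original GROMACS code):
 * given r0 = {a0, a1} and r1 = {b0, b1}, the macro leaves
 * r0 = {a0, b0} and r1 = {a1, b1}, i.e. a 2x2 transpose:
 *
 *     __m128d r0 = _mm_set_pd(a1, a0);  // _mm_set_pd lists the high element first
 *     __m128d r1 = _mm_set_pd(b1, b0);
 *     GMX_MM_TRANSPOSE2_PD(r0, r1);     // r0 = {a0, b0}, r1 = {a1, b1}
 */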
static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}
static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
}
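/* Usage sketch (added illustration; the register names are hypothetical):
 * squared distance between one i particle and two j particles held
 * SoA-style, two lanes at a time:
 *
 *     __m128d dx  = _mm_sub_pd(ix, jx);
 *     __m128d dy  = _mm_sub_pd(iy, jy);
 *     __m128d dz  = _mm_sub_pd(iz, jz);
 *     __m128d rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
 *
 * _mm_macc_pd is the AMD FMA4 fused multiply-add, a*b+c per element.
 */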
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3) _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
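/* Added note: the sum is evaluated pairwise as (t0+t1) + (t2+t3), e.g.
 * ftot = gmx_mm_sum4_pd(f0, f1, f2, f3), which keeps the dependency
 * chain shorter than a serial accumulation.
 */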
/* Load a double value from 1-2 places, merge into xmm register */

static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}
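/* Usage sketch (added illustration; the charge array and indices are
 * hypothetical): gather the charges of two j particles into one register,
 * so element 0 holds *ptrA and element 1 holds *ptrB:
 *
 *     __m128d jq = gmx_mm_load_2real_swizzle_pd(charge+jnrA, charge+jnrB);
 */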
static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}
static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d               xmm1)
{
    __m128d t2;

    t2 = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}
static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}
/* Similar to store, but increments the value in memory */
static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}
static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}
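/* In-memory accumulation sketch (added illustration; the array name is
 * hypothetical): after computing a scalar contribution in the low element
 * of a register,
 *
 *     gmx_mm_increment_1real_pd(&Vvdw[i], vreg);
 *
 * performs Vvdw[i] += vreg[0] without touching neighbouring memory.
 */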
static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    __m128d t1, t2;

    /* Each pointer addresses a consecutive {c6, c12} pair; unaligned
     * loads are used, so no particular alignment is required. */
    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}
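/* Usage sketch (added illustration; the vdwparam array and type indices
 * are hypothetical): with {c6, c12} stored per type pair,
 *
 *     gmx_mm_load_2pair_swizzle_pd(vdwparam+2*typeA, vdwparam+2*typeB,
 *                                  &c6, &c12);
 *
 * leaves c6 = {c6_A, c6_B} and c12 = {c12_A, c12_B}.
 */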
static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    *c6  = _mm_load_sd(p1);
    *c12 = _mm_load_sd(p1+1);
}
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict      x1,
                                         __m128d * gmx_restrict      y1,
                                         __m128d * gmx_restrict      z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}
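/* Added note: the i-particle coordinate plus the periodic shift vector is
 * broadcast to both register lanes, so one i particle can be paired
 * against two j particles at a time. A hypothetical call site:
 *
 *     gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,
 *                                              x+i_coord_offset,
 *                                              &ix, &iy, &iz);
 */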
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}
static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x = _mm_load_sd(p1);
    *y = _mm_load_sd(p1+1);
    *z = _mm_load_sd(p1+2);
}
static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
}
static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
    *x4 = _mm_load_sd(p1+9);
    *y4 = _mm_load_sd(p1+10);
    *z4 = _mm_load_sd(p1+11);
}
static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;
    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrB);
    t3 = _mm_load_sd(ptrA+2);
    t4 = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1 = t1;
    *y1 = t2;
    *z1 = _mm_unpacklo_pd(t3, t4);
}
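/* Swizzle illustration (added note): the two AoS triplets
 * ptrA = {xA, yA, zA} and ptrB = {xB, yB, zB} come out SoA-style as
 * x1 = {xA, xB}, y1 = {yA, yB}, z1 = {zA, zB}, ready for the two-lane
 * distance and force computations above.
 */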
static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    t7  = _mm_loadu_pd(ptrA+6);
    t8  = _mm_loadu_pd(ptrB+6);
    t9  = _mm_load_sd(ptrA+8);
    t10 = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    *x3 = t7;
    *y3 = t8;
    *z3 = _mm_unpacklo_pd(t9, t10);
}
static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;
    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrB);
    t3 = _mm_loadu_pd(ptrA+2);
    t4 = _mm_loadu_pd(ptrB+2);
    t5 = _mm_loadu_pd(ptrA+4);
    t6 = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    t1 = _mm_loadu_pd(ptrA+6);
    t2 = _mm_loadu_pd(ptrB+6);
    t3 = _mm_loadu_pd(ptrA+8);
    t4 = _mm_loadu_pd(ptrB+8);
    t5 = _mm_loadu_pd(ptrA+10);
    t6 = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3 = t1;
    *y3 = t2;
    *z3 = t3;
    *x4 = t4;
    *y4 = t5;
    *z4 = t6;
}
/* Routines to decrement an rvec in memory, typically used for j particle force updates */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1 = _mm_load_sd(ptrA);
    t2 = _mm_load_sd(ptrA+1);
    t3 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_sd(t1, x1);
    t2 = _mm_sub_sd(t2, y1);
    t3 = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}
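/* Newton's third law in practice (added note; the f array and offset
 * names are hypothetical): the force accumulated on the i particle is
 * subtracted from the j particle's force here, e.g.
 *
 *     gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offset, fjx, fjy, fjz);
 *
 * Only the low lane of fjx/fjy/fjz is consumed by this variant.
 */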
static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_load_sd(ptrA+8);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1 = _mm_sub_pd(t1, x1);
    t2 = _mm_sub_pd(t2, z1);
    t3 = _mm_sub_pd(t3, y2);
    t4 = _mm_sub_pd(t4, x3);
    t5 = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}
static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_loadu_pd(ptrA+8);
    t6 = _mm_loadu_pd(ptrA+10);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    z3 = _mm_unpacklo_pd(z3, x4);
    y4 = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd( t1, x1 ));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2, z1 ));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3, y2 ));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( t4, x3 ));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);
    t3 = _mm_loadu_pd(ptrB);
    t4 = _mm_load_sd(ptrB+2);

    t5 = _mm_unpacklo_pd(x1, y1);
    t6 = _mm_unpackhi_pd(x1, y1);
    t7 = _mm_unpackhi_pd(z1, z1);

    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, z1);

    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}
static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_load_sd(ptrA+8);
    t6  = _mm_loadu_pd(ptrB);
    t7  = _mm_loadu_pd(ptrB+2);
    t8  = _mm_loadu_pd(ptrB+4);
    t9  = _mm_loadu_pd(ptrB+6);
    t10 = _mm_load_sd(ptrB+8);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpackhi_pd(z3, z3);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_sd(t5, z3);

    t6  = _mm_sub_pd(t6, tB);
    t7  = _mm_sub_pd(t7, tD);
    t8  = _mm_sub_pd(t8, tF);
    t9  = _mm_sub_pd(t9, tH);
    t10 = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}
static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_loadu_pd(ptrA+8);
    t6  = _mm_loadu_pd(ptrA+10);
    t7  = _mm_loadu_pd(ptrB);
    t8  = _mm_loadu_pd(ptrB+2);
    t9  = _mm_loadu_pd(ptrB+4);
    t10 = _mm_loadu_pd(ptrB+6);
    t11 = _mm_loadu_pd(ptrB+8);
    t12 = _mm_loadu_pd(ptrB+10);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpacklo_pd(z3, x4);
    tJ  = _mm_unpackhi_pd(z3, x4);
    tK  = _mm_unpacklo_pd(y4, z4);
    tL  = _mm_unpackhi_pd(y4, z4);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_pd(t5, tI);
    t6  = _mm_sub_pd(t6, tK);

    t7  = _mm_sub_pd(t7, tB);
    t8  = _mm_sub_pd(t8, tD);
    t9  = _mm_sub_pd(t9, tF);
    t10 = _mm_sub_pd(t10, tH);
    t11 = _mm_sub_pd(t11, tJ);
    t12 = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_storeu_pd(ptrA+8, t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB, t7);
    _mm_storeu_pd(ptrB+2, t8);
    _mm_storeu_pd(ptrB+4, t9);
    _mm_storeu_pd(ptrB+6, t10);
    _mm_storeu_pd(ptrB+8, t11);
    _mm_storeu_pd(ptrB+10, t12);
}
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fiz1);

    _mm_storeu_pd( fptr,   _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2,  _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr,  _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
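/* Reduction pattern (added note): _mm_hadd_pd(a, b) yields {a0+a1, b0+b1},
 * so after the two hadds fix1 = {sum(fx), sum(fy)} and fiz1 holds sum(fz)
 * in both elements. The totals are accumulated into the per-atom force
 * array (fptr) and into the shift force (fshiftptr) used for the virial.
 */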
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fiz3);

    _mm_storeu_pd( fptr,   _mm_add_pd( _mm_loadu_pd(fptr),   fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8,  _mm_add_sd( _mm_load_sd(fptr+8),  fiz3 ));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd( fshiftptr,  _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fix4);
    fiy4 = _mm_hadd_pd(fiy4, fiz4);

    _mm_storeu_pd( fptr,    _mm_add_pd( _mm_loadu_pd(fptr),    fix1 ));
    _mm_storeu_pd( fptr+2,  _mm_add_pd( _mm_loadu_pd(fptr+2),  fiz1 ));
    _mm_storeu_pd( fptr+4,  _mm_add_pd( _mm_loadu_pd(fptr+4),  fiy2 ));
    _mm_storeu_pd( fptr+6,  _mm_add_pd( _mm_loadu_pd(fptr+6),  fix3 ));
    _mm_storeu_pd( fptr+8,  _mm_add_pd( _mm_loadu_pd(fptr+8),  fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr,  _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_pd(pot1, pot1);
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}
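/* Energy accumulation sketch (added illustration; the accumulator and
 * buffer names are hypothetical): after the inner loop over j particles,
 *
 *     gmx_mm_update_1pot_pd(velecsum, energygrp_elec+ggid);
 *
 * folds the two lanes of velecsum together and adds the scalar total to
 * the energy-group buffer.
 */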
static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}
#endif /* _kernelutil_x86_avx_128_fma_double_h_ */