/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

#ifndef _kernelutil_x86_avx_128_fma_double_h_
#define _kernelutil_x86_avx_128_fma_double_h_

#include <immintrin.h>
#ifdef _MSC_VER
#    include <intrin.h>
#else
#    include <x86intrin.h>
#endif

#define gmx_mm_castsi128_pd  _mm_castsi128_pd
#define gmx_mm_extract_epi32 _mm_extract_epi32

#define GMX_MM_TRANSPOSE2_PD(row0, row1) {                    \
        __m128d __gmx_t1 = row0;                              \
        row0             = _mm_unpacklo_pd(row0, row1);       \
        row1             = _mm_unpackhi_pd(__gmx_t1, row1);   \
}
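
/* Illustrative usage sketch (not part of the original kernels): the names
 * a/b/out are hypothetical. Transposing {a0,a1},{b0,b1} in place yields
 * {a0,b0},{a1,b1}.
 */
static gmx_inline void gmx_simdcall
gmx_mm_example_transpose2(const double * gmx_restrict a,
                          const double * gmx_restrict b,
                          double * gmx_restrict       out)
{
    __m128d row0 = _mm_loadu_pd(a);
    __m128d row1 = _mm_loadu_pd(b);

    GMX_MM_TRANSPOSE2_PD(row0, row1);
    _mm_storeu_pd(out, row0);   /* {a[0], b[0]} */
    _mm_storeu_pd(out+2, row1); /* {a[1], b[1]} */
}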

static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}

static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
}
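
/* Illustrative sketch (not part of the original kernels): combining
 * gmx_mm_calc_rsq_pd and gmx_mm_any_lt into a two-lane cutoff test.
 * ix..jz hold broadcast i and gathered j coordinates; rcut2 is a
 * hypothetical squared cutoff broadcast to both lanes.
 */
static gmx_inline int gmx_simdcall
gmx_mm_example_any_within_cutoff(__m128d ix, __m128d iy, __m128d iz,
                                 __m128d jx, __m128d jy, __m128d jz,
                                 __m128d rcut2)
{
    __m128d rsq = gmx_mm_calc_rsq_pd(_mm_sub_pd(ix, jx),
                                     _mm_sub_pd(iy, jy),
                                     _mm_sub_pd(iz, jz));

    /* Nonzero if at least one of the two distances is inside the cutoff */
    return gmx_mm_any_lt(rsq, rcut2);
}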

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3) _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))

/* Load a double value from 1-2 places, merge into xmm register */

static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}

static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}

static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d               xmm1)
{
    __m128d t2;

    t2 = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}

static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}
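
/* Illustrative sketch (not part of the original kernels): the swizzled
 * load/store pairs are inverses, so a SIMD update of two scattered reals
 * (hypothetical qA/qB, e.g. charges) is load -> operate -> store.
 */
static gmx_inline void gmx_simdcall
gmx_mm_example_scale_2real(double * gmx_restrict qA,
                           double * gmx_restrict qB,
                           double                factor)
{
    __m128d q = gmx_mm_load_2real_swizzle_pd(qA, qB);

    q = _mm_mul_pd(q, _mm_set1_pd(factor));
    gmx_mm_store_2real_swizzle_pd(qA, qB, q);
}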

/* Similar to store, but increments the value in memory */
static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}

static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    __m128d t1, t2;

    /* p1/p2 each point at a {c6, c12} pair; no alignment is assumed */
    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    *c6  = _mm_load_sd(p1);
    *c12 = _mm_load_sd(p1+1);
}
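
/* Illustrative sketch (not part of the original kernels): paramsA/paramsB
 * are hypothetical pointers to the {c6, c12} pairs of two j particles.
 * With rinvsix = 1/r^6 per lane, this evaluates the Lennard-Jones
 * combination c12/r^12 - c6/r^6 with an FMA4 multiply-subtract.
 */
static gmx_inline __m128d gmx_simdcall
gmx_mm_example_lj_energy(const double * gmx_restrict paramsA,
                         const double * gmx_restrict paramsB,
                         __m128d                     rinvsix)
{
    __m128d c6, c12;

    gmx_mm_load_2pair_swizzle_pd(paramsA, paramsB, &c6, &c12);
    /* c12*rinvsix*rinvsix - c6*rinvsix */
    return _mm_msub_pd(_mm_mul_pd(c12, rinvsix), rinvsix,
                       _mm_mul_pd(c6, rinvsix));
}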

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict      x1,
                                         __m128d * gmx_restrict      y1,
                                         __m128d * gmx_restrict      z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}
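
/* Illustrative call pattern (hypothetical indices): for i particle ii in
 * periodic shift group is, a kernel would use
 *
 *     gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+3*is, x+3*ii,
 *                                              &ix, &iy, &iz);
 *
 * leaving the shifted i coordinates broadcast to both lanes of ix/iy/iz.
 */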

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1 = _mm_loadu_pd(xyz);
    t2 = _mm_loadu_pd(xyz+2);
    t3 = _mm_loadu_pd(xyz+4);
    t4 = _mm_loadu_pd(xyz+6);
    t5 = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1 = _mm_add_pd(t1, sxy);
    t2 = _mm_add_pd(t2, szx);
    t3 = _mm_add_pd(t3, syz);
    t4 = _mm_add_pd(t4, sxy);
    t5 = _mm_add_sd(t5, sz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1 = _mm_loadu_pd(xyz);
    t2 = _mm_loadu_pd(xyz+2);
    t3 = _mm_loadu_pd(xyz+4);
    t4 = _mm_loadu_pd(xyz+6);
    t5 = _mm_loadu_pd(xyz+8);
    t6 = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1 = _mm_add_pd(t1, sxy);
    t2 = _mm_add_pd(t2, szx);
    t3 = _mm_add_pd(t3, syz);
    t4 = _mm_add_pd(t4, sxy);
    t5 = _mm_add_pd(t5, szx);
    t6 = _mm_add_pd(t6, syz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}
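
/* The 3- and 4-rvec broadcast variants above follow the same pattern for
 * particles with three or four interaction sites stored contiguously
 * (e.g. 3- and 4-site water models): each coordinate gets the periodic
 * shift added and is then broadcast to both SIMD lanes.
 */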

static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x = _mm_load_sd(p1);
    *y = _mm_load_sd(p1+1);
    *z = _mm_load_sd(p1+2);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
    *x4 = _mm_load_sd(p1+9);
    *y4 = _mm_load_sd(p1+10);
    *z4 = _mm_load_sd(p1+11);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrB);
    t3 = _mm_load_sd(ptrA+2);
    t4 = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1 = t1;
    *y1 = t2;
    *z1 = _mm_unpacklo_pd(t3, t4);
}
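
/* Illustrative sketch (not part of the original kernels): jnrA/jnrB are
 * hypothetical particle indices into a flat coordinate array x stored
 * xyzxyz..., so particle j starts at x+3*j.
 */
static gmx_inline void gmx_simdcall
gmx_mm_example_gather_2j(const double * gmx_restrict x,
                         int jnrA, int jnrB,
                         __m128d * gmx_restrict jx,
                         __m128d * gmx_restrict jy,
                         __m128d * gmx_restrict jz)
{
    gmx_mm_load_1rvec_2ptr_swizzle_pd(x+3*jnrA, x+3*jnrB, jx, jy, jz);
}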

static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    t7  = _mm_loadu_pd(ptrA+6);
    t8  = _mm_loadu_pd(ptrB+6);
    t9  = _mm_load_sd(ptrA+8);
    t10 = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    *x3 = t7;
    *y3 = t8;
    *z3 = _mm_unpacklo_pd(t9, t10);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrB);
    t3 = _mm_loadu_pd(ptrA+2);
    t4 = _mm_loadu_pd(ptrB+2);
    t5 = _mm_loadu_pd(ptrA+4);
    t6 = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    t1 = _mm_loadu_pd(ptrA+6);
    t2 = _mm_loadu_pd(ptrB+6);
    t3 = _mm_loadu_pd(ptrA+8);
    t4 = _mm_loadu_pd(ptrB+8);
    t5 = _mm_loadu_pd(ptrA+10);
    t6 = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3 = t1;
    *y3 = t2;
    *z3 = t3;
    *x4 = t4;
    *y4 = t5;
    *z4 = t6;
}

/* Routines to decrement an rvec in memory, typically used for j particle force updates */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1 = _mm_load_sd(ptrA);
    t2 = _mm_load_sd(ptrA+1);
    t3 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_sd(t1, x1);
    t2 = _mm_sub_sd(t2, y1);
    t3 = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}
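
/* Illustrative sketch (not part of the original kernels): a typical
 * j-force update. fscal is a hypothetical scalar force per lane and
 * dx/dy/dz the i-j distance vector; the force on the j particle at fj
 * is decremented by fscal*d (single-pointer case, low lane only).
 */
static gmx_inline void gmx_simdcall
gmx_mm_example_decrement_jforce(double * gmx_restrict fj,
                                __m128d fscal,
                                __m128d dx, __m128d dy, __m128d dz)
{
    gmx_mm_decrement_1rvec_1ptr_swizzle_pd(fj,
                                           _mm_mul_pd(fscal, dx),
                                           _mm_mul_pd(fscal, dy),
                                           _mm_mul_pd(fscal, dz));
}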

static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_load_sd(ptrA+8);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1 = _mm_sub_pd(t1, x1);
    t2 = _mm_sub_pd(t2, z1);
    t3 = _mm_sub_pd(t3, y2);
    t4 = _mm_sub_pd(t4, x3);
    t5 = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_loadu_pd(ptrA+8);
    t6 = _mm_loadu_pd(ptrA+10);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    z3 = _mm_unpacklo_pd(z3, x4);
    y4 = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd(t1, x1));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd(t2, z1));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd(t3, y2));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd(t4, x3));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd(t5, z3));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd(t6, y4));
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);
    t3 = _mm_loadu_pd(ptrB);
    t4 = _mm_load_sd(ptrB+2);

    t5 = _mm_unpacklo_pd(x1, y1);
    t6 = _mm_unpackhi_pd(x1, y1);
    t7 = _mm_unpackhi_pd(z1, z1);

    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, z1);

    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_load_sd(ptrA+8);
    t6  = _mm_loadu_pd(ptrB);
    t7  = _mm_loadu_pd(ptrB+2);
    t8  = _mm_loadu_pd(ptrB+4);
    t9  = _mm_loadu_pd(ptrB+6);
    t10 = _mm_load_sd(ptrB+8);

    tA = _mm_unpacklo_pd(x1, y1);
    tB = _mm_unpackhi_pd(x1, y1);
    tC = _mm_unpacklo_pd(z1, x2);
    tD = _mm_unpackhi_pd(z1, x2);
    tE = _mm_unpacklo_pd(y2, z2);
    tF = _mm_unpackhi_pd(y2, z2);
    tG = _mm_unpacklo_pd(x3, y3);
    tH = _mm_unpackhi_pd(x3, y3);
    tI = _mm_unpackhi_pd(z3, z3);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_sd(t5, z3);

    t6  = _mm_sub_pd(t6, tB);
    t7  = _mm_sub_pd(t7, tD);
    t8  = _mm_sub_pd(t8, tF);
    t9  = _mm_sub_pd(t9, tH);
    t10 = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_loadu_pd(ptrA+8);
    t6  = _mm_loadu_pd(ptrA+10);
    t7  = _mm_loadu_pd(ptrB);
    t8  = _mm_loadu_pd(ptrB+2);
    t9  = _mm_loadu_pd(ptrB+4);
    t10 = _mm_loadu_pd(ptrB+6);
    t11 = _mm_loadu_pd(ptrB+8);
    t12 = _mm_loadu_pd(ptrB+10);

    tA = _mm_unpacklo_pd(x1, y1);
    tB = _mm_unpackhi_pd(x1, y1);
    tC = _mm_unpacklo_pd(z1, x2);
    tD = _mm_unpackhi_pd(z1, x2);
    tE = _mm_unpacklo_pd(y2, z2);
    tF = _mm_unpackhi_pd(y2, z2);
    tG = _mm_unpacklo_pd(x3, y3);
    tH = _mm_unpackhi_pd(x3, y3);
    tI = _mm_unpacklo_pd(z3, x4);
    tJ = _mm_unpackhi_pd(z3, x4);
    tK = _mm_unpacklo_pd(y4, z4);
    tL = _mm_unpackhi_pd(y4, z4);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_pd(t5, tI);
    t6  = _mm_sub_pd(t6, tK);

    t7  = _mm_sub_pd(t7, tB);
    t8  = _mm_sub_pd(t8, tD);
    t9  = _mm_sub_pd(t9, tF);
    t10 = _mm_sub_pd(t10, tH);
    t11 = _mm_sub_pd(t11, tJ);
    t12 = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_storeu_pd(ptrA+8, t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB, t7);
    _mm_storeu_pd(ptrB+2, t8);
    _mm_storeu_pd(ptrB+4, t9);
    _mm_storeu_pd(ptrB+6, t10);
    _mm_storeu_pd(ptrB+8, t11);
    _mm_storeu_pd(ptrB+10, t12);
}

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fiz1);

    _mm_storeu_pd(fptr, _mm_add_pd(_mm_loadu_pd(fptr), fix1));
    _mm_store_sd(fptr+2, _mm_add_sd(_mm_load_sd(fptr+2), fiz1));

    _mm_storeu_pd(fshiftptr, _mm_add_pd(_mm_loadu_pd(fshiftptr), fix1));
    _mm_store_sd(fshiftptr+2, _mm_add_sd(_mm_load_sd(fshiftptr+2), fiz1));
}
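
/* Illustrative sketch (not part of the original kernels): i forces are
 * accumulated in SIMD registers across the j loop and flushed once.
 * Here contrib is a hypothetical array of n per-iteration contributions
 * laid out as {fx0,fx1, fy0,fy1, fz0,fz1}; f and fshift are the force
 * and shift-force (virial) outputs.
 */
static gmx_inline void gmx_simdcall
gmx_mm_example_accumulate_iforce(const double * gmx_restrict contrib,
                                 int n,
                                 double * gmx_restrict f,
                                 double * gmx_restrict fshift)
{
    __m128d fix = _mm_setzero_pd();
    __m128d fiy = _mm_setzero_pd();
    __m128d fiz = _mm_setzero_pd();
    int     k;

    for (k = 0; k < n; k++)
    {
        fix = _mm_add_pd(fix, _mm_loadu_pd(contrib+6*k));
        fiy = _mm_add_pd(fiy, _mm_loadu_pd(contrib+6*k+2));
        fiz = _mm_add_pd(fiz, _mm_loadu_pd(contrib+6*k+4));
    }
    /* One swizzled update adds the reduced lanes to f and fshift */
    gmx_mm_update_iforce_1atom_swizzle_pd(fix, fiy, fiz, f, fshift);
}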

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fiz3);

    _mm_storeu_pd(fptr,   _mm_add_pd(_mm_loadu_pd(fptr),   fix1));
    _mm_storeu_pd(fptr+2, _mm_add_pd(_mm_loadu_pd(fptr+2), fiz1));
    _mm_storeu_pd(fptr+4, _mm_add_pd(_mm_loadu_pd(fptr+4), fiy2));
    _mm_storeu_pd(fptr+6, _mm_add_pd(_mm_loadu_pd(fptr+6), fix3));
    _mm_store_sd(fptr+8, _mm_add_sd(_mm_load_sd(fptr+8), fiz3));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd(fshiftptr, _mm_add_pd(_mm_loadu_pd(fshiftptr), fix1));
    _mm_store_sd(fshiftptr+2, _mm_add_sd(_mm_load_sd(fshiftptr+2), fiz1));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fix4);
    fiy4 = _mm_hadd_pd(fiy4, fiz4);

    _mm_storeu_pd(fptr,    _mm_add_pd(_mm_loadu_pd(fptr),    fix1));
    _mm_storeu_pd(fptr+2,  _mm_add_pd(_mm_loadu_pd(fptr+2),  fiz1));
    _mm_storeu_pd(fptr+4,  _mm_add_pd(_mm_loadu_pd(fptr+4),  fiy2));
    _mm_storeu_pd(fptr+6,  _mm_add_pd(_mm_loadu_pd(fptr+6),  fix3));
    _mm_storeu_pd(fptr+8,  _mm_add_pd(_mm_loadu_pd(fptr+8),  fiz3));
    _mm_storeu_pd(fptr+10, _mm_add_pd(_mm_loadu_pd(fptr+10), fiy4));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd(fshiftptr, _mm_add_pd(_mm_loadu_pd(fshiftptr), fix1));
    _mm_store_sd(fshiftptr+2, _mm_add_sd(_mm_load_sd(fshiftptr+2), fiz1));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_pd(pot1, pot1);
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}
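
/* Illustrative sketch (not part of the original kernels): at the end of
 * a kernel the lane-accumulated Coulomb and van der Waals energies
 * (hypothetical vctot/vvdwtot) are reduced and added to their scalar
 * accumulators in one call.
 */
static gmx_inline void gmx_simdcall
gmx_mm_example_flush_energies(__m128d vctot, __m128d vvdwtot,
                              double * gmx_restrict Vc,
                              double * gmx_restrict Vvdw)
{
    gmx_mm_update_2pot_pd(vctot, Vc, vvdwtot, Vvdw);
}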

#endif /* _kernelutil_x86_avx_128_fma_double_h_ */