/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_128_fma_double_h_
#define _kernelutil_x86_avx_128_fma_double_h_

#include <immintrin.h>
#ifdef _MSC_VER
#    include <intrin.h>
#else
/* x86intrin.h provides the AMD FMA4 intrinsics (_mm_macc_pd) on gcc/clang;
 * MSVC ships them in intrin.h instead, hence the guard above. */
#    include <x86intrin.h>
#endif

#define gmx_mm_castsi128_pd   _mm_castsi128_pd
#define gmx_mm_extract_epi32  _mm_extract_epi32

#define GMX_MM_TRANSPOSE2_PD(row0, row1) {                \
        __m128d __gmx_t1 = row0;                          \
        row0 = _mm_unpacklo_pd(row0, row1);               \
        row1 = _mm_unpackhi_pd(__gmx_t1, row1);           \
}
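
/*
 * Usage sketch (editor's illustration, not part of the original header):
 * GMX_MM_TRANSPOSE2_PD treats its two arguments as the rows of a 2x2
 * matrix and transposes them in place:
 *
 *     __m128d a = _mm_set_pd(2.0, 1.0);   // a = {1.0, 2.0} (low, high)
 *     __m128d b = _mm_set_pd(4.0, 3.0);   // b = {3.0, 4.0}
 *     GMX_MM_TRANSPOSE2_PD(a, b);         // a = {1.0, 3.0}, b = {2.0, 4.0}
 */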

static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}

static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
}
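
/*
 * Usage sketch (editor's illustration): with dx/dy/dz holding the
 * coordinate differences for two i-j atom pairs, one pair per lane, the
 * call yields both squared distances at once; the nested FMA form replaces
 * the two separate additions of a plain mul/add sequence:
 *
 *     __m128d rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
 *     // lane k: rsq[k] = dx[k]*dx[k] + dy[k]*dy[k] + dz[k]*dz[k]
 */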

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3) _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))

/* Load a double value from 1-2 places, merge into xmm register */

static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}

static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}

static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d xmm1)
{
    __m128d t2;

    t2 = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}
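
/*
 * Usage sketch (editor's illustration; qA, qB and scale are hypothetical):
 * the load/store swizzle pair gathers two scalars into one register, lets
 * packed arithmetic act on both lanes, and scatters the lanes back:
 *
 *     __m128d q = gmx_mm_load_2real_swizzle_pd(qA, qB);  // q = {*qA, *qB}
 *     q = _mm_mul_pd(q, scale);
 *     gmx_mm_store_2real_swizzle_pd(qA, qB, q);          // *qA = lane 0, *qB = lane 1
 */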

static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}

/* Similar to store, but increments value in memory */
static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}

static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    __m128d t1, t2;

    /* Each pointer addresses an interleaved {c6,c12} pair; no alignment is assumed */
    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}
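
/*
 * Usage sketch (editor's illustration; vdwparam, typeA and typeB are
 * hypothetical): with Lennard-Jones parameters stored as interleaved
 * {c6,c12} pairs, this gathers the parameters of two atoms so that lane 0
 * of *c6/*c12 belongs to the first atom and lane 1 to the second:
 *
 *     __m128d c6, c12;
 *     gmx_mm_load_2pair_swizzle_pd(vdwparam + 2*typeA, vdwparam + 2*typeB,
 *                                  &c6, &c12);
 */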

static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    *c6  = _mm_load_sd(p1);
    *c12 = _mm_load_sd(p1+1);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1,
                                         __m128d * gmx_restrict y1,
                                         __m128d * gmx_restrict z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}
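
/*
 * Usage sketch (editor's illustration; shiftvec, x, is and inr are
 * hypothetical): xyz points at one particle's coordinate triplet and
 * xyz_shift at the periodic shift vector of its image; each output register
 * holds one shifted component broadcast to both lanes, ready to be combined
 * with two j particles at a time:
 *
 *     __m128d ix1, iy1, iz1;
 *     gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec + 3*is, x + 3*inr,
 *                                              &ix1, &iy1, &iz1);
 */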

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}

static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x = _mm_load_sd(p1);
    *y = _mm_load_sd(p1+1);
    *z = _mm_load_sd(p1+2);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
    *x4 = _mm_load_sd(p1+9);
    *y4 = _mm_load_sd(p1+10);
    *z4 = _mm_load_sd(p1+11);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;
    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrB);
    t3 = _mm_load_sd(ptrA+2);
    t4 = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1 = t1;
    *y1 = t2;
    *z1 = _mm_unpacklo_pd(t3, t4);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    t7  = _mm_loadu_pd(ptrA+6);
    t8  = _mm_loadu_pd(ptrB+6);
    t9  = _mm_load_sd(ptrA+8);
    t10 = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    *x3 = t7;
    *y3 = t8;
    *z3 = _mm_unpacklo_pd(t9, t10);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;
    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrB);
    t3 = _mm_loadu_pd(ptrA+2);
    t4 = _mm_loadu_pd(ptrB+2);
    t5 = _mm_loadu_pd(ptrA+4);
    t6 = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    t1 = _mm_loadu_pd(ptrA+6);
    t2 = _mm_loadu_pd(ptrB+6);
    t3 = _mm_loadu_pd(ptrA+8);
    t4 = _mm_loadu_pd(ptrB+8);
    t5 = _mm_loadu_pd(ptrA+10);
    t6 = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3 = t1;
    *y3 = t2;
    *z3 = t3;
    *x4 = t4;
    *y4 = t5;
    *z4 = t6;
}

/* Routines to decrement rvec in memory, typically used for j particle force updates */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1 = _mm_load_sd(ptrA);
    t2 = _mm_load_sd(ptrA+1);
    t3 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_sd(t1, x1);
    t2 = _mm_sub_sd(t2, y1);
    t3 = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}
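
/*
 * Usage sketch (editor's illustration; f and jnr are hypothetical): after a
 * kernel has accumulated the force on a single j particle in the low lanes
 * of fjx/fjy/fjz, this subtracts it from the force array in memory:
 *
 *     gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f + 3*jnr, fjx, fjy, fjz);
 */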

static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_load_sd(ptrA+8);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1 = _mm_sub_pd(t1, x1);
    t2 = _mm_sub_pd(t2, z1);
    t3 = _mm_sub_pd(t3, y2);
    t4 = _mm_sub_pd(t4, x3);
    t5 = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_loadu_pd(ptrA+8);
    t6 = _mm_loadu_pd(ptrA+10);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    z3 = _mm_unpacklo_pd(z3, x4);
    y4 = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd(t1, x1));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd(t2, z1));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd(t3, y2));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd(t4, x3));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd(t5, z3));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd(t6, y4));
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);
    t3 = _mm_loadu_pd(ptrB);
    t4 = _mm_load_sd(ptrB+2);

    t5 = _mm_unpacklo_pd(x1, y1);
    t6 = _mm_unpackhi_pd(x1, y1);
    t7 = _mm_unpackhi_pd(z1, z1);

    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, z1);

    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_load_sd(ptrA+8);
    t6  = _mm_loadu_pd(ptrB);
    t7  = _mm_loadu_pd(ptrB+2);
    t8  = _mm_loadu_pd(ptrB+4);
    t9  = _mm_loadu_pd(ptrB+6);
    t10 = _mm_load_sd(ptrB+8);

    tA = _mm_unpacklo_pd(x1, y1);
    tB = _mm_unpackhi_pd(x1, y1);
    tC = _mm_unpacklo_pd(z1, x2);
    tD = _mm_unpackhi_pd(z1, x2);
    tE = _mm_unpacklo_pd(y2, z2);
    tF = _mm_unpackhi_pd(y2, z2);
    tG = _mm_unpacklo_pd(x3, y3);
    tH = _mm_unpackhi_pd(x3, y3);
    tI = _mm_unpackhi_pd(z3, z3);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_sd(t5, z3);

    t6  = _mm_sub_pd(t6, tB);
    t7  = _mm_sub_pd(t7, tD);
    t8  = _mm_sub_pd(t8, tF);
    t9  = _mm_sub_pd(t9, tH);
    t10 = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_loadu_pd(ptrA+8);
    t6  = _mm_loadu_pd(ptrA+10);
    t7  = _mm_loadu_pd(ptrB);
    t8  = _mm_loadu_pd(ptrB+2);
    t9  = _mm_loadu_pd(ptrB+4);
    t10 = _mm_loadu_pd(ptrB+6);
    t11 = _mm_loadu_pd(ptrB+8);
    t12 = _mm_loadu_pd(ptrB+10);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpacklo_pd(z3, x4);
    tJ  = _mm_unpackhi_pd(z3, x4);
    tK  = _mm_unpacklo_pd(y4, z4);
    tL  = _mm_unpackhi_pd(y4, z4);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_pd(t5, tI);
    t6  = _mm_sub_pd(t6, tK);

    t7  = _mm_sub_pd(t7, tB);
    t8  = _mm_sub_pd(t8, tD);
    t9  = _mm_sub_pd(t9, tF);
    t10 = _mm_sub_pd(t10, tH);
    t11 = _mm_sub_pd(t11, tJ);
    t12 = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_storeu_pd(ptrA+8, t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB, t7);
    _mm_storeu_pd(ptrB+2, t8);
    _mm_storeu_pd(ptrB+4, t9);
    _mm_storeu_pd(ptrB+6, t10);
    _mm_storeu_pd(ptrB+8, t11);
    _mm_storeu_pd(ptrB+10, t12);
}

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fiz1);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
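
/*
 * Reduction sketch (editor's note): _mm_hadd_pd(a, b) returns
 * {a0+a1, b0+b1}, so the first hadd above places the x and y lane sums side
 * by side and a single packed add updates fptr[0..1], while the z sum goes
 * through a scalar add. The same sums are also accumulated into the shift
 * force. Illustrative call (f, fshift, inr and is are hypothetical):
 *
 *     gmx_mm_update_iforce_1atom_swizzle_pd(fix1, fiy1, fiz1,
 *                                           f + 3*inr, fshift + 3*is);
 */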

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fiz3);

    _mm_storeu_pd( fptr,   _mm_add_pd( _mm_loadu_pd(fptr),   fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fix4);
    fiy4 = _mm_hadd_pd(fiy4, fiz4);

    _mm_storeu_pd( fptr,    _mm_add_pd( _mm_loadu_pd(fptr),    fix1 ));
    _mm_storeu_pd( fptr+2,  _mm_add_pd( _mm_loadu_pd(fptr+2),  fiz1 ));
    _mm_storeu_pd( fptr+4,  _mm_add_pd( _mm_loadu_pd(fptr+4),  fiy2 ));
    _mm_storeu_pd( fptr+6,  _mm_add_pd( _mm_loadu_pd(fptr+6),  fix3 ));
    _mm_storeu_pd( fptr+8,  _mm_add_pd( _mm_loadu_pd(fptr+8),  fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_pd(pot1, pot1);
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}
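
/*
 * Usage sketch (editor's illustration; velecsum, vvdwsum, ptrElec and
 * ptrVdw are hypothetical): folds both lanes of two potential-energy
 * accumulators and adds the scalar results to their output buffers:
 *
 *     gmx_mm_update_2pot_pd(velecsum, ptrElec, vvdwsum, ptrVdw);
 */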

#endif /* _kernelutil_x86_avx_128_fma_double_h_ */