/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_128_fma_double_h_
#define _kernelutil_x86_avx_128_fma_double_h_

#include "gromacs/simd/general_x86_avx_128_fma.h"

static gmx_inline int
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}

static gmx_inline __m128d
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
}

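/* Usage sketch (illustrative only, not part of this header): with the i
 * coordinates broadcast in ix1/iy1/iz1 and two j coordinates loaded one per
 * lane, both squared distances come out of two FMA contractions:
 *
 *     dx1 = _mm_sub_pd(ix1, jx1);
 *     dy1 = _mm_sub_pd(iy1, jy1);
 *     dz1 = _mm_sub_pd(iz1, jz1);
 *     rsq = gmx_mm_calc_rsq_pd(dx1, dy1, dz1);
 *
 * The ix1..jz1 registers are hypothetical names for values produced by the
 * load routines further down in this file.
 */
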
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3)  _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))

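/* Example (a sketch; the accumulator names are made up): four partial terms
 * held in separate registers are reduced with two independent additions
 * followed by one dependent one, instead of three serial adds:
 *
 *     __m128d vtot = gmx_mm_sum4_pd(v0, v1, v2, v3);
 */
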
/* Load a double value from one or two places and merge into an xmm register */
static gmx_inline __m128d
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}

static gmx_inline __m128d
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}

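/* Example (illustrative; charge/jnrA/jnrB are assumed caller-side names):
 * gathering the charges of two j particles, one per lane:
 *
 *     __m128d jq = gmx_mm_load_2real_swizzle_pd(charge+jnrA, charge+jnrB);
 */
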
static gmx_inline void
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d xmm1)
{
    __m128d t2;

    t2 = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}

static gmx_inline void
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}

/* Similar to store, but increments the value in memory */
static gmx_inline void
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}

static gmx_inline void
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}

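/* Example (a sketch with hypothetical names): adding the two lanes of an
 * energy register to two separate memory locations in one call:
 *
 *     gmx_mm_increment_2real_swizzle_pd(vBufA, vBufB, velec);
 *
 * Lane 0 of velec is added at vBufA and lane 1 at vBufB.
 */
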
static gmx_inline void
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    __m128d t1, t2;

    /* The c6/c12 pairs may be unaligned, so use unaligned loads */
    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}

static gmx_inline void
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    *c6  = _mm_load_sd(p1);
    *c12 = _mm_load_sd(p1+1);
}

static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1,
                                         __m128d * gmx_restrict y1,
                                         __m128d * gmx_restrict z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}

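/* Example (illustrative; shiftvec/x and the offsets are assumed caller-side
 * names): loading one i particle, applying the periodic shift and
 * broadcasting it to both lanes so each lane can pair with a different
 * j particle:
 *
 *     gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,
 *                                              x+i_coord_offset,
 *                                              &ix1, &iy1, &iz1);
 */
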
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}

static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}

static gmx_inline void
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x = _mm_load_sd(p1);
    *y = _mm_load_sd(p1+1);
    *z = _mm_load_sd(p1+2);
}

static gmx_inline void
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
}

static gmx_inline void
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
    *x4 = _mm_load_sd(p1+9);
    *y4 = _mm_load_sd(p1+10);
    *z4 = _mm_load_sd(p1+11);
}

static gmx_inline void
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_load_sd(ptrA+2);
    t4  = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1 = t1;
    *y1 = t2;
    *z1 = _mm_unpacklo_pd(t3, t4);
}

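/* Example (a sketch; x/jnrA/jnrB are hypothetical): loading two j particles
 * so that each output register holds one particle per lane, ready for
 * gmx_mm_calc_rsq_pd():
 *
 *     __m128d jx1, jy1, jz1;
 *     gmx_mm_load_1rvec_2ptr_swizzle_pd(x+3*jnrA, x+3*jnrB,
 *                                       &jx1, &jy1, &jz1);
 */
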
static gmx_inline void
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    t7  = _mm_loadu_pd(ptrA+6);
    t8  = _mm_loadu_pd(ptrB+6);
    t9  = _mm_load_sd(ptrA+8);
    t10 = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    *x3 = t7;
    *y3 = t8;
    *z3 = _mm_unpacklo_pd(t9, t10);
}

static gmx_inline void
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    t1  = _mm_loadu_pd(ptrA+6);
    t2  = _mm_loadu_pd(ptrB+6);
    t3  = _mm_loadu_pd(ptrA+8);
    t4  = _mm_loadu_pd(ptrB+8);
    t5  = _mm_loadu_pd(ptrA+10);
    t6  = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3 = t1;
    *y3 = t2;
    *z3 = t3;
    *x4 = t4;
    *y4 = t5;
    *z4 = t6;
}

/* Routines to decrement an rvec in memory, typically used for j particle force updates */
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1 = _mm_load_sd(ptrA);
    t2 = _mm_load_sd(ptrA+1);
    t3 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_sd(t1, x1);
    t2 = _mm_sub_sd(t2, y1);
    t3 = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}

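/* Example (illustrative; f and jnrA are assumed caller-side names):
 * subtracting the accumulated j force of a single particle; only lane 0 of
 * fjx1/fjy1/fjz1 is used:
 *
 *     gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+3*jnrA, fjx1, fjy1, fjz1);
 */
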
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5; \
        _t1 = _mm_loadu_pd(ptrA); \
        _t2 = _mm_loadu_pd(ptrA+2); \
        _t3 = _mm_loadu_pd(ptrA+4); \
        _t4 = _mm_loadu_pd(ptrA+6); \
        _t5 = _mm_load_sd(ptrA+8); \
        _x1 = _mm_unpacklo_pd(_x1, _y1); \
        _z1 = _mm_unpacklo_pd(_z1, _x2); \
        _y2 = _mm_unpacklo_pd(_y2, _z2); \
        _x3 = _mm_unpacklo_pd(_x3, _y3); \
        _t1 = _mm_sub_pd(_t1, _x1); \
        _t2 = _mm_sub_pd(_t2, _z1); \
        _t3 = _mm_sub_pd(_t3, _y2); \
        _t4 = _mm_sub_pd(_t4, _x3); \
        _t5 = _mm_sub_sd(_t5, _z3); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_store_sd(ptrA+8, _t5); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_load_sd(ptrA+8);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1 = _mm_sub_pd(t1, x1);
    t2 = _mm_sub_pd(t2, z1);
    t3 = _mm_sub_pd(t3, y2);
    t4 = _mm_sub_pd(t4, x3);
    t5 = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
        _t1 = _mm_loadu_pd(ptrA); \
        _t2 = _mm_loadu_pd(ptrA+2); \
        _t3 = _mm_loadu_pd(ptrA+4); \
        _t4 = _mm_loadu_pd(ptrA+6); \
        _t5 = _mm_loadu_pd(ptrA+8); \
        _t6 = _mm_loadu_pd(ptrA+10); \
        _x1 = _mm_unpacklo_pd(_x1, _y1); \
        _z1 = _mm_unpacklo_pd(_z1, _x2); \
        _y2 = _mm_unpacklo_pd(_y2, _z2); \
        _x3 = _mm_unpacklo_pd(_x3, _y3); \
        _z3 = _mm_unpacklo_pd(_z3, _x4); \
        _y4 = _mm_unpacklo_pd(_y4, _z4); \
        _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1, _x1 )); \
        _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2, _z1 )); \
        _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3, _y2 )); \
        _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4, _x3 )); \
        _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5, _z3 )); \
        _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_loadu_pd(ptrA+8);
    t6 = _mm_loadu_pd(ptrA+10);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    z3 = _mm_unpacklo_pd(z3, x4);
    y4 = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd( t1, x1 ));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2, z1 ));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3, y2 ));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( t4, x3 ));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
#endif

static gmx_inline void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);
    t3 = _mm_loadu_pd(ptrB);
    t4 = _mm_load_sd(ptrB+2);

    t5 = _mm_unpacklo_pd(x1, y1);
    t6 = _mm_unpackhi_pd(x1, y1);
    t7 = _mm_unpackhi_pd(z1, z1);

    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, z1);

    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
        _t1  = _mm_loadu_pd(ptrA); \
        _t2  = _mm_loadu_pd(ptrA+2); \
        _t3  = _mm_loadu_pd(ptrA+4); \
        _t4  = _mm_loadu_pd(ptrA+6); \
        _t5  = _mm_load_sd(ptrA+8); \
        _t6  = _mm_loadu_pd(ptrB); \
        _t7  = _mm_loadu_pd(ptrB+2); \
        _t8  = _mm_loadu_pd(ptrB+4); \
        _t9  = _mm_loadu_pd(ptrB+6); \
        _t10 = _mm_load_sd(ptrB+8); \
        _tA  = _mm_unpacklo_pd(_x1, _y1); \
        _tB  = _mm_unpackhi_pd(_x1, _y1); \
        _tC  = _mm_unpacklo_pd(_z1, _x2); \
        _tD  = _mm_unpackhi_pd(_z1, _x2); \
        _tE  = _mm_unpacklo_pd(_y2, _z2); \
        _tF  = _mm_unpackhi_pd(_y2, _z2); \
        _tG  = _mm_unpacklo_pd(_x3, _y3); \
        _tH  = _mm_unpackhi_pd(_x3, _y3); \
        _tI  = _mm_unpackhi_pd(_z3, _z3); \
        _t1  = _mm_sub_pd(_t1, _tA); \
        _t2  = _mm_sub_pd(_t2, _tC); \
        _t3  = _mm_sub_pd(_t3, _tE); \
        _t4  = _mm_sub_pd(_t4, _tG); \
        _t5  = _mm_sub_sd(_t5, _z3); \
        _t6  = _mm_sub_pd(_t6, _tB); \
        _t7  = _mm_sub_pd(_t7, _tD); \
        _t8  = _mm_sub_pd(_t8, _tF); \
        _t9  = _mm_sub_pd(_t9, _tH); \
        _t10 = _mm_sub_sd(_t10, _tI); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_store_sd(ptrA+8, _t5); \
        _mm_storeu_pd(ptrB, _t6); \
        _mm_storeu_pd(ptrB+2, _t7); \
        _mm_storeu_pd(ptrB+4, _t8); \
        _mm_storeu_pd(ptrB+6, _t9); \
        _mm_store_sd(ptrB+8, _t10); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_load_sd(ptrA+8);
    t6  = _mm_loadu_pd(ptrB);
    t7  = _mm_loadu_pd(ptrB+2);
    t8  = _mm_loadu_pd(ptrB+4);
    t9  = _mm_loadu_pd(ptrB+6);
    t10 = _mm_load_sd(ptrB+8);

    tA = _mm_unpacklo_pd(x1, y1);
    tB = _mm_unpackhi_pd(x1, y1);
    tC = _mm_unpacklo_pd(z1, x2);
    tD = _mm_unpackhi_pd(z1, x2);
    tE = _mm_unpacklo_pd(y2, z2);
    tF = _mm_unpackhi_pd(y2, z2);
    tG = _mm_unpacklo_pd(x3, y3);
    tH = _mm_unpackhi_pd(x3, y3);
    tI = _mm_unpackhi_pd(z3, z3);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_sd(t5, z3);

    t6  = _mm_sub_pd(t6, tB);
    t7  = _mm_sub_pd(t7, tD);
    t8  = _mm_sub_pd(t8, tF);
    t9  = _mm_sub_pd(t9, tH);
    t10 = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
        _t1  = _mm_loadu_pd(ptrA); \
        _t2  = _mm_loadu_pd(ptrA+2); \
        _t3  = _mm_loadu_pd(ptrA+4); \
        _t4  = _mm_loadu_pd(ptrA+6); \
        _t5  = _mm_loadu_pd(ptrA+8); \
        _t6  = _mm_loadu_pd(ptrA+10); \
        _t7  = _mm_loadu_pd(ptrB); \
        _t8  = _mm_loadu_pd(ptrB+2); \
        _t9  = _mm_loadu_pd(ptrB+4); \
        _t10 = _mm_loadu_pd(ptrB+6); \
        _t11 = _mm_loadu_pd(ptrB+8); \
        _t12 = _mm_loadu_pd(ptrB+10); \
        _tA  = _mm_unpacklo_pd(_x1, _y1); \
        _tB  = _mm_unpackhi_pd(_x1, _y1); \
        _tC  = _mm_unpacklo_pd(_z1, _x2); \
        _tD  = _mm_unpackhi_pd(_z1, _x2); \
        _tE  = _mm_unpacklo_pd(_y2, _z2); \
        _tF  = _mm_unpackhi_pd(_y2, _z2); \
        _tG  = _mm_unpacklo_pd(_x3, _y3); \
        _tH  = _mm_unpackhi_pd(_x3, _y3); \
        _tI  = _mm_unpacklo_pd(_z3, _x4); \
        _tJ  = _mm_unpackhi_pd(_z3, _x4); \
        _tK  = _mm_unpacklo_pd(_y4, _z4); \
        _tL  = _mm_unpackhi_pd(_y4, _z4); \
        _t1  = _mm_sub_pd(_t1, _tA); \
        _t2  = _mm_sub_pd(_t2, _tC); \
        _t3  = _mm_sub_pd(_t3, _tE); \
        _t4  = _mm_sub_pd(_t4, _tG); \
        _t5  = _mm_sub_pd(_t5, _tI); \
        _t6  = _mm_sub_pd(_t6, _tK); \
        _t7  = _mm_sub_pd(_t7, _tB); \
        _t8  = _mm_sub_pd(_t8, _tD); \
        _t9  = _mm_sub_pd(_t9, _tF); \
        _t10 = _mm_sub_pd(_t10, _tH); \
        _t11 = _mm_sub_pd(_t11, _tJ); \
        _t12 = _mm_sub_pd(_t12, _tL); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_storeu_pd(ptrA+8, _t5); \
        _mm_storeu_pd(ptrA+10, _t6); \
        _mm_storeu_pd(ptrB, _t7); \
        _mm_storeu_pd(ptrB+2, _t8); \
        _mm_storeu_pd(ptrB+4, _t9); \
        _mm_storeu_pd(ptrB+6, _t10); \
        _mm_storeu_pd(ptrB+8, _t11); \
        _mm_storeu_pd(ptrB+10, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_loadu_pd(ptrA+8);
    t6  = _mm_loadu_pd(ptrA+10);
    t7  = _mm_loadu_pd(ptrB);
    t8  = _mm_loadu_pd(ptrB+2);
    t9  = _mm_loadu_pd(ptrB+4);
    t10 = _mm_loadu_pd(ptrB+6);
    t11 = _mm_loadu_pd(ptrB+8);
    t12 = _mm_loadu_pd(ptrB+10);

    tA = _mm_unpacklo_pd(x1, y1);
    tB = _mm_unpackhi_pd(x1, y1);
    tC = _mm_unpacklo_pd(z1, x2);
    tD = _mm_unpackhi_pd(z1, x2);
    tE = _mm_unpacklo_pd(y2, z2);
    tF = _mm_unpackhi_pd(y2, z2);
    tG = _mm_unpacklo_pd(x3, y3);
    tH = _mm_unpackhi_pd(x3, y3);
    tI = _mm_unpacklo_pd(z3, x4);
    tJ = _mm_unpackhi_pd(z3, x4);
    tK = _mm_unpacklo_pd(y4, z4);
    tL = _mm_unpackhi_pd(y4, z4);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_pd(t5, tI);
    t6  = _mm_sub_pd(t6, tK);

    t7  = _mm_sub_pd(t7, tB);
    t8  = _mm_sub_pd(t8, tD);
    t9  = _mm_sub_pd(t9, tF);
    t10 = _mm_sub_pd(t10, tH);
    t11 = _mm_sub_pd(t11, tJ);
    t12 = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_storeu_pd(ptrA+8, t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB, t7);
    _mm_storeu_pd(ptrB+2, t8);
    _mm_storeu_pd(ptrB+4, t9);
    _mm_storeu_pd(ptrB+6, t10);
    _mm_storeu_pd(ptrB+8, t11);
    _mm_storeu_pd(ptrB+10, t12);
}
#endif

static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fiz1);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}

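/* Example (a sketch; f, fshift and the offsets are assumed caller-side
 * names): after the inner loop has accumulated partial i forces in both
 * lanes, one call reduces them horizontally and updates both the force and
 * shift-force arrays:
 *
 *     gmx_mm_update_iforce_1atom_swizzle_pd(fix1, fiy1, fiz1,
 *                                           f+i_coord_offset,
 *                                           fshift+i_shift_offset);
 */
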
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                              fptr, fshiftptr) \
    { \
        __m128d _t1, _t2; \
        fix1 = _mm_hadd_pd(fix1, fiy1); \
        fiz1 = _mm_hadd_pd(fiz1, fix2); \
        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
        fix3 = _mm_hadd_pd(fix3, fiy3); \
        fiz3 = _mm_hadd_pd(fiz3, fiz3); \
        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
        _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
        fix1 = _mm_add_pd(fix1, fix3); \
        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
        fix1 = _mm_add_pd(fix1, _t1); \
        _t2  = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
        fiz1 = _mm_add_sd(fiz1, fiz3); \
        fiz1 = _mm_add_sd(fiz1, _t2); \
        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fiz3);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                              fptr, fshiftptr) \
    { \
        __m128d _t1, _t2; \
        fix1 = _mm_hadd_pd(fix1, fiy1); \
        fiz1 = _mm_hadd_pd(fiz1, fix2); \
        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
        fix3 = _mm_hadd_pd(fix3, fiy3); \
        fiz3 = _mm_hadd_pd(fiz3, fix4); \
        fiy4 = _mm_hadd_pd(fiy4, fiz4); \
        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
        _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); \
        _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
        fix1 = _mm_add_pd(fix1, _t1); \
        _t2  = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
        fix3 = _mm_add_pd(fix3, _t2); \
        fix1 = _mm_add_pd(fix1, fix3); \
        fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
        fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
        fiz1 = _mm_add_sd(fiz1, fiz3); \
        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fix4);
    fiy4 = _mm_hadd_pd(fiy4, fiz4);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif

static gmx_inline void
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_pd(pot1, pot1);
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}

static gmx_inline void
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}

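/* Example (illustrative; the energy-group pointers are hypothetical):
 * reducing the Coulomb and VdW accumulators and adding each sum to its own
 * buffer in a single call:
 *
 *     gmx_mm_update_2pot_pd(vctot, velecgrp+ggid, vvdwtot, vvdwgrp+ggid);
 */
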
#endif /* _kernelutil_x86_avx_128_fma_double_h_ */