/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_128_fma_double_h_
#define _kernelutil_x86_avx_128_fma_double_h_

#include <immintrin.h>
#ifdef _MSC_VER
#    include <intrin.h>
#else
#    include <x86intrin.h>
#endif

#define gmx_mm_castsi128_pd   _mm_castsi128_pd
#define gmx_mm_extract_epi32  _mm_extract_epi32

#define GMX_MM_TRANSPOSE2_PD(row0, row1) {                  \
        __m128d __gmx_t1 = row0;                            \
        row0             = _mm_unpacklo_pd(row0, row1);     \
        row1             = _mm_unpackhi_pd(__gmx_t1, row1); \
}

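/* Usage sketch (illustrative only, not part of the kernel API): the
 * transpose turns two (x,y) pairs loaded from memory into one register
 * per component; both rows are modified in place. */
static gmx_inline void
gmx_mm_example_transpose2(const double * p, __m128d * x, __m128d * y)
{
    __m128d row0 = _mm_loadu_pd(p);    /* row0 = [ x0  y0 ] */
    __m128d row1 = _mm_loadu_pd(p+2);  /* row1 = [ x1  y1 ] */
    GMX_MM_TRANSPOSE2_PD(row0, row1);  /* row0 = [ x0  x1 ], row1 = [ y0  y1 ] */
    *x = row0;
    *y = row1;
}
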
static gmx_inline int
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}

static gmx_inline __m128d
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
}

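/* Illustrative sketch (function and variable names are hypothetical):
 * the squared i-j distance for two j atoms at once, with the i coordinates
 * broadcast across both lanes and one j atom per lane. */
static gmx_inline __m128d
gmx_mm_example_pair_rsq(__m128d ix, __m128d iy, __m128d iz,
                        __m128d jx, __m128d jy, __m128d jz)
{
    __m128d dx = _mm_sub_pd(ix, jx);
    __m128d dy = _mm_sub_pd(iy, jy);
    __m128d dz = _mm_sub_pd(iz, jz);
    return gmx_mm_calc_rsq_pd(dx, dy, dz);
}
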
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3) _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))

/* Load a double from 1-2 memory locations and merge into an xmm register */

static gmx_inline __m128d
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}

static gmx_inline __m128d
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}

static gmx_inline void
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d               xmm1)
{
    __m128d t2;

    t2 = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}

static gmx_inline void
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}

/* Similar to store, but increments the value in memory */
static gmx_inline void
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}

static gmx_inline void
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}

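/* Illustrative sketch: the load/store swizzle pair gives read-modify-write
 * access to two scattered doubles; lane 0 belongs to ptrA and lane 1 to
 * ptrB. This reproduces gmx_mm_increment_2real_swizzle_pd() from the
 * lower-level helpers. */
static gmx_inline void
gmx_mm_example_accumulate_2real(double * ptrA, double * ptrB, __m128d v)
{
    __m128d acc = gmx_mm_load_2real_swizzle_pd(ptrA, ptrB);
    acc         = _mm_add_pd(acc, v);
    gmx_mm_store_2real_swizzle_pd(ptrA, ptrB, acc);
}
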
static gmx_inline void
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    __m128d t1, t2;

    /* The c6/c12 pairs are loaded unaligned and swizzled into one
     * register of c6 values and one of c12 values. */
    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}

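/* Illustrative sketch (pointer names hypothetical; assumes the plain 12-6
 * form Vvdw = C12*rinv^12 - C6*rinv^6): fetch the C6/C12 pairs of two j
 * atoms and evaluate the Lennard-Jones potential from rinvsq = 1/r^2,
 * using the FMA4 multiply-subtract just as the kernels use _mm_macc_pd. */
static gmx_inline __m128d
gmx_mm_example_lj_pot(const double * vdwparam_j0,
                      const double * vdwparam_j1,
                      __m128d        rinvsq)
{
    __m128d c6, c12, rinvsix;

    gmx_mm_load_2pair_swizzle_pd(vdwparam_j0, vdwparam_j1, &c6, &c12);
    rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq, rinvsq), rinvsq);
    /* (c12*rinv^6)*rinv^6 - c6*rinv^6 = c12*rinv^12 - c6*rinv^6 */
    return _mm_msub_pd(_mm_mul_pd(c12, rinvsix), rinvsix,
                       _mm_mul_pd(c6, rinvsix));
}
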
static gmx_inline void
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    *c6  = _mm_load_sd(p1);
    *c12 = _mm_load_sd(p1+1);
}

static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict      x1,
                                         __m128d * gmx_restrict      y1,
                                         __m128d * gmx_restrict      z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}

static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}

static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}

static gmx_inline void
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x = _mm_load_sd(p1);
    *y = _mm_load_sd(p1+1);
    *z = _mm_load_sd(p1+2);
}

static gmx_inline void
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
}

static gmx_inline void
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
    *x4 = _mm_load_sd(p1+9);
    *y4 = _mm_load_sd(p1+10);
    *z4 = _mm_load_sd(p1+11);
}

static gmx_inline void
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_load_sd(ptrA+2);
    t4  = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1 = t1;
    *y1 = t2;
    *z1 = _mm_unpacklo_pd(t3, t4);
}

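/* Illustrative sketch of a minimal inner-loop step (names hypothetical):
 * one i atom, PBC-shifted and broadcast, against two j atoms loaded
 * through the two-pointer swizzle, reduced to a squared distance. */
static gmx_inline __m128d
gmx_mm_example_rsq_2j(const double * shiftvec, const double * xi,
                      const double * xj0, const double * xj1)
{
    __m128d ix, iy, iz, jx, jy, jz;

    gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec, xi, &ix, &iy, &iz);
    gmx_mm_load_1rvec_2ptr_swizzle_pd(xj0, xj1, &jx, &jy, &jz);
    return gmx_mm_calc_rsq_pd(_mm_sub_pd(ix, jx),
                              _mm_sub_pd(iy, jy),
                              _mm_sub_pd(iz, jz));
}
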
static gmx_inline void
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    t7  = _mm_loadu_pd(ptrA+6);
    t8  = _mm_loadu_pd(ptrB+6);
    t9  = _mm_load_sd(ptrA+8);
    t10 = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    *x3 = t7;
    *y3 = t8;
    *z3 = _mm_unpacklo_pd(t9, t10);
}

static gmx_inline void
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    t1  = _mm_loadu_pd(ptrA+6);
    t2  = _mm_loadu_pd(ptrB+6);
    t3  = _mm_loadu_pd(ptrA+8);
    t4  = _mm_loadu_pd(ptrB+8);
    t5  = _mm_loadu_pd(ptrA+10);
    t6  = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3 = t1;
    *y3 = t2;
    *z3 = t3;
    *x4 = t4;
    *y4 = t5;
    *z4 = t6;
}

/* Routines to decrement rvec in memory, typically used for j particle force updates */
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1 = _mm_load_sd(ptrA);
    t2 = _mm_load_sd(ptrA+1);
    t3 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_sd(t1, x1);
    t2 = _mm_sub_sd(t2, y1);
    t3 = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}

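/* Illustrative sketch (assumes the usual convention d = xi - xj, so the
 * j force is decremented by fscal*d): applying Newton's third law to one
 * j atom; only lane 0 of each register is used. */
static gmx_inline void
gmx_mm_example_update_jforce(double * fj, __m128d fscal,
                             __m128d dx, __m128d dy, __m128d dz)
{
    gmx_mm_decrement_1rvec_1ptr_swizzle_pd(fj,
                                           _mm_mul_sd(fscal, dx),
                                           _mm_mul_sd(fscal, dy),
                                           _mm_mul_sd(fscal, dz));
}
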
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5; \
        _t1 = _mm_loadu_pd(ptrA); \
        _t2 = _mm_loadu_pd(ptrA+2); \
        _t3 = _mm_loadu_pd(ptrA+4); \
        _t4 = _mm_loadu_pd(ptrA+6); \
        _t5 = _mm_load_sd(ptrA+8); \
        _x1 = _mm_unpacklo_pd(_x1, _y1); \
        _z1 = _mm_unpacklo_pd(_z1, _x2); \
        _y2 = _mm_unpacklo_pd(_y2, _z2); \
        _x3 = _mm_unpacklo_pd(_x3, _y3); \
        _t1 = _mm_sub_pd(_t1, _x1); \
        _t2 = _mm_sub_pd(_t2, _z1); \
        _t3 = _mm_sub_pd(_t3, _y2); \
        _t4 = _mm_sub_pd(_t4, _x3); \
        _t5 = _mm_sub_sd(_t5, _z3); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_store_sd(ptrA+8, _t5); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_load_sd(ptrA+8);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1 = _mm_sub_pd(t1, x1);
    t2 = _mm_sub_pd(t2, z1);
    t3 = _mm_sub_pd(t3, y2);
    t4 = _mm_sub_pd(t4, x3);
    t5 = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
        _t1 = _mm_loadu_pd(ptrA); \
        _t2 = _mm_loadu_pd(ptrA+2); \
        _t3 = _mm_loadu_pd(ptrA+4); \
        _t4 = _mm_loadu_pd(ptrA+6); \
        _t5 = _mm_loadu_pd(ptrA+8); \
        _t6 = _mm_loadu_pd(ptrA+10); \
        _x1 = _mm_unpacklo_pd(_x1, _y1); \
        _z1 = _mm_unpacklo_pd(_z1, _x2); \
        _y2 = _mm_unpacklo_pd(_y2, _z2); \
        _x3 = _mm_unpacklo_pd(_x3, _y3); \
        _z3 = _mm_unpacklo_pd(_z3, _x4); \
        _y4 = _mm_unpacklo_pd(_y4, _z4); \
        _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1, _x1 )); \
        _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2, _z1 )); \
        _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3, _y2 )); \
        _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4, _x3 )); \
        _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5, _z3 )); \
        _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_loadu_pd(ptrA+8);
    t6 = _mm_loadu_pd(ptrA+10);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    z3 = _mm_unpacklo_pd(z3, x4);
    y4 = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd( t1, x1 ));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2, z1 ));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3, y2 ));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( t4, x3 ));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
#endif

static gmx_inline void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);
    t3 = _mm_loadu_pd(ptrB);
    t4 = _mm_load_sd(ptrB+2);

    t5 = _mm_unpacklo_pd(x1, y1);
    t6 = _mm_unpackhi_pd(x1, y1);
    t7 = _mm_unpackhi_pd(z1, z1);

    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, z1);

    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
        _t1  = _mm_loadu_pd(ptrA); \
        _t2  = _mm_loadu_pd(ptrA+2); \
        _t3  = _mm_loadu_pd(ptrA+4); \
        _t4  = _mm_loadu_pd(ptrA+6); \
        _t5  = _mm_load_sd(ptrA+8); \
        _t6  = _mm_loadu_pd(ptrB); \
        _t7  = _mm_loadu_pd(ptrB+2); \
        _t8  = _mm_loadu_pd(ptrB+4); \
        _t9  = _mm_loadu_pd(ptrB+6); \
        _t10 = _mm_load_sd(ptrB+8); \
        _tA  = _mm_unpacklo_pd(_x1, _y1); \
        _tB  = _mm_unpackhi_pd(_x1, _y1); \
        _tC  = _mm_unpacklo_pd(_z1, _x2); \
        _tD  = _mm_unpackhi_pd(_z1, _x2); \
        _tE  = _mm_unpacklo_pd(_y2, _z2); \
        _tF  = _mm_unpackhi_pd(_y2, _z2); \
        _tG  = _mm_unpacklo_pd(_x3, _y3); \
        _tH  = _mm_unpackhi_pd(_x3, _y3); \
        _tI  = _mm_unpackhi_pd(_z3, _z3); \
        _t1  = _mm_sub_pd(_t1, _tA); \
        _t2  = _mm_sub_pd(_t2, _tC); \
        _t3  = _mm_sub_pd(_t3, _tE); \
        _t4  = _mm_sub_pd(_t4, _tG); \
        _t5  = _mm_sub_sd(_t5, _z3); \
        _t6  = _mm_sub_pd(_t6, _tB); \
        _t7  = _mm_sub_pd(_t7, _tD); \
        _t8  = _mm_sub_pd(_t8, _tF); \
        _t9  = _mm_sub_pd(_t9, _tH); \
        _t10 = _mm_sub_sd(_t10, _tI); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_store_sd(ptrA+8, _t5); \
        _mm_storeu_pd(ptrB, _t6); \
        _mm_storeu_pd(ptrB+2, _t7); \
        _mm_storeu_pd(ptrB+4, _t8); \
        _mm_storeu_pd(ptrB+6, _t9); \
        _mm_store_sd(ptrB+8, _t10); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_load_sd(ptrA+8);
    t6  = _mm_loadu_pd(ptrB);
    t7  = _mm_loadu_pd(ptrB+2);
    t8  = _mm_loadu_pd(ptrB+4);
    t9  = _mm_loadu_pd(ptrB+6);
    t10 = _mm_load_sd(ptrB+8);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpackhi_pd(z3, z3);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_sd(t5, z3);

    t6  = _mm_sub_pd(t6, tB);
    t7  = _mm_sub_pd(t7, tD);
    t8  = _mm_sub_pd(t8, tF);
    t9  = _mm_sub_pd(t9, tH);
    t10 = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
        _t1  = _mm_loadu_pd(ptrA); \
        _t2  = _mm_loadu_pd(ptrA+2); \
        _t3  = _mm_loadu_pd(ptrA+4); \
        _t4  = _mm_loadu_pd(ptrA+6); \
        _t5  = _mm_loadu_pd(ptrA+8); \
        _t6  = _mm_loadu_pd(ptrA+10); \
        _t7  = _mm_loadu_pd(ptrB); \
        _t8  = _mm_loadu_pd(ptrB+2); \
        _t9  = _mm_loadu_pd(ptrB+4); \
        _t10 = _mm_loadu_pd(ptrB+6); \
        _t11 = _mm_loadu_pd(ptrB+8); \
        _t12 = _mm_loadu_pd(ptrB+10); \
        _tA  = _mm_unpacklo_pd(_x1, _y1); \
        _tB  = _mm_unpackhi_pd(_x1, _y1); \
        _tC  = _mm_unpacklo_pd(_z1, _x2); \
        _tD  = _mm_unpackhi_pd(_z1, _x2); \
        _tE  = _mm_unpacklo_pd(_y2, _z2); \
        _tF  = _mm_unpackhi_pd(_y2, _z2); \
        _tG  = _mm_unpacklo_pd(_x3, _y3); \
        _tH  = _mm_unpackhi_pd(_x3, _y3); \
        _tI  = _mm_unpacklo_pd(_z3, _x4); \
        _tJ  = _mm_unpackhi_pd(_z3, _x4); \
        _tK  = _mm_unpacklo_pd(_y4, _z4); \
        _tL  = _mm_unpackhi_pd(_y4, _z4); \
        _t1  = _mm_sub_pd(_t1, _tA); \
        _t2  = _mm_sub_pd(_t2, _tC); \
        _t3  = _mm_sub_pd(_t3, _tE); \
        _t4  = _mm_sub_pd(_t4, _tG); \
        _t5  = _mm_sub_pd(_t5, _tI); \
        _t6  = _mm_sub_pd(_t6, _tK); \
        _t7  = _mm_sub_pd(_t7, _tB); \
        _t8  = _mm_sub_pd(_t8, _tD); \
        _t9  = _mm_sub_pd(_t9, _tF); \
        _t10 = _mm_sub_pd(_t10, _tH); \
        _t11 = _mm_sub_pd(_t11, _tJ); \
        _t12 = _mm_sub_pd(_t12, _tL); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_storeu_pd(ptrA+8, _t5); \
        _mm_storeu_pd(ptrA+10, _t6); \
        _mm_storeu_pd(ptrB, _t7); \
        _mm_storeu_pd(ptrB+2, _t8); \
        _mm_storeu_pd(ptrB+4, _t9); \
        _mm_storeu_pd(ptrB+6, _t10); \
        _mm_storeu_pd(ptrB+8, _t11); \
        _mm_storeu_pd(ptrB+10, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_loadu_pd(ptrA+8);
    t6  = _mm_loadu_pd(ptrA+10);
    t7  = _mm_loadu_pd(ptrB);
    t8  = _mm_loadu_pd(ptrB+2);
    t9  = _mm_loadu_pd(ptrB+4);
    t10 = _mm_loadu_pd(ptrB+6);
    t11 = _mm_loadu_pd(ptrB+8);
    t12 = _mm_loadu_pd(ptrB+10);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpacklo_pd(z3, x4);
    tJ  = _mm_unpackhi_pd(z3, x4);
    tK  = _mm_unpacklo_pd(y4, z4);
    tL  = _mm_unpackhi_pd(y4, z4);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_pd(t5, tI);
    t6  = _mm_sub_pd(t6, tK);

    t7  = _mm_sub_pd(t7, tB);
    t8  = _mm_sub_pd(t8, tD);
    t9  = _mm_sub_pd(t9, tF);
    t10 = _mm_sub_pd(t10, tH);
    t11 = _mm_sub_pd(t11, tJ);
    t12 = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_storeu_pd(ptrA+8, t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB, t7);
    _mm_storeu_pd(ptrB+2, t8);
    _mm_storeu_pd(ptrB+4, t9);
    _mm_storeu_pd(ptrB+6, t10);
    _mm_storeu_pd(ptrB+8, t11);
    _mm_storeu_pd(ptrB+10, t12);
}
#endif

static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fiz1);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}

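/* Illustrative sketch: in a real kernel the fix1/fiy1/fiz1 accumulators
 * collect fscal*d contributions across the whole j loop; here a single
 * contribution is formed and immediately reduced into the force and
 * shift-force arrays (pointer names hypothetical). */
static gmx_inline void
gmx_mm_example_accumulate_iforce(__m128d fscal, __m128d dx, __m128d dy, __m128d dz,
                                 double * f_i, double * fshift_i)
{
    __m128d fix1 = _mm_mul_pd(fscal, dx);
    __m128d fiy1 = _mm_mul_pd(fscal, dy);
    __m128d fiz1 = _mm_mul_pd(fscal, dz);

    gmx_mm_update_iforce_1atom_swizzle_pd(fix1, fiy1, fiz1, f_i, fshift_i);
}
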
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                              fptr, fshiftptr) \
    { \
        __m128d _t1, _t2; \
        fix1 = _mm_hadd_pd(fix1, fiy1); \
        fiz1 = _mm_hadd_pd(fiz1, fix2); \
        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
        fix3 = _mm_hadd_pd(fix3, fiy3); \
        fiz3 = _mm_hadd_pd(fiz3, fiz3); \
        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
        _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
        fix1 = _mm_add_pd(fix1, fix3); \
        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
        fix1 = _mm_add_pd(fix1, _t1); \
        _t2  = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
        fiz1 = _mm_add_sd(fiz1, fiz3); \
        fiz1 = _mm_add_sd(fiz1, _t2); \
        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fiz3);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                              fptr, fshiftptr) \
    { \
        __m128d _t1, _t2; \
        fix1 = _mm_hadd_pd(fix1, fiy1); \
        fiz1 = _mm_hadd_pd(fiz1, fix2); \
        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
        fix3 = _mm_hadd_pd(fix3, fiy3); \
        fiz3 = _mm_hadd_pd(fiz3, fix4); \
        fiy4 = _mm_hadd_pd(fiy4, fiz4); \
        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
        _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); \
        _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
        fix1 = _mm_add_pd(fix1, _t1); \
        _t2  = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
        fix3 = _mm_add_pd(fix3, _t2); \
        fix1 = _mm_add_pd(fix1, fix3); \
        fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
        fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
        fiz1 = _mm_add_sd(fiz1, fiz3); \
        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fix4);
    fiy4 = _mm_hadd_pd(fiy4, fiz4);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif

static gmx_inline void
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_pd(pot1, pot1);
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}

static gmx_inline void
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}

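/* Illustrative sketch (pointer names hypothetical): at the end of a kernel
 * the per-lane Coulomb and van der Waals energies are reduced with hadd
 * and added to their accumulators; note the interleaved argument order
 * (register, pointer, register, pointer). */
static gmx_inline void
gmx_mm_example_store_energies(__m128d velec, double * Vc,
                              __m128d vvdw, double * Vvdw)
{
    gmx_mm_update_2pot_pd(velec, Vc, vvdw, Vvdw);
}
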
#endif /* _kernelutil_x86_avx_128_fma_double_h_ */