/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_sse4_1_double_h_
#define _kernelutil_x86_sse4_1_double_h_

#include <smmintrin.h>    /* SSE4.1 intrinsics used throughout this header */

#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
#define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
#define GMX_MM_TRANSPOSE2_PD(row0, row1) {                    \
        __m128d __gmx_t1 = row0;                              \
        row0             = _mm_unpacklo_pd(row0, row1);       \
        row1             = _mm_unpackhi_pd(__gmx_t1, row1);   \
}
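/*
 * Added illustration (not part of the original header): with row0 = {a0, a1}
 * and row1 = {b0, b1} on entry, the macro above leaves row0 = {a0, b0} and
 * row1 = {a1, b1}, i.e. it performs a 2x2 transpose of the two registers in place.
 */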
/* Normal sum of four xmm (__m128d) registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3) _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
static gmx_inline int
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}

static gmx_inline __m128d
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
}
/* Load a double value from 1-2 places, merge into xmm register */
static gmx_inline __m128d
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}
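/*
 * Added usage sketch (illustrative only; qA and qB stand in for any two
 * caller-side doubles):
 *     __m128d q = gmx_mm_load_2real_swizzle_pd(&qA, &qB);
 * leaves qA in lane 0 and qB in lane 1 of q.
 */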
static gmx_inline __m128d
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}

static gmx_inline void
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d               xmm1)
{
    __m128d t2;

    t2 = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}

static gmx_inline void
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}
/* Similar to store, but increments the value in memory */
static gmx_inline void
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}

static gmx_inline void
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}
static gmx_inline void
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    __m128d t1, t2;

    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}

static gmx_inline void
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    *c6  = _mm_load_sd(p1);
    *c12 = _mm_load_sd(p1+1);
}
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict      x1,
                                         __m128d * gmx_restrict      y1,
                                         __m128d * gmx_restrict      z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}
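/*
 * Added note: xyz points to a single rvec (three contiguous doubles) and
 * xyz_shift to the corresponding shift vector; each shifted coordinate is
 * broadcast to both SIMD lanes of the output registers.
 */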
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}
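/*
 * Added note: the three rvecs are stored contiguously as x1 y1 z1 x2 y2 z2
 * x3 y3 z3, so the unaligned pair loads above hold {x,y}, {z,x} and {y,z}
 * patterns in turn. The permuted shift vectors szx = {sz, sx} and
 * syz = {sy, sz} let the same shift be added to every loaded pair before
 * the coordinates are broadcast.
 */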
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}
static gmx_inline void
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x = _mm_load_sd(p1);
    *y = _mm_load_sd(p1+1);
    *z = _mm_load_sd(p1+2);
}

static gmx_inline void
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
}

static gmx_inline void
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
    *x4 = _mm_load_sd(p1+9);
    *y4 = _mm_load_sd(p1+10);
    *z4 = _mm_load_sd(p1+11);
}
static gmx_inline void
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_load_sd(ptrA+2);
    t4  = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1 = t1;
    *y1 = t2;
    *z1 = _mm_unpacklo_pd(t3, t4);
}
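/*
 * Added note: after the 2x2 transpose, lane 0 of each output register holds
 * the coordinate from ptrA and lane 1 the coordinate from ptrB; the same
 * lane convention applies to the 3- and 4-rvec two-pointer loaders below.
 */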
static gmx_inline void
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    t7  = _mm_loadu_pd(ptrA+6);
    t8  = _mm_loadu_pd(ptrB+6);
    t9  = _mm_load_sd(ptrA+8);
    t10 = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    *x3 = t7;
    *y3 = t8;
    *z3 = _mm_unpacklo_pd(t9, t10);
}

static gmx_inline void
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;

    t1  = _mm_loadu_pd(ptrA+6);
    t2  = _mm_loadu_pd(ptrB+6);
    t3  = _mm_loadu_pd(ptrA+8);
    t4  = _mm_loadu_pd(ptrB+8);
    t5  = _mm_loadu_pd(ptrA+10);
    t6  = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3 = t1;
    *y3 = t2;
    *z3 = t3;
    *x4 = t4;
    *y4 = t5;
    *z4 = t6;
}
/* Routines to decrement an rvec in memory, typically used for j-particle force updates */
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
                                         __m128d xy, __m128d z)
{
    __m128d t1, t2;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_pd(t1, xy);
    t2 = _mm_sub_sd(t2, z);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
}
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1 = _mm_load_sd(ptrA);
    t2 = _mm_load_sd(ptrA+1);
    t3 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_sd(t1, x1);
    t2 = _mm_sub_sd(t2, y1);
    t3 = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}
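/*
 * Added note: the single-pointer variants use _mm_sub_sd, i.e. only the lower
 * lane of each force component is consumed, since exactly one particle is
 * updated per call.
 */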
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5; \
        _t1 = _mm_loadu_pd(ptrA); \
        _t2 = _mm_loadu_pd(ptrA+2); \
        _t3 = _mm_loadu_pd(ptrA+4); \
        _t4 = _mm_loadu_pd(ptrA+6); \
        _t5 = _mm_load_sd(ptrA+8); \
        _x1 = _mm_unpacklo_pd(_x1, _y1); \
        _z1 = _mm_unpacklo_pd(_z1, _x2); \
        _y2 = _mm_unpacklo_pd(_y2, _z2); \
        _x3 = _mm_unpacklo_pd(_x3, _y3); \
        _t1 = _mm_sub_pd(_t1, _x1); \
        _t2 = _mm_sub_pd(_t2, _z1); \
        _t3 = _mm_sub_pd(_t3, _y2); \
        _t4 = _mm_sub_pd(_t4, _x3); \
        _t5 = _mm_sub_sd(_t5, _z3); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_store_sd(ptrA+8, _t5); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_load_sd(ptrA+8);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1 = _mm_sub_pd(t1, x1);
    t2 = _mm_sub_pd(t2, z1);
    t3 = _mm_sub_pd(t3, y2);
    t4 = _mm_sub_pd(t4, x3);
    t5 = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}
#endif
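/*
 * Added note: the unpacklo operations above repack the per-coordinate
 * registers into memory order ({x1,y1}, {z1,x2}, {y2,z2}, {x3,y3}), using
 * only the lower lane of each force component, before subtracting from the
 * stored forces.
 */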
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
        _t1 = _mm_loadu_pd(ptrA); \
        _t2 = _mm_loadu_pd(ptrA+2); \
        _t3 = _mm_loadu_pd(ptrA+4); \
        _t4 = _mm_loadu_pd(ptrA+6); \
        _t5 = _mm_loadu_pd(ptrA+8); \
        _t6 = _mm_loadu_pd(ptrA+10); \
        _x1 = _mm_unpacklo_pd(_x1, _y1); \
        _z1 = _mm_unpacklo_pd(_z1, _x2); \
        _y2 = _mm_unpacklo_pd(_y2, _z2); \
        _x3 = _mm_unpacklo_pd(_x3, _y3); \
        _z3 = _mm_unpacklo_pd(_z3, _x4); \
        _y4 = _mm_unpacklo_pd(_y4, _z4); \
        _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1, _x1 )); \
        _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2, _z1 )); \
        _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3, _y2 )); \
        _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4, _x3 )); \
        _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5, _z3 )); \
        _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_loadu_pd(ptrA+8);
    t6 = _mm_loadu_pd(ptrA+10);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    z3 = _mm_unpacklo_pd(z3, x4);
    y4 = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd( t1, x1 ));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2, z1 ));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3, y2 ));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( t4, x3 ));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
#endif
static gmx_inline void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);
    t3 = _mm_loadu_pd(ptrB);
    t4 = _mm_load_sd(ptrB+2);

    t5 = _mm_unpacklo_pd(x1, y1);
    t6 = _mm_unpackhi_pd(x1, y1);
    t7 = _mm_unpackhi_pd(z1, z1);

    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, z1);

    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}
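/*
 * Added note: in the two-pointer variants, lane 0 of x1/y1/z1 is subtracted
 * from the rvec at ptrA and lane 1 from the rvec at ptrB, matching the lane
 * convention of the two-pointer loaders above.
 */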
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
        _t1  = _mm_loadu_pd(ptrA); \
        _t2  = _mm_loadu_pd(ptrA+2); \
        _t3  = _mm_loadu_pd(ptrA+4); \
        _t4  = _mm_loadu_pd(ptrA+6); \
        _t5  = _mm_load_sd(ptrA+8); \
        _t6  = _mm_loadu_pd(ptrB); \
        _t7  = _mm_loadu_pd(ptrB+2); \
        _t8  = _mm_loadu_pd(ptrB+4); \
        _t9  = _mm_loadu_pd(ptrB+6); \
        _t10 = _mm_load_sd(ptrB+8); \
        _tA  = _mm_unpacklo_pd(_x1, _y1); \
        _tB  = _mm_unpackhi_pd(_x1, _y1); \
        _tC  = _mm_unpacklo_pd(_z1, _x2); \
        _tD  = _mm_unpackhi_pd(_z1, _x2); \
        _tE  = _mm_unpacklo_pd(_y2, _z2); \
        _tF  = _mm_unpackhi_pd(_y2, _z2); \
        _tG  = _mm_unpacklo_pd(_x3, _y3); \
        _tH  = _mm_unpackhi_pd(_x3, _y3); \
        _tI  = _mm_unpackhi_pd(_z3, _z3); \
        _t1  = _mm_sub_pd(_t1, _tA); \
        _t2  = _mm_sub_pd(_t2, _tC); \
        _t3  = _mm_sub_pd(_t3, _tE); \
        _t4  = _mm_sub_pd(_t4, _tG); \
        _t5  = _mm_sub_sd(_t5, _z3); \
        _t6  = _mm_sub_pd(_t6, _tB); \
        _t7  = _mm_sub_pd(_t7, _tD); \
        _t8  = _mm_sub_pd(_t8, _tF); \
        _t9  = _mm_sub_pd(_t9, _tH); \
        _t10 = _mm_sub_sd(_t10, _tI); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_store_sd(ptrA+8, _t5); \
        _mm_storeu_pd(ptrB, _t6); \
        _mm_storeu_pd(ptrB+2, _t7); \
        _mm_storeu_pd(ptrB+4, _t8); \
        _mm_storeu_pd(ptrB+6, _t9); \
        _mm_store_sd(ptrB+8, _t10); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_load_sd(ptrA+8);
    t6  = _mm_loadu_pd(ptrB);
    t7  = _mm_loadu_pd(ptrB+2);
    t8  = _mm_loadu_pd(ptrB+4);
    t9  = _mm_loadu_pd(ptrB+6);
    t10 = _mm_load_sd(ptrB+8);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpackhi_pd(z3, z3);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_sd(t5, z3);

    t6  = _mm_sub_pd(t6, tB);
    t7  = _mm_sub_pd(t7, tD);
    t8  = _mm_sub_pd(t8, tF);
    t9  = _mm_sub_pd(t9, tH);
    t10 = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}
#endif
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
        _t1  = _mm_loadu_pd(ptrA); \
        _t2  = _mm_loadu_pd(ptrA+2); \
        _t3  = _mm_loadu_pd(ptrA+4); \
        _t4  = _mm_loadu_pd(ptrA+6); \
        _t5  = _mm_loadu_pd(ptrA+8); \
        _t6  = _mm_loadu_pd(ptrA+10); \
        _t7  = _mm_loadu_pd(ptrB); \
        _t8  = _mm_loadu_pd(ptrB+2); \
        _t9  = _mm_loadu_pd(ptrB+4); \
        _t10 = _mm_loadu_pd(ptrB+6); \
        _t11 = _mm_loadu_pd(ptrB+8); \
        _t12 = _mm_loadu_pd(ptrB+10); \
        _tA  = _mm_unpacklo_pd(_x1, _y1); \
        _tB  = _mm_unpackhi_pd(_x1, _y1); \
        _tC  = _mm_unpacklo_pd(_z1, _x2); \
        _tD  = _mm_unpackhi_pd(_z1, _x2); \
        _tE  = _mm_unpacklo_pd(_y2, _z2); \
        _tF  = _mm_unpackhi_pd(_y2, _z2); \
        _tG  = _mm_unpacklo_pd(_x3, _y3); \
        _tH  = _mm_unpackhi_pd(_x3, _y3); \
        _tI  = _mm_unpacklo_pd(_z3, _x4); \
        _tJ  = _mm_unpackhi_pd(_z3, _x4); \
        _tK  = _mm_unpacklo_pd(_y4, _z4); \
        _tL  = _mm_unpackhi_pd(_y4, _z4); \
        _t1  = _mm_sub_pd(_t1, _tA); \
        _t2  = _mm_sub_pd(_t2, _tC); \
        _t3  = _mm_sub_pd(_t3, _tE); \
        _t4  = _mm_sub_pd(_t4, _tG); \
        _t5  = _mm_sub_pd(_t5, _tI); \
        _t6  = _mm_sub_pd(_t6, _tK); \
        _t7  = _mm_sub_pd(_t7, _tB); \
        _t8  = _mm_sub_pd(_t8, _tD); \
        _t9  = _mm_sub_pd(_t9, _tF); \
        _t10 = _mm_sub_pd(_t10, _tH); \
        _t11 = _mm_sub_pd(_t11, _tJ); \
        _t12 = _mm_sub_pd(_t12, _tL); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_storeu_pd(ptrA+8, _t5); \
        _mm_storeu_pd(ptrA+10, _t6); \
        _mm_storeu_pd(ptrB, _t7); \
        _mm_storeu_pd(ptrB+2, _t8); \
        _mm_storeu_pd(ptrB+4, _t9); \
        _mm_storeu_pd(ptrB+6, _t10); \
        _mm_storeu_pd(ptrB+8, _t11); \
        _mm_storeu_pd(ptrB+10, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_loadu_pd(ptrA+8);
    t6  = _mm_loadu_pd(ptrA+10);
    t7  = _mm_loadu_pd(ptrB);
    t8  = _mm_loadu_pd(ptrB+2);
    t9  = _mm_loadu_pd(ptrB+4);
    t10 = _mm_loadu_pd(ptrB+6);
    t11 = _mm_loadu_pd(ptrB+8);
    t12 = _mm_loadu_pd(ptrB+10);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpacklo_pd(z3, x4);
    tJ  = _mm_unpackhi_pd(z3, x4);
    tK  = _mm_unpacklo_pd(y4, z4);
    tL  = _mm_unpackhi_pd(y4, z4);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_pd(t5, tI);
    t6  = _mm_sub_pd(t6, tK);

    t7  = _mm_sub_pd(t7, tB);
    t8  = _mm_sub_pd(t8, tD);
    t9  = _mm_sub_pd(t9, tF);
    t10 = _mm_sub_pd(t10, tH);
    t11 = _mm_sub_pd(t11, tJ);
    t12 = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_storeu_pd(ptrA+8, t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB, t7);
    _mm_storeu_pd(ptrB+2, t8);
    _mm_storeu_pd(ptrB+4, t9);
    _mm_storeu_pd(ptrB+6, t10);
    _mm_storeu_pd(ptrB+8, t11);
    _mm_storeu_pd(ptrB+10, t12);
}
#endif
static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fiz1);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
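/*
 * Added note: _mm_hadd_pd sums the two lanes, so fix1 ends up holding
 * {sum(fix1), sum(fiy1)} and fiz1 holds the summed z force in both lanes;
 * the totals are accumulated into both the i-particle force (fptr) and the
 * shift force (fshiftptr).
 */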
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                              fptr, fshiftptr) \
    { \
        __m128d _t1, _t2; \
        fix1 = _mm_hadd_pd(fix1, fiy1); \
        fiz1 = _mm_hadd_pd(fiz1, fix2); \
        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
        fix3 = _mm_hadd_pd(fix3, fiy3); \
        fiz3 = _mm_hadd_pd(fiz3, fiz3); \
        _mm_storeu_pd( fptr,   _mm_add_pd( _mm_loadu_pd(fptr),   fix1 )); \
        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
        _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
        fix1 = _mm_add_pd(fix1, fix3); \
        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
        fix1 = _mm_add_pd(fix1, _t1); \
        _t2  = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
        fiz1 = _mm_add_sd(fiz1, fiz3); \
        fiz1 = _mm_add_sd(fiz1, _t2); \
        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fiz3);

    _mm_storeu_pd( fptr,   _mm_add_pd( _mm_loadu_pd(fptr),   fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif
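/*
 * Added note: after the horizontal adds, each register holds two per-atom
 * component sums in memory order. The shuffles then gather the x2/y2 sums
 * into fix1 (leaving {sum of all x, sum of all y}) and the z2 sum into lane 0
 * of fiz1, which provides the totals accumulated into the shift force.
 */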
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                              fptr, fshiftptr) \
    { \
        __m128d _t1, _t2; \
        fix1 = _mm_hadd_pd(fix1, fiy1); \
        fiz1 = _mm_hadd_pd(fiz1, fix2); \
        fiy2 = _mm_hadd_pd(fiy2, fiz2); \
        fix3 = _mm_hadd_pd(fix3, fiy3); \
        fiz3 = _mm_hadd_pd(fiz3, fix4); \
        fiy4 = _mm_hadd_pd(fiy4, fiz4); \
        _mm_storeu_pd( fptr,    _mm_add_pd( _mm_loadu_pd(fptr),    fix1 )); \
        _mm_storeu_pd( fptr+2,  _mm_add_pd( _mm_loadu_pd(fptr+2),  fiz1 )); \
        _mm_storeu_pd( fptr+4,  _mm_add_pd( _mm_loadu_pd(fptr+4),  fiy2 )); \
        _mm_storeu_pd( fptr+6,  _mm_add_pd( _mm_loadu_pd(fptr+6),  fix3 )); \
        _mm_storeu_pd( fptr+8,  _mm_add_pd( _mm_loadu_pd(fptr+8),  fiz3 )); \
        _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
        fix1 = _mm_add_pd(fix1, _t1); \
        _t2  = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
        fix3 = _mm_add_pd(fix3, _t2); \
        fix1 = _mm_add_pd(fix1, fix3); \
        fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
        fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
        fiz1 = _mm_add_sd(fiz1, fiz3); \
        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fix4);
    fiy4 = _mm_hadd_pd(fiy4, fiz4);

    _mm_storeu_pd( fptr,    _mm_add_pd( _mm_loadu_pd(fptr),    fix1 ));
    _mm_storeu_pd( fptr+2,  _mm_add_pd( _mm_loadu_pd(fptr+2),  fiz1 ));
    _mm_storeu_pd( fptr+4,  _mm_add_pd( _mm_loadu_pd(fptr+4),  fiy2 ));
    _mm_storeu_pd( fptr+6,  _mm_add_pd( _mm_loadu_pd(fptr+6),  fix3 ));
    _mm_storeu_pd( fptr+8,  _mm_add_pd( _mm_loadu_pd(fptr+8),  fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif
static gmx_inline void
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_pd(pot1, pot1);
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}

static gmx_inline void
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}
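/*
 * Added note: _mm_hadd_pd(pot1, pot2) leaves the reduced pot1 in lane 0 and
 * the reduced pot2 in lane 1, so the unpackhi extracts the second sum before
 * both energies are accumulated to their separate addresses.
 */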
#endif /* _kernelutil_x86_sse4_1_double_h_ */