/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_sse2_double_h_
#define _kernelutil_x86_sse2_double_h_

#include "gromacs/simd/general_x86_sse2.h"

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3)  _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))

static gmx_inline int
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}

static gmx_inline __m128d
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
}

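/* Illustrative use only (not part of this header): with the coordinate
 * differences for two interaction pairs held component-wise in SIMD
 * registers, e.g.
 *
 *     dx  = _mm_sub_pd(ix, jx);
 *     dy  = _mm_sub_pd(iy, jy);
 *     dz  = _mm_sub_pd(iz, jz);
 *     rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
 *
 * each lane of rsq then holds dx*dx+dy*dy+dz*dz for one pair. The variable
 * names here are hypothetical.
 */
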
/* Load a double value from 1-2 places, merge into xmm register */
static gmx_inline __m128d
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}

static gmx_inline __m128d
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}

static gmx_inline void
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d               xmm1)
{
    __m128d t2;

    t2 = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}

static gmx_inline void
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}

/* Similar to store, but increments the value in memory */
static gmx_inline void
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}

static gmx_inline void
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}

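/* The 2real load/store/increment helpers above address two independent
 * memory locations at once: lane 0 of the xmm register corresponds to ptrA
 * and lane 1 to ptrB. A minimal sketch (hypothetical names, illustration
 * only) of accumulating one scalar per j particle:
 *
 *     __m128d v = gmx_mm_load_2real_swizzle_pd(valA, valB);
 *     v         = _mm_add_pd(v, vnew);
 *     gmx_mm_store_2real_swizzle_pd(valA, valB, v);
 *
 * or, equivalently, a single call to gmx_mm_increment_2real_swizzle_pd().
 */
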
static gmx_inline void
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    __m128d t1, t2;

    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}

static gmx_inline void
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    *c6  = _mm_load_sd(p1);
    *c12 = _mm_load_sd(p1+1);
}

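/* Here the "pair" is two parameters stored consecutively per entry; in the
 * nonbonded kernels these are the Lennard-Jones c6 and c12 coefficients.
 * The 2pair variant gathers the pairs for two j particles and transposes
 * them, so *c6 holds both c6 values and *c12 both c12 values.
 */
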
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict      x1,
                                         __m128d * gmx_restrict      y1,
                                         __m128d * gmx_restrict      z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}

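/* The *rvec_broadcast loaders read interaction-site coordinates stored as
 * consecutive (x, y, z) doubles, add the shift vector from xyz_shift (the
 * periodic-image shift), and broadcast each resulting component to both
 * lanes, so one i site can be paired with two j sites per iteration.
 */
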
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}

static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}

static gmx_inline void
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x = _mm_load_sd(p1);
    *y = _mm_load_sd(p1+1);
    *z = _mm_load_sd(p1+2);
}

static gmx_inline void
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
}

static gmx_inline void
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
    *x4 = _mm_load_sd(p1+9);
    *y4 = _mm_load_sd(p1+10);
    *z4 = _mm_load_sd(p1+11);
}

static gmx_inline void
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_load_sd(ptrA+2);
    t4  = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1 = t1;
    *y1 = t2;
    *z1 = _mm_unpacklo_pd(t3, t4);
}

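/* The *_2ptr loaders transpose array-of-structures coordinates from two
 * j-particle pointers into structure-of-arrays form: after the 2x2
 * transposes, lane 0 of every output register comes from ptrA and lane 1
 * from ptrB.
 */
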
static gmx_inline void
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    t7  = _mm_loadu_pd(ptrA+6);
    t8  = _mm_loadu_pd(ptrB+6);
    t9  = _mm_load_sd(ptrA+8);
    t10 = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    *x3 = t7;
    *y3 = t8;
    *z3 = _mm_unpacklo_pd(t9, t10);
}

static gmx_inline void
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;

    t1  = _mm_loadu_pd(ptrA+6);
    t2  = _mm_loadu_pd(ptrB+6);
    t3  = _mm_loadu_pd(ptrA+8);
    t4  = _mm_loadu_pd(ptrB+8);
    t5  = _mm_loadu_pd(ptrA+10);
    t6  = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3 = t1;
    *y3 = t2;
    *z3 = t3;
    *x4 = t4;
    *y4 = t5;
    *z4 = t6;
}

/* Routines to decrement rvec in memory, typically used for j particle force updates */
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
                                         __m128d xy, __m128d z)
{
    __m128d t1, t2;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_pd(t1, xy);
    t2 = _mm_sub_sd(t2, z);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
}

static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1 = _mm_load_sd(ptrA);
    t2 = _mm_load_sd(ptrA+1);
    t3 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_sd(t1, x1);
    t2 = _mm_sub_sd(t2, y1);
    t3 = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}

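/* Note that the 1ptr swizzle variant above only consumes lane 0 of each
 * force register, which makes it suitable for a single remaining j particle.
 * Sketch with hypothetical names:
 *
 *     gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j3, fjx, fjy, fjz);
 *
 * subtracts the low-lane force components from f[j3], f[j3+1] and f[j3+2].
 */
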
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5; \
        _t1 = _mm_loadu_pd(ptrA); \
        _t2 = _mm_loadu_pd(ptrA+2); \
        _t3 = _mm_loadu_pd(ptrA+4); \
        _t4 = _mm_loadu_pd(ptrA+6); \
        _t5 = _mm_load_sd(ptrA+8); \
        _x1 = _mm_unpacklo_pd(_x1, _y1); \
        _z1 = _mm_unpacklo_pd(_z1, _x2); \
        _y2 = _mm_unpacklo_pd(_y2, _z2); \
        _x3 = _mm_unpacklo_pd(_x3, _y3); \
        _t1 = _mm_sub_pd(_t1, _x1); \
        _t2 = _mm_sub_pd(_t2, _z1); \
        _t3 = _mm_sub_pd(_t3, _y2); \
        _t4 = _mm_sub_pd(_t4, _x3); \
        _t5 = _mm_sub_sd(_t5, _z3); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_store_sd(ptrA+8, _t5); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_load_sd(ptrA+8);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1 = _mm_sub_pd(t1, x1);
    t2 = _mm_sub_pd(t2, z1);
    t3 = _mm_sub_pd(t3, y2);
    t4 = _mm_sub_pd(t4, x3);
    t5 = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
        _t1 = _mm_loadu_pd(ptrA); \
        _t2 = _mm_loadu_pd(ptrA+2); \
        _t3 = _mm_loadu_pd(ptrA+4); \
        _t4 = _mm_loadu_pd(ptrA+6); \
        _t5 = _mm_loadu_pd(ptrA+8); \
        _t6 = _mm_loadu_pd(ptrA+10); \
        _x1 = _mm_unpacklo_pd(_x1, _y1); \
        _z1 = _mm_unpacklo_pd(_z1, _x2); \
        _y2 = _mm_unpacklo_pd(_y2, _z2); \
        _x3 = _mm_unpacklo_pd(_x3, _y3); \
        _z3 = _mm_unpacklo_pd(_z3, _x4); \
        _y4 = _mm_unpacklo_pd(_y4, _z4); \
        _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1, _x1 )); \
        _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2, _z1 )); \
        _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3, _y2 )); \
        _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4, _x3 )); \
        _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5, _z3 )); \
        _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_loadu_pd(ptrA+8);
    t6 = _mm_loadu_pd(ptrA+10);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    z3 = _mm_unpacklo_pd(z3, x4);
    y4 = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd( t1, x1 ));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2, z1 ));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3, y2 ));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( t4, x3 ));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
#endif

static gmx_inline void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);
    t3 = _mm_loadu_pd(ptrB);
    t4 = _mm_load_sd(ptrB+2);

    t5 = _mm_unpacklo_pd(x1, y1);
    t6 = _mm_unpackhi_pd(x1, y1);
    t7 = _mm_unpackhi_pd(z1, z1);

    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, z1);

    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
        _t1  = _mm_loadu_pd(ptrA); \
        _t2  = _mm_loadu_pd(ptrA+2); \
        _t3  = _mm_loadu_pd(ptrA+4); \
        _t4  = _mm_loadu_pd(ptrA+6); \
        _t5  = _mm_load_sd(ptrA+8); \
        _t6  = _mm_loadu_pd(ptrB); \
        _t7  = _mm_loadu_pd(ptrB+2); \
        _t8  = _mm_loadu_pd(ptrB+4); \
        _t9  = _mm_loadu_pd(ptrB+6); \
        _t10 = _mm_load_sd(ptrB+8); \
        _tA  = _mm_unpacklo_pd(_x1, _y1); \
        _tB  = _mm_unpackhi_pd(_x1, _y1); \
        _tC  = _mm_unpacklo_pd(_z1, _x2); \
        _tD  = _mm_unpackhi_pd(_z1, _x2); \
        _tE  = _mm_unpacklo_pd(_y2, _z2); \
        _tF  = _mm_unpackhi_pd(_y2, _z2); \
        _tG  = _mm_unpacklo_pd(_x3, _y3); \
        _tH  = _mm_unpackhi_pd(_x3, _y3); \
        _tI  = _mm_unpackhi_pd(_z3, _z3); \
        _t1  = _mm_sub_pd(_t1, _tA); \
        _t2  = _mm_sub_pd(_t2, _tC); \
        _t3  = _mm_sub_pd(_t3, _tE); \
        _t4  = _mm_sub_pd(_t4, _tG); \
        _t5  = _mm_sub_sd(_t5, _z3); \
        _t6  = _mm_sub_pd(_t6, _tB); \
        _t7  = _mm_sub_pd(_t7, _tD); \
        _t8  = _mm_sub_pd(_t8, _tF); \
        _t9  = _mm_sub_pd(_t9, _tH); \
        _t10 = _mm_sub_sd(_t10, _tI); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_store_sd(ptrA+8, _t5); \
        _mm_storeu_pd(ptrB, _t6); \
        _mm_storeu_pd(ptrB+2, _t7); \
        _mm_storeu_pd(ptrB+4, _t8); \
        _mm_storeu_pd(ptrB+6, _t9); \
        _mm_store_sd(ptrB+8, _t10); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_load_sd(ptrA+8);
    t6  = _mm_loadu_pd(ptrB);
    t7  = _mm_loadu_pd(ptrB+2);
    t8  = _mm_loadu_pd(ptrB+4);
    t9  = _mm_loadu_pd(ptrB+6);
    t10 = _mm_load_sd(ptrB+8);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpackhi_pd(z3, z3);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_sd(t5, z3);

    t6  = _mm_sub_pd(t6, tB);
    t7  = _mm_sub_pd(t7, tD);
    t8  = _mm_sub_pd(t8, tF);
    t9  = _mm_sub_pd(t9, tH);
    t10 = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
        _t1  = _mm_loadu_pd(ptrA); \
        _t2  = _mm_loadu_pd(ptrA+2); \
        _t3  = _mm_loadu_pd(ptrA+4); \
        _t4  = _mm_loadu_pd(ptrA+6); \
        _t5  = _mm_loadu_pd(ptrA+8); \
        _t6  = _mm_loadu_pd(ptrA+10); \
        _t7  = _mm_loadu_pd(ptrB); \
        _t8  = _mm_loadu_pd(ptrB+2); \
        _t9  = _mm_loadu_pd(ptrB+4); \
        _t10 = _mm_loadu_pd(ptrB+6); \
        _t11 = _mm_loadu_pd(ptrB+8); \
        _t12 = _mm_loadu_pd(ptrB+10); \
        _tA  = _mm_unpacklo_pd(_x1, _y1); \
        _tB  = _mm_unpackhi_pd(_x1, _y1); \
        _tC  = _mm_unpacklo_pd(_z1, _x2); \
        _tD  = _mm_unpackhi_pd(_z1, _x2); \
        _tE  = _mm_unpacklo_pd(_y2, _z2); \
        _tF  = _mm_unpackhi_pd(_y2, _z2); \
        _tG  = _mm_unpacklo_pd(_x3, _y3); \
        _tH  = _mm_unpackhi_pd(_x3, _y3); \
        _tI  = _mm_unpacklo_pd(_z3, _x4); \
        _tJ  = _mm_unpackhi_pd(_z3, _x4); \
        _tK  = _mm_unpacklo_pd(_y4, _z4); \
        _tL  = _mm_unpackhi_pd(_y4, _z4); \
        _t1  = _mm_sub_pd(_t1, _tA); \
        _t2  = _mm_sub_pd(_t2, _tC); \
        _t3  = _mm_sub_pd(_t3, _tE); \
        _t4  = _mm_sub_pd(_t4, _tG); \
        _t5  = _mm_sub_pd(_t5, _tI); \
        _t6  = _mm_sub_pd(_t6, _tK); \
        _t7  = _mm_sub_pd(_t7, _tB); \
        _t8  = _mm_sub_pd(_t8, _tD); \
        _t9  = _mm_sub_pd(_t9, _tF); \
        _t10 = _mm_sub_pd(_t10, _tH); \
        _t11 = _mm_sub_pd(_t11, _tJ); \
        _t12 = _mm_sub_pd(_t12, _tL); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_storeu_pd(ptrA+8, _t5); \
        _mm_storeu_pd(ptrA+10, _t6); \
        _mm_storeu_pd(ptrB, _t7); \
        _mm_storeu_pd(ptrB+2, _t8); \
        _mm_storeu_pd(ptrB+4, _t9); \
        _mm_storeu_pd(ptrB+6, _t10); \
        _mm_storeu_pd(ptrB+8, _t11); \
        _mm_storeu_pd(ptrB+10, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_loadu_pd(ptrA+8);
    t6  = _mm_loadu_pd(ptrA+10);
    t7  = _mm_loadu_pd(ptrB);
    t8  = _mm_loadu_pd(ptrB+2);
    t9  = _mm_loadu_pd(ptrB+4);
    t10 = _mm_loadu_pd(ptrB+6);
    t11 = _mm_loadu_pd(ptrB+8);
    t12 = _mm_loadu_pd(ptrB+10);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpacklo_pd(z3, x4);
    tJ  = _mm_unpackhi_pd(z3, x4);
    tK  = _mm_unpacklo_pd(y4, z4);
    tL  = _mm_unpackhi_pd(y4, z4);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_pd(t5, tI);
    t6  = _mm_sub_pd(t6, tK);

    t7  = _mm_sub_pd(t7, tB);
    t8  = _mm_sub_pd(t8, tD);
    t9  = _mm_sub_pd(t9, tF);
    t10 = _mm_sub_pd(t10, tH);
    t11 = _mm_sub_pd(t11, tJ);
    t12 = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_storeu_pd(ptrA+8, t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB, t7);
    _mm_storeu_pd(ptrB+2, t8);
    _mm_storeu_pd(ptrB+4, t9);
    _mm_storeu_pd(ptrB+6, t10);
    _mm_storeu_pd(ptrB+8, t11);
    _mm_storeu_pd(ptrB+10, t12);
}
#endif

static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1;

    t1   = fix1;
    fix1 = _mm_unpacklo_pd(fix1, fiy1); /* y0 x0 */
    fiy1 = _mm_unpackhi_pd(t1, fiy1);   /* y1 x1 */

    fix1 = _mm_add_pd(fix1, fiy1);
    fiz1 = _mm_add_sd( fiz1, _mm_unpackhi_pd(fiz1, fiz1 ));

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}

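/* The update_iforce routines sum the two SIMD lanes of the accumulated
 * i-particle forces, add the totals to the force array at fptr, and add the
 * same totals to the shift-force accumulator at fshiftptr (used later for
 * the virial).
 */
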
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                              fptr, fshiftptr) \
    { \
        __m128d _t1, _t2; \
        GMX_MM_TRANSPOSE2_PD(fix1, fiy1); \
        GMX_MM_TRANSPOSE2_PD(fiz1, fix2); \
        GMX_MM_TRANSPOSE2_PD(fiy2, fiz2); \
        _t1  = fix3; \
        fix3 = _mm_unpacklo_pd(fix3, fiy3); \
        fiy3 = _mm_unpackhi_pd(_t1, fiy3); \
        fix1 = _mm_add_pd(fix1, fiy1); \
        fiz1 = _mm_add_pd(fiz1, fix2); \
        fiy2 = _mm_add_pd(fiy2, fiz2); \
        fix3 = _mm_add_pd(fix3, fiy3); \
        fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3, fiz3)); \
        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
        _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
        fix1 = _mm_add_pd(fix1, fix3); \
        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
        fix1 = _mm_add_pd(fix1, _t1); \
        _t2  = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
        fiz1 = _mm_add_sd(fiz1, fiz3); \
        fiz1 = _mm_add_sd(fiz1, _t2); \
        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    GMX_MM_TRANSPOSE2_PD(fix1, fiy1);
    GMX_MM_TRANSPOSE2_PD(fiz1, fix2);
    GMX_MM_TRANSPOSE2_PD(fiy2, fiz2);

    t1   = fix3;
    fix3 = _mm_unpacklo_pd(fix3, fiy3); /* y0 x0 */
    fiy3 = _mm_unpackhi_pd(t1, fiy3);   /* y1 x1 */

    fix1 = _mm_add_pd(fix1, fiy1);
    fiz1 = _mm_add_pd(fiz1, fix2);
    fiy2 = _mm_add_pd(fiy2, fiz2);

    fix3 = _mm_add_pd(fix3, fiy3);
    fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3, fiz3));

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                              fptr, fshiftptr) \
    { \
        __m128d _t1, _t2; \
        GMX_MM_TRANSPOSE2_PD(fix1, fiy1); \
        GMX_MM_TRANSPOSE2_PD(fiz1, fix2); \
        GMX_MM_TRANSPOSE2_PD(fiy2, fiz2); \
        GMX_MM_TRANSPOSE2_PD(fix3, fiy3); \
        GMX_MM_TRANSPOSE2_PD(fiz3, fix4); \
        GMX_MM_TRANSPOSE2_PD(fiy4, fiz4); \
        fix1 = _mm_add_pd(fix1, fiy1); \
        fiz1 = _mm_add_pd(fiz1, fix2); \
        fiy2 = _mm_add_pd(fiy2, fiz2); \
        fix3 = _mm_add_pd(fix3, fiy3); \
        fiz3 = _mm_add_pd(fiz3, fix4); \
        fiy4 = _mm_add_pd(fiy4, fiz4); \
        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
        _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); \
        _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
        fix1 = _mm_add_pd(fix1, _t1); \
        _t2  = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
        fix3 = _mm_add_pd(fix3, _t2); \
        fix1 = _mm_add_pd(fix1, fix3); \
        fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
        fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
        fiz1 = _mm_add_sd(fiz1, fiz3); \
        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    GMX_MM_TRANSPOSE2_PD(fix1, fiy1);
    GMX_MM_TRANSPOSE2_PD(fiz1, fix2);
    GMX_MM_TRANSPOSE2_PD(fiy2, fiz2);
    GMX_MM_TRANSPOSE2_PD(fix3, fiy3);
    GMX_MM_TRANSPOSE2_PD(fiz3, fix4);
    GMX_MM_TRANSPOSE2_PD(fiy4, fiz4);

    fix1 = _mm_add_pd(fix1, fiy1);
    fiz1 = _mm_add_pd(fiz1, fix2);
    fiy2 = _mm_add_pd(fiy2, fiz2);
    fix3 = _mm_add_pd(fix3, fiy3);
    fiz3 = _mm_add_pd(fiz3, fix4);
    fiy4 = _mm_add_pd(fiy4, fiz4);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif

static gmx_inline void
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_add_pd(pot1, _mm_unpackhi_pd(pot1, pot1));
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}

static gmx_inline void
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    GMX_MM_TRANSPOSE2_PD(pot1, pot2);
    pot1 = _mm_add_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}

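/* Minimal sketch of how these helpers fit together in a kernel inner loop
 * (illustration only; variable names are hypothetical and the interaction
 * math is omitted):
 *
 *     gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j3A, x+j3B, &jx, &jy, &jz);
 *     dx   = _mm_sub_pd(ix, jx);
 *     dy   = _mm_sub_pd(iy, jy);
 *     dz   = _mm_sub_pd(iz, jz);
 *     rsq  = gmx_mm_calc_rsq_pd(dx, dy, dz);
 *     ... compute the scalar force fscal and the potential from rsq ...
 *     tx   = _mm_mul_pd(fscal, dx);
 *     ty   = _mm_mul_pd(fscal, dy);
 *     tz   = _mm_mul_pd(fscal, dz);
 *     fix1 = _mm_add_pd(fix1, tx);
 *     fiy1 = _mm_add_pd(fiy1, ty);
 *     fiz1 = _mm_add_pd(fiz1, tz);
 *     gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j3A, f+j3B, tx, ty, tz);
 *
 * and after the loop over j:
 *
 *     gmx_mm_update_iforce_1atom_swizzle_pd(fix1, fiy1, fiz1, f+i3, fshift+is3);
 *     gmx_mm_update_1pot_pd(vvdwsum, &vvdw[ggid]);
 */
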
#endif /* _kernelutil_x86_sse2_double_h_ */