/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_sse2_double_h_
#define _kernelutil_x86_sse2_double_h_

#include <math.h>
#include <emmintrin.h>   /* SSE2 intrinsics (_mm_*_pd) used throughout this header */

/* gmx_inline and gmx_restrict are defined by the GROMACS build/configuration headers. */
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3)  _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))

#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
#define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
#define GMX_MM_TRANSPOSE2_PD(row0, row1) {                 \
        __m128d __gmx_t1 = row0;                           \
        row0             = _mm_unpacklo_pd(row0, row1);    \
        row1             = _mm_unpackhi_pd(__gmx_t1, row1); \
}
static gmx_inline int
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}
static gmx_inline __m128d
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
}
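/*
 * Illustrative sketch (not part of the original header): a typical inner-loop
 * fragment combining gmx_mm_calc_rsq_pd() with gmx_mm_any_lt() to skip a pair
 * of j atoms when both distances are beyond the cutoff. The variable names
 * (ix, iy, iz, jx, jy, jz, rcutoff2) are hypothetical.
 *
 *     __m128d dx  = _mm_sub_pd(ix, jx);
 *     __m128d dy  = _mm_sub_pd(iy, jy);
 *     __m128d dz  = _mm_sub_pd(iz, jz);
 *     __m128d rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
 *     if (gmx_mm_any_lt(rsq, rcutoff2))
 *     {
 *         // at least one of the two pair distances is inside the cutoff
 *     }
 */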
/* Load a double value from 1-2 places, merge into xmm register */
static gmx_inline __m128d
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}
static gmx_inline __m128d
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}
static gmx_inline void
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d xmm1)
{
    __m128d t2;

    t2 = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}
static gmx_inline void
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}
/* Similar to store, but increments the value in memory */
static gmx_inline void
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}
static gmx_inline void
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}
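/*
 * Illustrative note (not part of the original header): the store routines
 * overwrite memory, while the increment routines add to what is already there.
 * Accumulating a per-atom quantity for two interaction partners held in one
 * xmm register therefore uses the increment variant; ptrA, ptrB and v are
 * hypothetical names.
 *
 *     gmx_mm_store_2real_swizzle_pd(ptrA, ptrB, v);      // *ptrA  = low lane, *ptrB  = high lane
 *     gmx_mm_increment_2real_swizzle_pd(ptrA, ptrB, v);  // *ptrA += low lane, *ptrB += high lane
 */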
static gmx_inline void
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    __m128d t1, t2;

    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}
static gmx_inline void
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    *c6  = _mm_load_sd(p1);
    *c12 = _mm_load_sd(p1+1);
}
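/*
 * Illustrative sketch (not part of the original header): loading Lennard-Jones
 * c6/c12 pair parameters for two j atoms from a flat table laid out as
 * {c6, c12, c6, c12, ...}. The table name and index variables (vdwparam,
 * typeA, typeB) are hypothetical.
 *
 *     __m128d c6, c12;
 *     gmx_mm_load_2pair_swizzle_pd(vdwparam + 2*typeA,
 *                                  vdwparam + 2*typeB,
 *                                  &c6, &c12);
 *     // c6 now holds {c6(A), c6(B)} and c12 holds {c12(A), c12(B)}
 */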
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1,
                                         __m128d * gmx_restrict y1,
                                         __m128d * gmx_restrict z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}
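/*
 * Illustrative note (not part of the original header): xyz_shift points to the
 * three doubles of a periodic shift vector and xyz to the three doubles of an
 * i-atom position; each shifted coordinate is broadcast to both lanes of an
 * xmm register so it can be used against two j atoms at a time. A hypothetical
 * call site (shiftvec, is3, x and i_offset are assumed names):
 *
 *     __m128d ix, iy, iz;
 *     gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec + is3, x + i_offset,
 *                                              &ix, &iy, &iz);
 */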
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}
static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}
static gmx_inline void
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x = _mm_load_sd(p1);
    *y = _mm_load_sd(p1+1);
    *z = _mm_load_sd(p1+2);
}
static gmx_inline void
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
}
static gmx_inline void
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
    *x4 = _mm_load_sd(p1+9);
    *y4 = _mm_load_sd(p1+10);
    *z4 = _mm_load_sd(p1+11);
}
static gmx_inline void
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;
    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_load_sd(ptrA+2);
    t4  = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1 = t1;
    *y1 = t2;
    *z1 = _mm_unpacklo_pd(t3, t4);
}
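/*
 * Illustrative sketch (not part of the original header): combining the
 * broadcast i-atom loader with this two-pointer j-atom loader to form the
 * distance vectors for two i-j pairs at once. x, jnrA and jnrB are
 * hypothetical names for the coordinate array and the two j-atom indices.
 *
 *     __m128d jx1, jy1, jz1, dx, dy, dz, rsq;
 *     gmx_mm_load_1rvec_2ptr_swizzle_pd(x + 3*jnrA, x + 3*jnrB, &jx1, &jy1, &jz1);
 *     dx  = _mm_sub_pd(ix, jx1);
 *     dy  = _mm_sub_pd(iy, jy1);
 *     dz  = _mm_sub_pd(iz, jz1);
 *     rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
 */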
static gmx_inline void
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    t7  = _mm_loadu_pd(ptrA+6);
    t8  = _mm_loadu_pd(ptrB+6);
    t9  = _mm_load_sd(ptrA+8);
    t10 = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    *x3 = t7;
    *y3 = t8;
    *z3 = _mm_unpacklo_pd(t9, t10);
}
static gmx_inline void
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;
    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    t1  = _mm_loadu_pd(ptrA+6);
    t2  = _mm_loadu_pd(ptrB+6);
    t3  = _mm_loadu_pd(ptrA+8);
    t4  = _mm_loadu_pd(ptrB+8);
    t5  = _mm_loadu_pd(ptrA+10);
    t6  = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3 = t1;
    *y3 = t2;
    *z3 = t3;
    *x4 = t4;
    *y4 = t5;
    *z4 = t6;
}
/* Routines to decrement an rvec in memory, typically used for j-particle force updates */
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
                                         __m128d xy, __m128d z)
{
    __m128d t1, t2;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_pd(t1, xy);
    t2 = _mm_sub_sd(t2, z);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
}
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1 = _mm_load_sd(ptrA);
    t2 = _mm_load_sd(ptrA+1);
    t3 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_sd(t1, x1);
    t2 = _mm_sub_sd(t2, y1);
    t3 = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5; \
        _t1 = _mm_loadu_pd(ptrA); \
        _t2 = _mm_loadu_pd(ptrA+2); \
        _t3 = _mm_loadu_pd(ptrA+4); \
        _t4 = _mm_loadu_pd(ptrA+6); \
        _t5 = _mm_load_sd(ptrA+8); \
        _x1 = _mm_unpacklo_pd(_x1, _y1); \
        _z1 = _mm_unpacklo_pd(_z1, _x2); \
        _y2 = _mm_unpacklo_pd(_y2, _z2); \
        _x3 = _mm_unpacklo_pd(_x3, _y3); \
        _t1 = _mm_sub_pd(_t1, _x1); \
        _t2 = _mm_sub_pd(_t2, _z1); \
        _t3 = _mm_sub_pd(_t3, _y2); \
        _t4 = _mm_sub_pd(_t4, _x3); \
        _t5 = _mm_sub_sd(_t5, _z3); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_store_sd(ptrA+8, _t5); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_load_sd(ptrA+8);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1 = _mm_sub_pd(t1, x1);
    t2 = _mm_sub_pd(t2, z1);
    t3 = _mm_sub_pd(t3, y2);
    t4 = _mm_sub_pd(t4, x3);
    t5 = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}
#endif
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
        _t1 = _mm_loadu_pd(ptrA); \
        _t2 = _mm_loadu_pd(ptrA+2); \
        _t3 = _mm_loadu_pd(ptrA+4); \
        _t4 = _mm_loadu_pd(ptrA+6); \
        _t5 = _mm_loadu_pd(ptrA+8); \
        _t6 = _mm_loadu_pd(ptrA+10); \
        _x1 = _mm_unpacklo_pd(_x1, _y1); \
        _z1 = _mm_unpacklo_pd(_z1, _x2); \
        _y2 = _mm_unpacklo_pd(_y2, _z2); \
        _x3 = _mm_unpacklo_pd(_x3, _y3); \
        _z3 = _mm_unpacklo_pd(_z3, _x4); \
        _y4 = _mm_unpacklo_pd(_y4, _z4); \
        _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1, _x1 )); \
        _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2, _z1 )); \
        _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3, _y2 )); \
        _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4, _x3 )); \
        _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5, _z3 )); \
        _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_loadu_pd(ptrA+8);
    t6 = _mm_loadu_pd(ptrA+10);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    z3 = _mm_unpacklo_pd(z3, x4);
    y4 = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd( t1, x1 ));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2, z1 ));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3, y2 ));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( t4, x3 ));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
#endif
static gmx_inline void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);
    t3 = _mm_loadu_pd(ptrB);
    t4 = _mm_load_sd(ptrB+2);

    t5 = _mm_unpacklo_pd(x1, y1);
    t6 = _mm_unpackhi_pd(x1, y1);
    t7 = _mm_unpackhi_pd(z1, z1);

    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, z1);

    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}
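/*
 * Illustrative sketch (not part of the original header): with two j atoms
 * processed per register, the low lane of each force component belongs to the
 * first j atom and the high lane to the second, so the two-pointer decrement
 * splits them back out to the two force-array addresses. f, jnrA, jnrB and
 * fx/fy/fz are hypothetical names.
 *
 *     gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f + 3*jnrA, f + 3*jnrB, fx, fy, fz);
 */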
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
        _t1  = _mm_loadu_pd(ptrA); \
        _t2  = _mm_loadu_pd(ptrA+2); \
        _t3  = _mm_loadu_pd(ptrA+4); \
        _t4  = _mm_loadu_pd(ptrA+6); \
        _t5  = _mm_load_sd(ptrA+8); \
        _t6  = _mm_loadu_pd(ptrB); \
        _t7  = _mm_loadu_pd(ptrB+2); \
        _t8  = _mm_loadu_pd(ptrB+4); \
        _t9  = _mm_loadu_pd(ptrB+6); \
        _t10 = _mm_load_sd(ptrB+8); \
        _tA  = _mm_unpacklo_pd(_x1, _y1); \
        _tB  = _mm_unpackhi_pd(_x1, _y1); \
        _tC  = _mm_unpacklo_pd(_z1, _x2); \
        _tD  = _mm_unpackhi_pd(_z1, _x2); \
        _tE  = _mm_unpacklo_pd(_y2, _z2); \
        _tF  = _mm_unpackhi_pd(_y2, _z2); \
        _tG  = _mm_unpacklo_pd(_x3, _y3); \
        _tH  = _mm_unpackhi_pd(_x3, _y3); \
        _tI  = _mm_unpackhi_pd(_z3, _z3); \
        _t1  = _mm_sub_pd(_t1, _tA); \
        _t2  = _mm_sub_pd(_t2, _tC); \
        _t3  = _mm_sub_pd(_t3, _tE); \
        _t4  = _mm_sub_pd(_t4, _tG); \
        _t5  = _mm_sub_sd(_t5, _z3); \
        _t6  = _mm_sub_pd(_t6, _tB); \
        _t7  = _mm_sub_pd(_t7, _tD); \
        _t8  = _mm_sub_pd(_t8, _tF); \
        _t9  = _mm_sub_pd(_t9, _tH); \
        _t10 = _mm_sub_sd(_t10, _tI); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_store_sd(ptrA+8, _t5); \
        _mm_storeu_pd(ptrB, _t6); \
        _mm_storeu_pd(ptrB+2, _t7); \
        _mm_storeu_pd(ptrB+4, _t8); \
        _mm_storeu_pd(ptrB+6, _t9); \
        _mm_store_sd(ptrB+8, _t10); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_load_sd(ptrA+8);
    t6  = _mm_loadu_pd(ptrB);
    t7  = _mm_loadu_pd(ptrB+2);
    t8  = _mm_loadu_pd(ptrB+4);
    t9  = _mm_loadu_pd(ptrB+6);
    t10 = _mm_load_sd(ptrB+8);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpackhi_pd(z3, z3);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_sd(t5, z3);

    t6  = _mm_sub_pd(t6, tB);
    t7  = _mm_sub_pd(t7, tD);
    t8  = _mm_sub_pd(t8, tF);
    t9  = _mm_sub_pd(t9, tH);
    t10 = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}
#endif
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
        __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
        _t1  = _mm_loadu_pd(ptrA); \
        _t2  = _mm_loadu_pd(ptrA+2); \
        _t3  = _mm_loadu_pd(ptrA+4); \
        _t4  = _mm_loadu_pd(ptrA+6); \
        _t5  = _mm_loadu_pd(ptrA+8); \
        _t6  = _mm_loadu_pd(ptrA+10); \
        _t7  = _mm_loadu_pd(ptrB); \
        _t8  = _mm_loadu_pd(ptrB+2); \
        _t9  = _mm_loadu_pd(ptrB+4); \
        _t10 = _mm_loadu_pd(ptrB+6); \
        _t11 = _mm_loadu_pd(ptrB+8); \
        _t12 = _mm_loadu_pd(ptrB+10); \
        _tA  = _mm_unpacklo_pd(_x1, _y1); \
        _tB  = _mm_unpackhi_pd(_x1, _y1); \
        _tC  = _mm_unpacklo_pd(_z1, _x2); \
        _tD  = _mm_unpackhi_pd(_z1, _x2); \
        _tE  = _mm_unpacklo_pd(_y2, _z2); \
        _tF  = _mm_unpackhi_pd(_y2, _z2); \
        _tG  = _mm_unpacklo_pd(_x3, _y3); \
        _tH  = _mm_unpackhi_pd(_x3, _y3); \
        _tI  = _mm_unpacklo_pd(_z3, _x4); \
        _tJ  = _mm_unpackhi_pd(_z3, _x4); \
        _tK  = _mm_unpacklo_pd(_y4, _z4); \
        _tL  = _mm_unpackhi_pd(_y4, _z4); \
        _t1  = _mm_sub_pd(_t1, _tA); \
        _t2  = _mm_sub_pd(_t2, _tC); \
        _t3  = _mm_sub_pd(_t3, _tE); \
        _t4  = _mm_sub_pd(_t4, _tG); \
        _t5  = _mm_sub_pd(_t5, _tI); \
        _t6  = _mm_sub_pd(_t6, _tK); \
        _t7  = _mm_sub_pd(_t7, _tB); \
        _t8  = _mm_sub_pd(_t8, _tD); \
        _t9  = _mm_sub_pd(_t9, _tF); \
        _t10 = _mm_sub_pd(_t10, _tH); \
        _t11 = _mm_sub_pd(_t11, _tJ); \
        _t12 = _mm_sub_pd(_t12, _tL); \
        _mm_storeu_pd(ptrA, _t1); \
        _mm_storeu_pd(ptrA+2, _t2); \
        _mm_storeu_pd(ptrA+4, _t3); \
        _mm_storeu_pd(ptrA+6, _t4); \
        _mm_storeu_pd(ptrA+8, _t5); \
        _mm_storeu_pd(ptrA+10, _t6); \
        _mm_storeu_pd(ptrB, _t7); \
        _mm_storeu_pd(ptrB+2, _t8); \
        _mm_storeu_pd(ptrB+4, _t9); \
        _mm_storeu_pd(ptrB+6, _t10); \
        _mm_storeu_pd(ptrB+8, _t11); \
        _mm_storeu_pd(ptrB+10, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_loadu_pd(ptrA+8);
    t6  = _mm_loadu_pd(ptrA+10);
    t7  = _mm_loadu_pd(ptrB);
    t8  = _mm_loadu_pd(ptrB+2);
    t9  = _mm_loadu_pd(ptrB+4);
    t10 = _mm_loadu_pd(ptrB+6);
    t11 = _mm_loadu_pd(ptrB+8);
    t12 = _mm_loadu_pd(ptrB+10);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpacklo_pd(z3, x4);
    tJ  = _mm_unpackhi_pd(z3, x4);
    tK  = _mm_unpacklo_pd(y4, z4);
    tL  = _mm_unpackhi_pd(y4, z4);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_pd(t5, tI);
    t6  = _mm_sub_pd(t6, tK);

    t7  = _mm_sub_pd(t7, tB);
    t8  = _mm_sub_pd(t8, tD);
    t9  = _mm_sub_pd(t9, tF);
    t10 = _mm_sub_pd(t10, tH);
    t11 = _mm_sub_pd(t11, tJ);
    t12 = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_storeu_pd(ptrA+8, t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB, t7);
    _mm_storeu_pd(ptrB+2, t8);
    _mm_storeu_pd(ptrB+4, t9);
    _mm_storeu_pd(ptrB+6, t10);
    _mm_storeu_pd(ptrB+8, t11);
    _mm_storeu_pd(ptrB+10, t12);
}
#endif
static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1;

    /* transpose data */
    t1   = fix1;
    fix1 = _mm_unpacklo_pd(fix1, fiy1); /* y0 x0 */
    fiy1 = _mm_unpackhi_pd(t1, fiy1);   /* y1 x1 */

    fix1 = _mm_add_pd(fix1, fiy1);
    fiz1 = _mm_add_sd( fiz1, _mm_unpackhi_pd(fiz1, fiz1 ));

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
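/*
 * Illustrative sketch (not part of the original header): at the end of a
 * j-loop the per-lane i-atom force accumulators are reduced and added both to
 * the force array and to the shift-force array used for the virial. f, fshift,
 * i_offset and is3 are hypothetical names.
 *
 *     gmx_mm_update_iforce_1atom_swizzle_pd(fix1, fiy1, fiz1,
 *                                           f + i_offset, fshift + is3);
 */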
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                              fptr, fshiftptr) \
    { \
        __m128d _t1, _t2; \
        GMX_MM_TRANSPOSE2_PD(fix1, fiy1); \
        GMX_MM_TRANSPOSE2_PD(fiz1, fix2); \
        GMX_MM_TRANSPOSE2_PD(fiy2, fiz2); \
        _t1  = fix3; \
        fix3 = _mm_unpacklo_pd(fix3, fiy3); \
        fiy3 = _mm_unpackhi_pd(_t1, fiy3); \
        fix1 = _mm_add_pd(fix1, fiy1); \
        fiz1 = _mm_add_pd(fiz1, fix2); \
        fiy2 = _mm_add_pd(fiy2, fiz2); \
        fix3 = _mm_add_pd(fix3, fiy3); \
        fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3, fiz3)); \
        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
        _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
        fix1 = _mm_add_pd(fix1, fix3); \
        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
        fix1 = _mm_add_pd(fix1, _t1); \
        _t2  = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
        fiz1 = _mm_add_sd(fiz1, fiz3); \
        fiz1 = _mm_add_sd(fiz1, _t2); \
        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    GMX_MM_TRANSPOSE2_PD(fix1, fiy1);
    GMX_MM_TRANSPOSE2_PD(fiz1, fix2);
    GMX_MM_TRANSPOSE2_PD(fiy2, fiz2);

    t1   = fix3;
    fix3 = _mm_unpacklo_pd(fix3, fiy3); /* y0 x0 */
    fiy3 = _mm_unpackhi_pd(t1, fiy3);   /* y1 x1 */

    fix1 = _mm_add_pd(fix1, fiy1);
    fiz1 = _mm_add_pd(fiz1, fix2);
    fiy2 = _mm_add_pd(fiy2, fiz2);

    fix3 = _mm_add_pd(fix3, fiy3);
    fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3, fiz3));

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                              fptr, fshiftptr) \
    { \
        __m128d _t1, _t2; \
        GMX_MM_TRANSPOSE2_PD(fix1, fiy1); \
        GMX_MM_TRANSPOSE2_PD(fiz1, fix2); \
        GMX_MM_TRANSPOSE2_PD(fiy2, fiz2); \
        GMX_MM_TRANSPOSE2_PD(fix3, fiy3); \
        GMX_MM_TRANSPOSE2_PD(fiz3, fix4); \
        GMX_MM_TRANSPOSE2_PD(fiy4, fiz4); \
        fix1 = _mm_add_pd(fix1, fiy1); \
        fiz1 = _mm_add_pd(fiz1, fix2); \
        fiy2 = _mm_add_pd(fiy2, fiz2); \
        fix3 = _mm_add_pd(fix3, fiy3); \
        fiz3 = _mm_add_pd(fiz3, fix4); \
        fiy4 = _mm_add_pd(fiy4, fiz4); \
        _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
        _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
        _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
        _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
        _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); \
        _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
        _t1  = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
        fix1 = _mm_add_pd(fix1, _t1); \
        _t2  = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
        fix3 = _mm_add_pd(fix3, _t2); \
        fix1 = _mm_add_pd(fix1, fix3); \
        fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
        fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
        fiz1 = _mm_add_sd(fiz1, fiz3); \
        _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
        _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    GMX_MM_TRANSPOSE2_PD(fix1, fiy1);
    GMX_MM_TRANSPOSE2_PD(fiz1, fix2);
    GMX_MM_TRANSPOSE2_PD(fiy2, fiz2);
    GMX_MM_TRANSPOSE2_PD(fix3, fiy3);
    GMX_MM_TRANSPOSE2_PD(fiz3, fix4);
    GMX_MM_TRANSPOSE2_PD(fiy4, fiz4);

    fix1 = _mm_add_pd(fix1, fiy1);
    fiz1 = _mm_add_pd(fiz1, fix2);
    fiy2 = _mm_add_pd(fiy2, fiz2);
    fix3 = _mm_add_pd(fix3, fiy3);
    fiz3 = _mm_add_pd(fiz3, fix4);
    fiy4 = _mm_add_pd(fiy4, fiz4);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif
static gmx_inline void
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_add_pd(pot1, _mm_unpackhi_pd(pot1, pot1));
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}
static gmx_inline void
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    GMX_MM_TRANSPOSE2_PD(pot1, pot2);
    pot1 = _mm_add_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}
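/*
 * Illustrative sketch (not part of the original header): reducing a two-lane
 * potential-energy accumulator at the end of the j-loop and adding it to the
 * output energy for the current energy group. Both lanes are summed into the
 * single double that the pointer argument addresses; vvdwsum, Vvdw and ggid
 * are hypothetical names.
 *
 *     gmx_mm_update_1pot_pd(vvdwsum, Vvdw + ggid);
 */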
#endif /* _kernelutil_x86_sse2_double_h_ */