/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_sse2_double_h_
#define _kernelutil_x86_sse2_double_h_

#include <math.h>

#include <emmintrin.h> /* SSE2 intrinsics: __m128d and the _mm_* operations used below */
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3)  _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))

#define gmx_mm_extract_epi32(x, imm)    _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
#define gmx_mm_castsi128_pd(a)          _mm_castsi128_pd(a)
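
/* In-register 2x2 transpose: afterwards row0 holds the two low elements
 * and row1 the two high elements of the original (row0, row1) pair. */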
#define GMX_MM_TRANSPOSE2_PD(row0, row1) {                 \
        __m128d __gmx_t1 = row0;                           \
        row0 = _mm_unpacklo_pd(row0, row1);                \
        row1 = _mm_unpackhi_pd(__gmx_t1, row1);            \
}
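
/* Nonzero if any element of a is less than the corresponding element of b */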
static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}
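
/* Squared distance dx*dx + dy*dy + dz*dz for two interactions at once */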
static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
}
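
/* Illustrative use (not part of this file), assuming ix/iy/iz and jx/jy/jz
 * already hold i- and j-particle coordinates in SIMD registers:
 *
 *     __m128d dx  = _mm_sub_pd(ix, jx);
 *     __m128d dy  = _mm_sub_pd(iy, jy);
 *     __m128d dz  = _mm_sub_pd(iz, jz);
 *     __m128d rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
 */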
/* Load a double value from 1-2 places, merge into xmm register */
static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}
static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}
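
/* Scatter the two elements of xmm1 back to two separate memory locations */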
static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d xmm1)
{
    __m128d t2;

    t2 = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}
static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}
/* Similar to store, but increments value in memory */
static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}
static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}
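
/* Gather interleaved (c6, c12) Lennard-Jones parameter pairs from one or two
 * particles and swizzle them into separate c6 and c12 registers */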
static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    __m128d t1, t2;

    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}
static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    *c6  = _mm_load_sd(p1);
    *c12 = _mm_load_sd(p1+1);
}
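
/* Add the periodic shift vector to 1, 3 or 4 coordinate triplets read from a
 * single particle pointer, and broadcast each resulting component */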
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1,
                                         __m128d * gmx_restrict y1,
                                         __m128d * gmx_restrict z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}
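
/* Load 1, 3 or 4 coordinate triplets from a single particle pointer into the
 * low element of each component register (no transpose needed) */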
static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x = _mm_load_sd(p1);
    *y = _mm_load_sd(p1+1);
    *z = _mm_load_sd(p1+2);
}
static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
}
static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
    *x4 = _mm_load_sd(p1+9);
    *y4 = _mm_load_sd(p1+10);
    *z4 = _mm_load_sd(p1+11);
}
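
/* Load coordinate triplets from two particle pointers and transpose so that
 * each output register holds one component from both particles */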
static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;
    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrB);
    t3 = _mm_load_sd(ptrA+2);
    t4 = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1 = t1;
    *y1 = t2;
    *z1 = _mm_unpacklo_pd(t3, t4);
}
static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    t7  = _mm_loadu_pd(ptrA+6);
    t8  = _mm_loadu_pd(ptrB+6);
    t9  = _mm_load_sd(ptrA+8);
    t10 = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    *x3 = t7;
    *y3 = t8;
    *z3 = _mm_unpacklo_pd(t9, t10);
}
static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;
    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrB);
    t3 = _mm_loadu_pd(ptrA+2);
    t4 = _mm_loadu_pd(ptrB+2);
    t5 = _mm_loadu_pd(ptrA+4);
    t6 = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    t1 = _mm_loadu_pd(ptrA+6);
    t2 = _mm_loadu_pd(ptrB+6);
    t3 = _mm_loadu_pd(ptrA+8);
    t4 = _mm_loadu_pd(ptrB+8);
    t5 = _mm_loadu_pd(ptrA+10);
    t6 = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3 = t1;
    *y3 = t2;
    *z3 = t3;
    *x4 = t4;
    *y4 = t5;
    *z4 = t6;
}
/* Routines to decrement an rvec in memory, typically used for j-particle force updates */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
                                         __m128d xy, __m128d z)
{
    __m128d t1, t2;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_pd(t1, xy);
    t2 = _mm_sub_sd(t2, z);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
}
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1 = _mm_load_sd(ptrA);
    t2 = _mm_load_sd(ptrA+1);
    t3 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_sd(t1, x1);
    t2 = _mm_sub_sd(t2, y1);
    t3 = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}
/* Real function for sane compilers */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_load_sd(ptrA+8);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1 = _mm_sub_pd(t1, x1);
    t2 = _mm_sub_pd(t2, z1);
    t3 = _mm_sub_pd(t3, y2);
    t4 = _mm_sub_pd(t4, x3);
    t5 = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}
static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_loadu_pd(ptrA+8);
    t6 = _mm_loadu_pd(ptrA+10);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    z3 = _mm_unpacklo_pd(z3, x4);
    y4 = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd( t1, x1 ));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2, z1 ));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3, y2 ));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( t4, x3 ));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);
    t3 = _mm_loadu_pd(ptrB);
    t4 = _mm_load_sd(ptrB+2);

    t5 = _mm_unpacklo_pd(x1, y1);
    t6 = _mm_unpackhi_pd(x1, y1);
    t7 = _mm_unpackhi_pd(z1, z1);

    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, z1);

    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}
/* Real function for sane compilers */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_load_sd(ptrA+8);
    t6  = _mm_loadu_pd(ptrB);
    t7  = _mm_loadu_pd(ptrB+2);
    t8  = _mm_loadu_pd(ptrB+4);
    t9  = _mm_loadu_pd(ptrB+6);
    t10 = _mm_load_sd(ptrB+8);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpackhi_pd(z3, z3);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_sd(t5, z3);

    t6  = _mm_sub_pd(t6, tB);
    t7  = _mm_sub_pd(t7, tD);
    t8  = _mm_sub_pd(t8, tF);
    t9  = _mm_sub_pd(t9, tH);
    t10 = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}
static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_loadu_pd(ptrA+8);
    t6  = _mm_loadu_pd(ptrA+10);
    t7  = _mm_loadu_pd(ptrB);
    t8  = _mm_loadu_pd(ptrB+2);
    t9  = _mm_loadu_pd(ptrB+4);
    t10 = _mm_loadu_pd(ptrB+6);
    t11 = _mm_loadu_pd(ptrB+8);
    t12 = _mm_loadu_pd(ptrB+10);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpacklo_pd(z3, x4);
    tJ  = _mm_unpackhi_pd(z3, x4);
    tK  = _mm_unpacklo_pd(y4, z4);
    tL  = _mm_unpackhi_pd(y4, z4);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_pd(t5, tI);
    t6  = _mm_sub_pd(t6, tK);

    t7  = _mm_sub_pd(t7, tB);
    t8  = _mm_sub_pd(t8, tD);
    t9  = _mm_sub_pd(t9, tF);
    t10 = _mm_sub_pd(t10, tH);
    t11 = _mm_sub_pd(t11, tJ);
    t12 = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_storeu_pd(ptrA+8, t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB, t7);
    _mm_storeu_pd(ptrB+2, t8);
    _mm_storeu_pd(ptrB+4, t9);
    _mm_storeu_pd(ptrB+6, t10);
    _mm_storeu_pd(ptrB+8, t11);
    _mm_storeu_pd(ptrB+10, t12);
}
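
/* Accumulate transposed i-particle forces into the force array fptr, and add
 * the total force on the i-atoms to the shift force at fshiftptr */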
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1;

    /* transpose data */
    t1   = fix1;
    fix1 = _mm_unpacklo_pd(fix1, fiy1); /* y0 x0 */
    fiy1 = _mm_unpackhi_pd(t1, fiy1);   /* y1 x1 */

    fix1 = _mm_add_pd(fix1, fiy1);
    fiz1 = _mm_add_sd( fiz1, _mm_unpackhi_pd(fiz1, fiz1 ));

    _mm_storeu_pd( fptr,   _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2,  _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr,  _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    /* transpose data */
    GMX_MM_TRANSPOSE2_PD(fix1, fiy1);
    GMX_MM_TRANSPOSE2_PD(fiz1, fix2);
    GMX_MM_TRANSPOSE2_PD(fiy2, fiz2);
    t1   = fix3;
    fix3 = _mm_unpacklo_pd(fix3, fiy3); /* y0 x0 */
    fiy3 = _mm_unpackhi_pd(t1, fiy3);   /* y1 x1 */

    fix1 = _mm_add_pd(fix1, fiy1);
    fiz1 = _mm_add_pd(fiz1, fix2);
    fiy2 = _mm_add_pd(fiy2, fiz2);

    fix3 = _mm_add_pd(fix3, fiy3);
    fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3, fiz3));

    _mm_storeu_pd( fptr,   _mm_add_pd( _mm_loadu_pd(fptr),   fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8,  _mm_add_sd( _mm_load_sd(fptr+8),  fiz3 ));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd( fshiftptr,  _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    /* transpose data */
    GMX_MM_TRANSPOSE2_PD(fix1, fiy1);
    GMX_MM_TRANSPOSE2_PD(fiz1, fix2);
    GMX_MM_TRANSPOSE2_PD(fiy2, fiz2);
    GMX_MM_TRANSPOSE2_PD(fix3, fiy3);
    GMX_MM_TRANSPOSE2_PD(fiz3, fix4);
    GMX_MM_TRANSPOSE2_PD(fiy4, fiz4);

    fix1 = _mm_add_pd(fix1, fiy1);
    fiz1 = _mm_add_pd(fiz1, fix2);
    fiy2 = _mm_add_pd(fiy2, fiz2);
    fix3 = _mm_add_pd(fix3, fiy3);
    fiz3 = _mm_add_pd(fiz3, fix4);
    fiy4 = _mm_add_pd(fiy4, fiz4);

    _mm_storeu_pd( fptr,    _mm_add_pd( _mm_loadu_pd(fptr),    fix1 ));
    _mm_storeu_pd( fptr+2,  _mm_add_pd( _mm_loadu_pd(fptr+2),  fiz1 ));
    _mm_storeu_pd( fptr+4,  _mm_add_pd( _mm_loadu_pd(fptr+4),  fiy2 ));
    _mm_storeu_pd( fptr+6,  _mm_add_pd( _mm_loadu_pd(fptr+6),  fix3 ));
    _mm_storeu_pd( fptr+8,  _mm_add_pd( _mm_loadu_pd(fptr+8),  fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr,  _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
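
/* Horizontally sum a SIMD potential accumulator and add the result to one
 * (or, for the two-pointer variant, two) energy values in memory */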
static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_add_pd(pot1, _mm_unpackhi_pd(pot1, pot1));
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}
static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    GMX_MM_TRANSPOSE2_PD(pot1, pot2);
    pot1 = _mm_add_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}
#endif /* _kernelutil_x86_sse2_double_h_ */