/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_sse4_1_double_h_
#define _kernelutil_x86_sse4_1_double_h_

/* SSE4.1 intrinsics. gmx_inline, gmx_simdcall and gmx_restrict are assumed
 * to be provided by the GROMACS build headers included ahead of this file.
 */
#include <smmintrin.h>

#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
#define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)

#define GMX_MM_TRANSPOSE2_PD(row0, row1) {                    \
        __m128d __gmx_t1 = row0;                              \
        row0             = _mm_unpacklo_pd(row0, row1);       \
        row1             = _mm_unpackhi_pd(__gmx_t1, row1);   \
}
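/* Example: with row0 = {a0, a1} and row1 = {b0, b1}, the macro leaves
 * row0 = {a0, b0} and row1 = {a1, b1}, i.e. a 2x2 in-register transpose.
 */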

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3) _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))

static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}
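/* _mm_movemask_pd collects the two comparison sign bits, so the result is
 * non-zero if a < b holds in at least one lane.
 */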

static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
}
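/* Computes dx*dx + dy*dy + dz*dz independently in each lane, i.e. the
 * squared distance for two interaction pairs at once.
 */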

/* Load a double value from 1-2 places, merge into xmm register */
static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}

static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}

static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d xmm1)
{
    __m128d t2;

    t2 = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}

static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}

/* Similar to store, but increments the value in memory */
static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}

static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    __m128d t1, t2;

    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    *c6  = _mm_load_sd(p1);
    *c12 = _mm_load_sd(p1+1);
}
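/* Each parameter pair is stored as two consecutive doubles {c6, c12} (the
 * Lennard-Jones coefficients); the two-pointer variant above unpacks them so
 * that pointer A ends up in lane 0 and pointer B in lane 1.
 */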

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1,
                                         __m128d * gmx_restrict y1,
                                         __m128d * gmx_restrict z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}
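/* Each shifted coordinate is broadcast to both lanes, so the i-atom position
 * can be combined directly with two j-atom positions at a time.
 */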

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}
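/* The nine coordinates are loaded as pairs {x1,y1} {z1,x2} {y2,z2} {x3,y3}
 * plus a final scalar z3, so the shift vector has to be rotated to match:
 * szx = {shz, shx} and syz = {shy, shz} built above line up with that pattern.
 */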

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    *x1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1 = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2 = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2 = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3 = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4 = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4 = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}
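/* Same rotation trick as the 3-rvec version; with 12 coordinates the sxy,
 * szx, syz sequence repeats exactly twice and no scalar tail is needed.
 */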

static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x = _mm_load_sd(p1);
    *y = _mm_load_sd(p1+1);
    *z = _mm_load_sd(p1+2);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1 = _mm_load_sd(p1);
    *y1 = _mm_load_sd(p1+1);
    *z1 = _mm_load_sd(p1+2);
    *x2 = _mm_load_sd(p1+3);
    *y2 = _mm_load_sd(p1+4);
    *z2 = _mm_load_sd(p1+5);
    *x3 = _mm_load_sd(p1+6);
    *y3 = _mm_load_sd(p1+7);
    *z3 = _mm_load_sd(p1+8);
    *x4 = _mm_load_sd(p1+9);
    *y4 = _mm_load_sd(p1+10);
    *z4 = _mm_load_sd(p1+11);
}
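/* The single-pointer loaders use _mm_load_sd, which fills lane 0 and zeroes
 * lane 1; they are presumably meant for code paths where only a single
 * j atom remains.
 */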

static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;
    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_load_sd(ptrA+2);
    t4  = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1 = t1;
    *y1 = t2;
    *z1 = _mm_unpacklo_pd(t3, t4);
}
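/* After the transpose, lane 0 of every output register holds ptrA's
 * component and lane 1 holds ptrB's, matching the two-j-atom layout used
 * throughout this file.
 */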

static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    t7  = _mm_loadu_pd(ptrA+6);
    t8  = _mm_loadu_pd(ptrB+6);
    t9  = _mm_load_sd(ptrA+8);
    t10 = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    *x3 = t7;
    *y3 = t8;
    *z3 = _mm_unpacklo_pd(t9, t10);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;
    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrB);
    t3  = _mm_loadu_pd(ptrA+2);
    t4  = _mm_loadu_pd(ptrB+2);
    t5  = _mm_loadu_pd(ptrA+4);
    t6  = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    *y2 = t5;
    *z2 = t6;
    t1  = _mm_loadu_pd(ptrA+6);
    t2  = _mm_loadu_pd(ptrB+6);
    t3  = _mm_loadu_pd(ptrA+8);
    t4  = _mm_loadu_pd(ptrB+8);
    t5  = _mm_loadu_pd(ptrA+10);
    t6  = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3 = t1;
    *y3 = t2;
    *z3 = t3;
    *x4 = t4;
    *y4 = t5;
    *z4 = t6;
}

/* Routines to decrement an rvec in memory, typically used for j-particle force updates */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
                                         __m128d xy, __m128d z)
{
    __m128d t1, t2;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_pd(t1, xy);
    t2 = _mm_sub_sd(t2, z);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1 = _mm_load_sd(ptrA);
    t2 = _mm_load_sd(ptrA+1);
    t3 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_sd(t1, x1);
    t2 = _mm_sub_sd(t2, y1);
    t3 = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_load_sd(ptrA+8);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1 = _mm_sub_pd(t1, x1);
    t2 = _mm_sub_pd(t2, z1);
    t3 = _mm_sub_pd(t3, y2);
    t4 = _mm_sub_pd(t4, x3);
    t5 = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}
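/* The unpacklo calls re-pair the lane-0 force components into the memory
 * order x1 y1 | z1 x2 | y2 z2 | x3 y3 | z3, so five loads, subtractions and
 * stores cover all nine doubles.
 */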

static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_loadu_pd(ptrA+2);
    t3 = _mm_loadu_pd(ptrA+4);
    t4 = _mm_loadu_pd(ptrA+6);
    t5 = _mm_loadu_pd(ptrA+8);
    t6 = _mm_loadu_pd(ptrA+10);

    x1 = _mm_unpacklo_pd(x1, y1);
    z1 = _mm_unpacklo_pd(z1, x2);
    y2 = _mm_unpacklo_pd(y2, z2);
    x3 = _mm_unpacklo_pd(x3, y3);
    z3 = _mm_unpacklo_pd(z3, x4);
    y4 = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd( t1, x1 ));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2, z1 ));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3, y2 ));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( t4, x3 ));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);
    t3 = _mm_loadu_pd(ptrB);
    t4 = _mm_load_sd(ptrB+2);

    t5 = _mm_unpacklo_pd(x1, y1);
    t6 = _mm_unpackhi_pd(x1, y1);
    t7 = _mm_unpackhi_pd(z1, z1);

    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, z1);

    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}
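/* In the two-pointer variants lane 0 of x1/y1/z1 belongs to atom A and
 * lane 1 to atom B; unpacklo gathers the A components and unpackhi the B
 * components before the subtractions.
 */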

static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_load_sd(ptrA+8);
    t6  = _mm_loadu_pd(ptrB);
    t7  = _mm_loadu_pd(ptrB+2);
    t8  = _mm_loadu_pd(ptrB+4);
    t9  = _mm_loadu_pd(ptrB+6);
    t10 = _mm_load_sd(ptrB+8);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpackhi_pd(z3, z3);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_sd(t5, z3);

    t6  = _mm_sub_pd(t6, tB);
    t7  = _mm_sub_pd(t7, tD);
    t8  = _mm_sub_pd(t8, tF);
    t9  = _mm_sub_pd(t9, tH);
    t10 = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1  = _mm_loadu_pd(ptrA);
    t2  = _mm_loadu_pd(ptrA+2);
    t3  = _mm_loadu_pd(ptrA+4);
    t4  = _mm_loadu_pd(ptrA+6);
    t5  = _mm_loadu_pd(ptrA+8);
    t6  = _mm_loadu_pd(ptrA+10);
    t7  = _mm_loadu_pd(ptrB);
    t8  = _mm_loadu_pd(ptrB+2);
    t9  = _mm_loadu_pd(ptrB+4);
    t10 = _mm_loadu_pd(ptrB+6);
    t11 = _mm_loadu_pd(ptrB+8);
    t12 = _mm_loadu_pd(ptrB+10);

    tA  = _mm_unpacklo_pd(x1, y1);
    tB  = _mm_unpackhi_pd(x1, y1);
    tC  = _mm_unpacklo_pd(z1, x2);
    tD  = _mm_unpackhi_pd(z1, x2);
    tE  = _mm_unpacklo_pd(y2, z2);
    tF  = _mm_unpackhi_pd(y2, z2);
    tG  = _mm_unpacklo_pd(x3, y3);
    tH  = _mm_unpackhi_pd(x3, y3);
    tI  = _mm_unpacklo_pd(z3, x4);
    tJ  = _mm_unpackhi_pd(z3, x4);
    tK  = _mm_unpacklo_pd(y4, z4);
    tL  = _mm_unpackhi_pd(y4, z4);

    t1  = _mm_sub_pd(t1, tA);
    t2  = _mm_sub_pd(t2, tC);
    t3  = _mm_sub_pd(t3, tE);
    t4  = _mm_sub_pd(t4, tG);
    t5  = _mm_sub_pd(t5, tI);
    t6  = _mm_sub_pd(t6, tK);

    t7  = _mm_sub_pd(t7, tB);
    t8  = _mm_sub_pd(t8, tD);
    t9  = _mm_sub_pd(t9, tF);
    t10 = _mm_sub_pd(t10, tH);
    t11 = _mm_sub_pd(t11, tJ);
    t12 = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_storeu_pd(ptrA+8, t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB, t7);
    _mm_storeu_pd(ptrB+2, t8);
    _mm_storeu_pd(ptrB+4, t9);
    _mm_storeu_pd(ptrB+6, t10);
    _mm_storeu_pd(ptrB+8, t11);
    _mm_storeu_pd(ptrB+10, t12);
}

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fiz1);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
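/* _mm_hadd_pd sums the two per-lane partial forces: fix1 becomes
 * {sum(fx), sum(fy)} and fiz1 {sum(fz), sum(fz)}; the same values are
 * accumulated into both the force array and the shift-force array.
 */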

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fiz3);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fix4);
    fiy4 = _mm_hadd_pd(fiy4, fiz4);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_pd(pot1, pot1);
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}
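/* After the hadd, lane 0 holds the reduced pot1 and lane 1 the reduced pot2;
 * the unpackhi copies the latter down to lane 0 so both scalar stores can
 * use _mm_add_sd.
 */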

#endif /* _kernelutil_x86_sse4_1_double_h_ */