/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_128_fma_double_h_
#define _kernelutil_x86_avx_128_fma_double_h_

#include <math.h>

#include <immintrin.h>
#ifdef _MSC_VER
#    include <intrin.h>
#else
#    include <x86intrin.h>
#endif

#include "config.h"

#define gmx_mm_castsi128_pd   _mm_castsi128_pd
#define gmx_mm_extract_epi32  _mm_extract_epi32

#define GMX_MM_TRANSPOSE2_PD(row0, row1) {                 \
        __m128d __gmx_t1 = row0;                           \
        row0           = _mm_unpacklo_pd(row0, row1);      \
        row1           = _mm_unpackhi_pd(__gmx_t1, row1);  \
}

static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}


static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
}

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3)  _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
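
/*
 * Illustrative sketch (not part of the kernel API, kept out of compilation):
 * how gmx_mm_calc_rsq_pd and gmx_mm_sum4_pd are typically combined inside a
 * kernel. The variable names (dx00 ... r2_3) are hypothetical and only show
 * the data flow; each __m128d carries values for two j particles.
 */
#if 0
static gmx_inline __m128d
example_rsq_accumulate(__m128d dx00, __m128d dy00, __m128d dz00,
                       __m128d dx01, __m128d dy01, __m128d dz01,
                       __m128d dx02, __m128d dy02, __m128d dz02,
                       __m128d dx03, __m128d dy03, __m128d dz03)
{
    /* Squared distance for each of four i-j interactions */
    __m128d r2_0 = gmx_mm_calc_rsq_pd(dx00, dy00, dz00);
    __m128d r2_1 = gmx_mm_calc_rsq_pd(dx01, dy01, dz01);
    __m128d r2_2 = gmx_mm_calc_rsq_pd(dx02, dy02, dz02);
    __m128d r2_3 = gmx_mm_calc_rsq_pd(dx03, dy03, dz03);

    /* Plain sum of the four partial results */
    return gmx_mm_sum4_pd(r2_0, r2_1, r2_2, r2_3);
}
#endif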



/* Load a double value from 1-2 places, merge into xmm register */


static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}

static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}


static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d               xmm1)
{
    __m128d t2;

    t2       = _mm_unpackhi_pd(xmm1, xmm1);
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t2);
}

static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA, xmm1);
}


/* Similar to store, but increments value in memory */
static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1, xmm1);
    xmm1 = _mm_add_sd(xmm1, _mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1, _mm_load_sd(ptrB));
    _mm_store_sd(ptrA, xmm1);
    _mm_store_sd(ptrB, t1);
}

static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp, xmm1);
    gmx_mm_store_1real_pd(ptrA, tmp);
}
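
/*
 * Illustrative sketch (hypothetical, kept out of compilation): accumulating a
 * pair of per-atom scalars back to memory with the swizzled load/store
 * helpers above. ptrA/ptrB would point at two different atoms; the register
 * holds the contribution for ptrA in its low element and ptrB in its high one.
 */
#if 0
static gmx_inline void
example_accumulate_2real(double * ptrA, double * ptrB, __m128d contribution)
{
    /* Read-modify-write explicitly ... */
    __m128d current = gmx_mm_load_2real_swizzle_pd(ptrA, ptrB);
    current         = _mm_add_pd(current, contribution);
    gmx_mm_store_2real_swizzle_pd(ptrA, ptrB, current);

    /* ... or, equivalently, use the increment helper directly:
     * gmx_mm_increment_2real_swizzle_pd(ptrA, ptrB, contribution);
     */
}
#endif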



static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    __m128d t1, t2;

    /* Each pointer refers to a contiguous c6/c12 pair; unaligned loads keep this safe */
    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1, t2);
    *c12 = _mm_unpackhi_pd(t1, t2);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict      c6,
                             __m128d * gmx_restrict      c12)
{
    *c6     = _mm_load_sd(p1);
    *c12    = _mm_load_sd(p1+1);
}
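
/*
 * Illustrative sketch (hypothetical, kept out of compilation): fetching the
 * c6/c12 Lennard-Jones parameters for two j atoms from a parameter table.
 * "vdwparam" and the 2*type indexing are placeholders for whatever layout
 * the calling kernel actually uses.
 */
#if 0
static gmx_inline void
example_load_lj_params(const double * vdwparam, int typeA, int typeB,
                       __m128d * c6, __m128d * c12)
{
    /* Assumes each atom type stores its (c6, c12) pair back to back */
    gmx_mm_load_2pair_swizzle_pd(vdwparam + 2*typeA,
                                 vdwparam + 2*typeB,
                                 c6, c12);
}
#endif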


static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict      x1,
                                         __m128d * gmx_restrict      y1,
                                         __m128d * gmx_restrict      z1)
{
    __m128d mem_xy, mem_z, mem_sxy, mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy, mem_sxy);
    mem_z   = _mm_add_pd(mem_z, mem_sz);

    *x1  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(0, 0));
    *y1  = _mm_shuffle_pd(mem_xy, mem_xy, _MM_SHUFFLE2(1, 1));
    *z1  = _mm_shuffle_pd(mem_z, mem_z, _MM_SHUFFLE2(0, 0));
}


static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_sd(t5, sz);

    *x1  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
}


static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
                                         const double * gmx_restrict xyz,
                                         __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                         __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                         __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                         __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz, sxy, _MM_SHUFFLE2(0, 0));
    syz = _mm_shuffle_pd(sxy, sz, _MM_SHUFFLE2(0, 1));

    t1  = _mm_add_pd(t1, sxy);
    t2  = _mm_add_pd(t2, szx);
    t3  = _mm_add_pd(t3, syz);
    t4  = _mm_add_pd(t4, sxy);
    t5  = _mm_add_pd(t5, szx);
    t6  = _mm_add_pd(t6, syz);

    *x1  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(0, 0));
    *y1  = _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1));
    *z1  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(0, 0));
    *x2  = _mm_shuffle_pd(t2, t2, _MM_SHUFFLE2(1, 1));
    *y2  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(0, 0));
    *z2  = _mm_shuffle_pd(t3, t3, _MM_SHUFFLE2(1, 1));
    *x3  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(0, 0));
    *y3  = _mm_shuffle_pd(t4, t4, _MM_SHUFFLE2(1, 1));
    *z3  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(0, 0));
    *x4  = _mm_shuffle_pd(t5, t5, _MM_SHUFFLE2(1, 1));
    *y4  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(0, 0));
    *z4  = _mm_shuffle_pd(t6, t6, _MM_SHUFFLE2(1, 1));
}



static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x            = _mm_load_sd(p1);
    *y            = _mm_load_sd(p1+1);
    *z            = _mm_load_sd(p1+2);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1            = _mm_load_sd(p1);
    *y1            = _mm_load_sd(p1+1);
    *z1            = _mm_load_sd(p1+2);
    *x2            = _mm_load_sd(p1+3);
    *y2            = _mm_load_sd(p1+4);
    *z2            = _mm_load_sd(p1+5);
    *x3            = _mm_load_sd(p1+6);
    *y3            = _mm_load_sd(p1+7);
    *z3            = _mm_load_sd(p1+8);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1            = _mm_load_sd(p1);
    *y1            = _mm_load_sd(p1+1);
    *z1            = _mm_load_sd(p1+2);
    *x2            = _mm_load_sd(p1+3);
    *y2            = _mm_load_sd(p1+4);
    *z2            = _mm_load_sd(p1+5);
    *x3            = _mm_load_sd(p1+6);
    *y3            = _mm_load_sd(p1+7);
    *z3            = _mm_load_sd(p1+8);
    *x4            = _mm_load_sd(p1+9);
    *y4            = _mm_load_sd(p1+10);
    *z4            = _mm_load_sd(p1+11);
}


static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1, t2, t3, t4;
    t1           = _mm_loadu_pd(ptrA);
    t2           = _mm_loadu_pd(ptrB);
    t3           = _mm_load_sd(ptrA+2);
    t4           = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    *x1          = t1;
    *y1          = t2;
    *z1          = _mm_unpacklo_pd(t3, t4);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    t1           = _mm_loadu_pd(ptrA);
    t2           = _mm_loadu_pd(ptrB);
    t3           = _mm_loadu_pd(ptrA+2);
    t4           = _mm_loadu_pd(ptrB+2);
    t5           = _mm_loadu_pd(ptrA+4);
    t6           = _mm_loadu_pd(ptrB+4);
    t7           = _mm_loadu_pd(ptrA+6);
    t8           = _mm_loadu_pd(ptrB+6);
    t9           = _mm_load_sd(ptrA+8);
    t10          = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    GMX_MM_TRANSPOSE2_PD(t7, t8);
    *x1          = t1;
    *y1          = t2;
    *z1          = t3;
    *x2          = t4;
    *y2          = t5;
    *z2          = t6;
    *x3          = t7;
    *y3          = t8;
    *z3          = _mm_unpacklo_pd(t9, t10);
}


static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1, t2, t3, t4, t5, t6;
    t1           = _mm_loadu_pd(ptrA);
    t2           = _mm_loadu_pd(ptrB);
    t3           = _mm_loadu_pd(ptrA+2);
    t4           = _mm_loadu_pd(ptrB+2);
    t5           = _mm_loadu_pd(ptrA+4);
    t6           = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x1          = t1;
    *y1          = t2;
    *z1          = t3;
    *x2          = t4;
    *y2          = t5;
    *z2          = t6;
    t1           = _mm_loadu_pd(ptrA+6);
    t2           = _mm_loadu_pd(ptrB+6);
    t3           = _mm_loadu_pd(ptrA+8);
    t4           = _mm_loadu_pd(ptrB+8);
    t5           = _mm_loadu_pd(ptrA+10);
    t6           = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1, t2);
    GMX_MM_TRANSPOSE2_PD(t3, t4);
    GMX_MM_TRANSPOSE2_PD(t5, t6);
    *x3          = t1;
    *y3          = t2;
    *z3          = t3;
    *x4          = t4;
    *y4          = t5;
    *z4          = t6;
}

/* Routines to decrement rvec in memory, typically used for j particle force updates */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3;

    t1           = _mm_load_sd(ptrA);
    t2           = _mm_load_sd(ptrA+1);
    t3           = _mm_load_sd(ptrA+2);

    t1           = _mm_sub_sd(t1, x1);
    t2           = _mm_sub_sd(t2, y1);
    t3           = _mm_sub_sd(t3, z1);
    _mm_store_sd(ptrA, t1);
    _mm_store_sd(ptrA+1, t2);
    _mm_store_sd(ptrA+2, t3);
}
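
/*
 * Illustrative sketch (hypothetical names, kept out of compilation): a typical
 * Newton's-third-law update, where the pair force just added to the i atom is
 * subtracted from the j atom's entry in the force array f. "jidx" stands for
 * 3*j, the start of that atom's xyz triplet.
 */
#if 0
static gmx_inline void
example_update_j_force(double * f, int jidx,
                       __m128d fjx, __m128d fjy, __m128d fjz)
{
    /* Subtract the pair force (low element of each register) from atom j */
    gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f + jidx, fjx, fjy, fjz);
}
#endif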



static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_load_sd(ptrA+8);

    x1          = _mm_unpacklo_pd(x1, y1);
    z1          = _mm_unpacklo_pd(z1, x2);
    y2          = _mm_unpacklo_pd(y2, z2);
    x3          = _mm_unpacklo_pd(x3, y3);
    /* nothing to be done for z3 */

    t1          = _mm_sub_pd(t1, x1);
    t2          = _mm_sub_pd(t2, z1);
    t3          = _mm_sub_pd(t3, y2);
    t4          = _mm_sub_pd(t4, x3);
    t5          = _mm_sub_sd(t5, z3);
    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
}


static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_loadu_pd(ptrA+8);
    t6          = _mm_loadu_pd(ptrA+10);

    x1          = _mm_unpacklo_pd(x1, y1);
    z1          = _mm_unpacklo_pd(z1, x2);
    y2          = _mm_unpacklo_pd(y2, z2);
    x3          = _mm_unpacklo_pd(x3, y3);
    z3          = _mm_unpacklo_pd(z3, x4);
    y4          = _mm_unpacklo_pd(y4, z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd( t1, x1 ));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2, z1 ));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3, y2 ));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( t4, x3 ));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5, z3 ));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}


static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_load_sd(ptrA+2);
    t3          = _mm_loadu_pd(ptrB);
    t4          = _mm_load_sd(ptrB+2);

    t5          = _mm_unpacklo_pd(x1, y1);
    t6          = _mm_unpackhi_pd(x1, y1);
    t7          = _mm_unpackhi_pd(z1, z1);

    t1          = _mm_sub_pd(t1, t5);
    t2          = _mm_sub_sd(t2, z1);

    t3          = _mm_sub_pd(t3, t6);
    t4          = _mm_sub_sd(t4, t7);

    _mm_storeu_pd(ptrA, t1);
    _mm_store_sd(ptrA+2, t2);
    _mm_storeu_pd(ptrB, t3);
    _mm_store_sd(ptrB+2, t4);
}



static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_load_sd(ptrA+8);
    t6          = _mm_loadu_pd(ptrB);
    t7          = _mm_loadu_pd(ptrB+2);
    t8          = _mm_loadu_pd(ptrB+4);
    t9          = _mm_loadu_pd(ptrB+6);
    t10         = _mm_load_sd(ptrB+8);

    tA          = _mm_unpacklo_pd(x1, y1);
    tB          = _mm_unpackhi_pd(x1, y1);
    tC          = _mm_unpacklo_pd(z1, x2);
    tD          = _mm_unpackhi_pd(z1, x2);
    tE          = _mm_unpacklo_pd(y2, z2);
    tF          = _mm_unpackhi_pd(y2, z2);
    tG          = _mm_unpacklo_pd(x3, y3);
    tH          = _mm_unpackhi_pd(x3, y3);
    tI          = _mm_unpackhi_pd(z3, z3);

    t1          = _mm_sub_pd(t1, tA);
    t2          = _mm_sub_pd(t2, tC);
    t3          = _mm_sub_pd(t3, tE);
    t4          = _mm_sub_pd(t4, tG);
    t5          = _mm_sub_sd(t5, z3);

    t6          = _mm_sub_pd(t6, tB);
    t7          = _mm_sub_pd(t7, tD);
    t8          = _mm_sub_pd(t8, tF);
    t9          = _mm_sub_pd(t9, tH);
    t10         = _mm_sub_sd(t10, tI);

    _mm_storeu_pd(ptrA, t1);
    _mm_storeu_pd(ptrA+2, t2);
    _mm_storeu_pd(ptrA+4, t3);
    _mm_storeu_pd(ptrA+6, t4);
    _mm_store_sd(ptrA+8, t5);
    _mm_storeu_pd(ptrB, t6);
    _mm_storeu_pd(ptrB+2, t7);
    _mm_storeu_pd(ptrB+4, t8);
    _mm_storeu_pd(ptrB+6, t9);
    _mm_store_sd(ptrB+8, t10);
}


static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    __m128d tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_loadu_pd(ptrA+8);
    t6          = _mm_loadu_pd(ptrA+10);
    t7          = _mm_loadu_pd(ptrB);
    t8          = _mm_loadu_pd(ptrB+2);
    t9          = _mm_loadu_pd(ptrB+4);
    t10         = _mm_loadu_pd(ptrB+6);
    t11         = _mm_loadu_pd(ptrB+8);
    t12         = _mm_loadu_pd(ptrB+10);

    tA          = _mm_unpacklo_pd(x1, y1);
    tB          = _mm_unpackhi_pd(x1, y1);
    tC          = _mm_unpacklo_pd(z1, x2);
    tD          = _mm_unpackhi_pd(z1, x2);
    tE          = _mm_unpacklo_pd(y2, z2);
    tF          = _mm_unpackhi_pd(y2, z2);
    tG          = _mm_unpacklo_pd(x3, y3);
    tH          = _mm_unpackhi_pd(x3, y3);
    tI          = _mm_unpacklo_pd(z3, x4);
    tJ          = _mm_unpackhi_pd(z3, x4);
    tK          = _mm_unpacklo_pd(y4, z4);
    tL          = _mm_unpackhi_pd(y4, z4);

    t1          = _mm_sub_pd(t1, tA);
    t2          = _mm_sub_pd(t2, tC);
    t3          = _mm_sub_pd(t3, tE);
    t4          = _mm_sub_pd(t4, tG);
    t5          = _mm_sub_pd(t5, tI);
    t6          = _mm_sub_pd(t6, tK);

    t7          = _mm_sub_pd(t7, tB);
    t8          = _mm_sub_pd(t8, tD);
    t9          = _mm_sub_pd(t9, tF);
    t10         = _mm_sub_pd(t10, tH);
    t11         = _mm_sub_pd(t11, tJ);
    t12         = _mm_sub_pd(t12, tL);

    _mm_storeu_pd(ptrA,    t1);
    _mm_storeu_pd(ptrA+2,  t2);
    _mm_storeu_pd(ptrA+4,  t3);
    _mm_storeu_pd(ptrA+6,  t4);
    _mm_storeu_pd(ptrA+8,  t5);
    _mm_storeu_pd(ptrA+10, t6);
    _mm_storeu_pd(ptrB,    t7);
    _mm_storeu_pd(ptrB+2,  t8);
    _mm_storeu_pd(ptrB+4,  t9);
    _mm_storeu_pd(ptrB+6,  t10);
    _mm_storeu_pd(ptrB+8,  t11);
    _mm_storeu_pd(ptrB+10, t12);
}


static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fiz1);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}


static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fiz3);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));

    fix1 = _mm_add_pd(fix1, fix3);
    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1));
    fiz1 = _mm_add_sd(fiz1, fiz3);
    fiz1 = _mm_add_sd(fiz1, t2); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}


static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1, t2;

    fix1 = _mm_hadd_pd(fix1, fiy1);
    fiz1 = _mm_hadd_pd(fiz1, fix2);
    fiy2 = _mm_hadd_pd(fiy2, fiz2);
    fix3 = _mm_hadd_pd(fix3, fiy3);
    fiz3 = _mm_hadd_pd(fiz3, fix4);
    fiy4 = _mm_hadd_pd(fiy4, fiz4);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));
    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1   = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1));
    fix1 = _mm_add_pd(fix1, t1);
    t2   = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1));
    fix3 = _mm_add_pd(fix3, t2);
    fix1 = _mm_add_pd(fix1, fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4));
    fiz1 = _mm_add_sd(fiz1, fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}


static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_pd(pot1, pot1);
    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_pd(pot1, pot2);
    pot2 = _mm_unpackhi_pd(pot1, pot1);

    _mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
    _mm_store_sd(ptrB, _mm_add_sd(pot2, _mm_load_sd(ptrB)));
}


#endif /* _kernelutil_x86_avx_128_fma_double_h_ */