Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / kernelutil_x86_sse2_double.h
/*
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifndef _kernelutil_x86_sse2_double_h_
#define _kernelutil_x86_sse2_double_h_

#include <math.h>

#include "gmx_x86_sse2.h"

#include <stdio.h>


/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0,t1,t2,t3)  _mm_add_pd(_mm_add_pd(t0,t1),_mm_add_pd(t2,t3))

static int
gmx_mm_any_lt(__m128d a, __m128d b)
{
    return _mm_movemask_pd(_mm_cmplt_pd(a,b));
}

static gmx_inline __m128d
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx), _mm_mul_pd(dy,dy) ), _mm_mul_pd(dz,dz) );
}
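
/* Illustrative sketch (not part of the original kernel API; the name and the
 * rcut parameter are hypothetical): how the helpers above are typically
 * combined to test the two interaction distances held in one xmm register
 * against a cutoff.
 */
static gmx_inline int
gmx_mm_example_within_cutoff_pd(__m128d dx, __m128d dy, __m128d dz, double rcut)
{
    __m128d rsq   = gmx_mm_calc_rsq_pd(dx,dy,dz);
    __m128d cutsq = _mm_set1_pd(rcut*rcut);

    /* Nonzero if at least one of the two SIMD lanes is within the cutoff */
    return gmx_mm_any_lt(rsq,cutsq);
}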


/* Load a double value from 1-2 places, merge into xmm register */
static gmx_inline __m128d
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
                             const double * gmx_restrict ptrB)
{
    return _mm_unpacklo_pd(_mm_load_sd(ptrA),_mm_load_sd(ptrB));
}

static gmx_inline __m128d
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
    return _mm_load_sd(ptrA);
}


static gmx_inline void
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d xmm1)
{
    __m128d t2;

    t2       = _mm_unpackhi_pd(xmm1,xmm1);
    _mm_store_sd(ptrA,xmm1);
    _mm_store_sd(ptrB,t2);
}

static gmx_inline void
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    _mm_store_sd(ptrA,xmm1);
}


/* Similar to store, but increments value in memory */
static gmx_inline void
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
                                  double * gmx_restrict ptrB, __m128d xmm1)
{
    __m128d t1;

    t1   = _mm_unpackhi_pd(xmm1,xmm1);
    xmm1 = _mm_add_sd(xmm1,_mm_load_sd(ptrA));
    t1   = _mm_add_sd(t1,_mm_load_sd(ptrB));
    _mm_store_sd(ptrA,xmm1);
    _mm_store_sd(ptrB,t1);
}

static gmx_inline void
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
    __m128d tmp;

    tmp = gmx_mm_load_1real_pd(ptrA);
    tmp = _mm_add_sd(tmp,xmm1);
    gmx_mm_store_1real_pd(ptrA,tmp);
}
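
/* Illustrative sketch (hypothetical helper, not part of the original kernels):
 * the 2real swizzle routines above gather one scalar from each of two j
 * particles into a single xmm register and scatter the result back again.
 */
static gmx_inline void
gmx_mm_example_scale_2real_pd(double * gmx_restrict ptrA,
                              double * gmx_restrict ptrB,
                              __m128d scale)
{
    /* lane 0 <- *ptrA, lane 1 <- *ptrB */
    __m128d v = gmx_mm_load_2real_swizzle_pd(ptrA,ptrB);

    v = _mm_mul_pd(v,scale);

    /* lane 0 -> *ptrA, lane 1 -> *ptrB */
    gmx_mm_store_2real_swizzle_pd(ptrA,ptrB,v);
}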


static gmx_inline void
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
                             const double * gmx_restrict p2,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    __m128d t1,t2;

    t1   = _mm_loadu_pd(p1);
    t2   = _mm_loadu_pd(p2);
    *c6  = _mm_unpacklo_pd(t1,t2);
    *c12 = _mm_unpackhi_pd(t1,t2);
}

static gmx_inline void
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
                             __m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    *c6     = _mm_load_sd(p1);
    *c12    = _mm_load_sd(p1+1);
}
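
/* Illustrative sketch (hypothetical data, not part of the original kernels):
 * the pair loaders above expect the nonbonded parameters stored as
 * consecutive {c6,c12} doubles per particle.
 */
static gmx_inline void
gmx_mm_example_load_c6c12_pd(__m128d * gmx_restrict c6,
                             __m128d * gmx_restrict c12)
{
    double paramsA[2] = { 1.0e-3, 2.0e-6 };  /* { c6_A, c12_A } */
    double paramsB[2] = { 3.0e-3, 4.0e-6 };  /* { c6_B, c12_B } */

    /* Afterwards *c6 = { c6_A, c6_B } and *c12 = { c12_A, c12_B } */
    gmx_mm_load_2pair_swizzle_pd(paramsA,paramsB,c6,c12);
}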



static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
        const double * gmx_restrict xyz,
        __m128d * gmx_restrict x1,
        __m128d * gmx_restrict y1,
        __m128d * gmx_restrict z1)
{
    __m128d mem_xy,mem_z,mem_sxy,mem_sz;

    mem_xy  = _mm_loadu_pd(xyz);
    mem_z   = _mm_load_sd(xyz+2);
    mem_sxy = _mm_loadu_pd(xyz_shift);
    mem_sz  = _mm_load_sd(xyz_shift+2);

    mem_xy  = _mm_add_pd(mem_xy,mem_sxy);
    mem_z   = _mm_add_pd(mem_z,mem_sz);

    *x1  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(0,0));
    *y1  = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1));
    *z1  = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0));
}
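
/* Illustrative sketch (hypothetical buffers, not part of the original
 * kernels): loading one shifted i-particle position. shiftvec points at a
 * {sx,sy,sz} shift vector and x_i at the particle's {x,y,z}; each shifted
 * coordinate is broadcast to both SIMD lanes.
 */
static gmx_inline void
gmx_mm_example_load_shifted_i_pd(const double * gmx_restrict shiftvec,
                                 const double * gmx_restrict x_i,
                                 __m128d * gmx_restrict ix,
                                 __m128d * gmx_restrict iy,
                                 __m128d * gmx_restrict iz)
{
    /* Afterwards *ix = {x+sx,x+sx}, *iy = {y+sy,y+sy}, *iz = {z+sz,z+sz} */
    gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec,x_i,ix,iy,iz);
}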


static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
        const double * gmx_restrict xyz,
        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_load_sd(xyz+8);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
    syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));

    t1  = _mm_add_pd(t1,sxy);
    t2  = _mm_add_pd(t2,szx);
    t3  = _mm_add_pd(t3,syz);
    t4  = _mm_add_pd(t4,sxy);
    t5  = _mm_add_sd(t5,sz);

    *x1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
    *y1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
    *z1  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
    *x2  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
    *y2  = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
    *z2  = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
    *x3  = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
    *y3  = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
    *z3  = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
}


static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
        const double * gmx_restrict xyz,
        __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
        __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
        __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
        __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;

    t1  = _mm_loadu_pd(xyz);
    t2  = _mm_loadu_pd(xyz+2);
    t3  = _mm_loadu_pd(xyz+4);
    t4  = _mm_loadu_pd(xyz+6);
    t5  = _mm_loadu_pd(xyz+8);
    t6  = _mm_loadu_pd(xyz+10);

    sxy = _mm_loadu_pd(xyz_shift);
    sz  = _mm_load_sd(xyz_shift+2);
    szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0));
    syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1));

    t1  = _mm_add_pd(t1,sxy);
    t2  = _mm_add_pd(t2,szx);
    t3  = _mm_add_pd(t3,syz);
    t4  = _mm_add_pd(t4,sxy);
    t5  = _mm_add_pd(t5,szx);
    t6  = _mm_add_pd(t6,syz);

    *x1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0));
    *y1  = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1));
    *z1  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0));
    *x2  = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1));
    *y2  = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0));
    *z2  = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1));
    *x3  = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0));
    *y3  = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1));
    *z3  = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0));
    *x4  = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(1,1));
    *y4  = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(0,0));
    *z4  = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(1,1));
}




static gmx_inline void
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
    *x            = _mm_load_sd(p1);
    *y            = _mm_load_sd(p1+1);
    *z            = _mm_load_sd(p1+2);
}

static gmx_inline void
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    *x1            = _mm_load_sd(p1);
    *y1            = _mm_load_sd(p1+1);
    *z1            = _mm_load_sd(p1+2);
    *x2            = _mm_load_sd(p1+3);
    *y2            = _mm_load_sd(p1+4);
    *z2            = _mm_load_sd(p1+5);
    *x3            = _mm_load_sd(p1+6);
    *y3            = _mm_load_sd(p1+7);
    *z3            = _mm_load_sd(p1+8);
}

static gmx_inline void
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    *x1            = _mm_load_sd(p1);
    *y1            = _mm_load_sd(p1+1);
    *z1            = _mm_load_sd(p1+2);
    *x2            = _mm_load_sd(p1+3);
    *y2            = _mm_load_sd(p1+4);
    *z2            = _mm_load_sd(p1+5);
    *x3            = _mm_load_sd(p1+6);
    *y3            = _mm_load_sd(p1+7);
    *z3            = _mm_load_sd(p1+8);
    *x4            = _mm_load_sd(p1+9);
    *y4            = _mm_load_sd(p1+10);
    *z4            = _mm_load_sd(p1+11);
}


static gmx_inline void
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
                                  const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
{
    __m128d t1,t2,t3,t4;
    t1           = _mm_loadu_pd(ptrA);
    t2           = _mm_loadu_pd(ptrB);
    t3           = _mm_load_sd(ptrA+2);
    t4           = _mm_load_sd(ptrB+2);
    GMX_MM_TRANSPOSE2_PD(t1,t2);
    *x1          = t1;
    *y1          = t2;
    *z1          = _mm_unpacklo_pd(t3,t4);
}


static gmx_inline void
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3)
{
    __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
    t1           = _mm_loadu_pd(ptrA);
    t2           = _mm_loadu_pd(ptrB);
    t3           = _mm_loadu_pd(ptrA+2);
    t4           = _mm_loadu_pd(ptrB+2);
    t5           = _mm_loadu_pd(ptrA+4);
    t6           = _mm_loadu_pd(ptrB+4);
    t7           = _mm_loadu_pd(ptrA+6);
    t8           = _mm_loadu_pd(ptrB+6);
    t9           = _mm_load_sd(ptrA+8);
    t10          = _mm_load_sd(ptrB+8);
    GMX_MM_TRANSPOSE2_PD(t1,t2);
    GMX_MM_TRANSPOSE2_PD(t3,t4);
    GMX_MM_TRANSPOSE2_PD(t5,t6);
    GMX_MM_TRANSPOSE2_PD(t7,t8);
    *x1          = t1;
    *y1          = t2;
    *z1          = t3;
    *x2          = t4;
    *y2          = t5;
    *z2          = t6;
    *x3          = t7;
    *y3          = t8;
    *z3          = _mm_unpacklo_pd(t9,t10);
}


static gmx_inline void
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
                                  __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
                                  __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
                                  __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3,
                                  __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4)
{
    __m128d t1,t2,t3,t4,t5,t6;
    t1           = _mm_loadu_pd(ptrA);
    t2           = _mm_loadu_pd(ptrB);
    t3           = _mm_loadu_pd(ptrA+2);
    t4           = _mm_loadu_pd(ptrB+2);
    t5           = _mm_loadu_pd(ptrA+4);
    t6           = _mm_loadu_pd(ptrB+4);
    GMX_MM_TRANSPOSE2_PD(t1,t2);
    GMX_MM_TRANSPOSE2_PD(t3,t4);
    GMX_MM_TRANSPOSE2_PD(t5,t6);
    *x1          = t1;
    *y1          = t2;
    *z1          = t3;
    *x2          = t4;
    *y2          = t5;
    *z2          = t6;
    t1           = _mm_loadu_pd(ptrA+6);
    t2           = _mm_loadu_pd(ptrB+6);
    t3           = _mm_loadu_pd(ptrA+8);
    t4           = _mm_loadu_pd(ptrB+8);
    t5           = _mm_loadu_pd(ptrA+10);
    t6           = _mm_loadu_pd(ptrB+10);
    GMX_MM_TRANSPOSE2_PD(t1,t2);
    GMX_MM_TRANSPOSE2_PD(t3,t4);
    GMX_MM_TRANSPOSE2_PD(t5,t6);
    *x3          = t1;
    *y3          = t2;
    *z3          = t3;
    *x4          = t4;
    *y4          = t5;
    *z4          = t6;
}


/* Routines to decrement rvec in memory, typically used for j particle force updates */
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
        __m128d xy, __m128d z)
{
    __m128d t1,t2;

    t1 = _mm_loadu_pd(ptrA);
    t2 = _mm_load_sd(ptrA+2);

    t1 = _mm_sub_pd(t1,xy);
    t2 = _mm_sub_sd(t2,z);

    _mm_storeu_pd(ptrA,t1);
    _mm_store_sd(ptrA+2,t2);
}


static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1,t2,t3;

    t1           = _mm_load_sd(ptrA);
    t2           = _mm_load_sd(ptrA+1);
    t3           = _mm_load_sd(ptrA+2);

    t1           = _mm_sub_sd(t1,x1);
    t2           = _mm_sub_sd(t2,y1);
    t3           = _mm_sub_sd(t3,z1);
    _mm_store_sd(ptrA,t1);
    _mm_store_sd(ptrA+1,t2);
    _mm_store_sd(ptrA+2,t3);
}


#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
{\
    __m128d _t1,_t2,_t3,_t4,_t5;\
    _t1          = _mm_loadu_pd(ptrA);\
    _t2          = _mm_loadu_pd(ptrA+2);\
    _t3          = _mm_loadu_pd(ptrA+4);\
    _t4          = _mm_loadu_pd(ptrA+6);\
    _t5          = _mm_load_sd(ptrA+8);\
    _x1          = _mm_unpacklo_pd(_x1,_y1);\
    _z1          = _mm_unpacklo_pd(_z1,_x2);\
    _y2          = _mm_unpacklo_pd(_y2,_z2);\
    _x3          = _mm_unpacklo_pd(_x3,_y3);\
    _t1          = _mm_sub_pd(_t1,_x1);\
    _t2          = _mm_sub_pd(_t2,_z1);\
    _t3          = _mm_sub_pd(_t3,_y2);\
    _t4          = _mm_sub_pd(_t4,_x3);\
    _t5          = _mm_sub_sd(_t5,_z3);\
    _mm_storeu_pd(ptrA,_t1);\
    _mm_storeu_pd(ptrA+2,_t2);\
    _mm_storeu_pd(ptrA+4,_t3);\
    _mm_storeu_pd(ptrA+6,_t4);\
    _mm_store_sd(ptrA+8,_t5);\
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1,t2,t3,t4,t5;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_load_sd(ptrA+8);

    x1          = _mm_unpacklo_pd(x1,y1);
    z1          = _mm_unpacklo_pd(z1,x2);
    y2          = _mm_unpacklo_pd(y2,z2);
    x3          = _mm_unpacklo_pd(x3,y3);
    /* nothing to be done for z3 */

    t1          = _mm_sub_pd(t1,x1);
    t2          = _mm_sub_pd(t2,z1);
    t3          = _mm_sub_pd(t3,y2);
    t4          = _mm_sub_pd(t4,x3);
    t5          = _mm_sub_sd(t5,z3);
    _mm_storeu_pd(ptrA,t1);
    _mm_storeu_pd(ptrA+2,t2);
    _mm_storeu_pd(ptrA+4,t3);
    _mm_storeu_pd(ptrA+6,t4);
    _mm_store_sd(ptrA+8,t5);
}
#endif


#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
{\
    __m128d _t1,_t2,_t3,_t4,_t5,_t6;\
    _t1          = _mm_loadu_pd(ptrA);\
    _t2          = _mm_loadu_pd(ptrA+2);\
    _t3          = _mm_loadu_pd(ptrA+4);\
    _t4          = _mm_loadu_pd(ptrA+6);\
    _t5          = _mm_loadu_pd(ptrA+8);\
    _t6          = _mm_loadu_pd(ptrA+10);\
    _x1          = _mm_unpacklo_pd(_x1,_y1);\
    _z1          = _mm_unpacklo_pd(_z1,_x2);\
    _y2          = _mm_unpacklo_pd(_y2,_z2);\
    _x3          = _mm_unpacklo_pd(_x3,_y3);\
    _z3          = _mm_unpacklo_pd(_z3,_x4);\
    _y4          = _mm_unpacklo_pd(_y4,_z4);\
    _mm_storeu_pd(ptrA,    _mm_sub_pd( _t1,_x1 ));\
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( _t2,_z1 ));\
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( _t3,_y2 ));\
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( _t4,_x3 ));\
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( _t5,_z3 ));\
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6,_y4 ));\
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1,t2,t3,t4,t5,t6;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_loadu_pd(ptrA+8);
    t6          = _mm_loadu_pd(ptrA+10);

    x1          = _mm_unpacklo_pd(x1,y1);
    z1          = _mm_unpacklo_pd(z1,x2);
    y2          = _mm_unpacklo_pd(y2,z2);
    x3          = _mm_unpacklo_pd(x3,y3);
    z3          = _mm_unpacklo_pd(z3,x4);
    y4          = _mm_unpacklo_pd(y4,z4);

    _mm_storeu_pd(ptrA,    _mm_sub_pd( t1,x1 ));
    _mm_storeu_pd(ptrA+2,  _mm_sub_pd( t2,z1 ));
    _mm_storeu_pd(ptrA+4,  _mm_sub_pd( t3,y2 ));
    _mm_storeu_pd(ptrA+6,  _mm_sub_pd( t4,x3 ));
    _mm_storeu_pd(ptrA+8,  _mm_sub_pd( t5,z3 ));
    _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 ));
}
#endif


static gmx_inline void
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1)
{
    __m128d t1,t2,t3,t4,t5,t6,t7;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_load_sd(ptrA+2);
    t3          = _mm_loadu_pd(ptrB);
    t4          = _mm_load_sd(ptrB+2);

    t5          = _mm_unpacklo_pd(x1,y1);
    t6          = _mm_unpackhi_pd(x1,y1);
    t7          = _mm_unpackhi_pd(z1,z1);

    t1          = _mm_sub_pd(t1,t5);
    t2          = _mm_sub_sd(t2,z1);

    t3          = _mm_sub_pd(t3,t6);
    t4          = _mm_sub_sd(t4,t7);

    _mm_storeu_pd(ptrA,t1);
    _mm_store_sd(ptrA+2,t2);
    _mm_storeu_pd(ptrB,t3);
    _mm_store_sd(ptrB+2,t4);
}
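
/* Illustrative sketch (hypothetical helper, not part of the original
 * kernels): a typical use of the decrement routines for a j-particle force
 * update. fscal holds the scalar force for two j particles (one per lane);
 * the force components are accumulated on the i particle in registers and
 * subtracted from the two j particles directly in memory.
 */
static gmx_inline void
gmx_mm_example_accumulate_pair_force_pd(double * gmx_restrict fjptrA,
                                        double * gmx_restrict fjptrB,
                                        __m128d fscal,
                                        __m128d dx, __m128d dy, __m128d dz,
                                        __m128d * gmx_restrict fix,
                                        __m128d * gmx_restrict fiy,
                                        __m128d * gmx_restrict fiz)
{
    __m128d tx = _mm_mul_pd(fscal,dx);
    __m128d ty = _mm_mul_pd(fscal,dy);
    __m128d tz = _mm_mul_pd(fscal,dz);

    /* i-particle accumulators stay in registers during the j loop */
    *fix = _mm_add_pd(*fix,tx);
    *fiy = _mm_add_pd(*fiy,ty);
    *fiz = _mm_add_pd(*fiz,tz);

    /* subtract the same contributions from the two j particles in memory */
    gmx_mm_decrement_1rvec_2ptr_swizzle_pd(fjptrA,fjptrB,tx,ty,tz);
}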

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
{\
    __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
    __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI;\
    _t1          = _mm_loadu_pd(ptrA);\
    _t2          = _mm_loadu_pd(ptrA+2);\
    _t3          = _mm_loadu_pd(ptrA+4);\
    _t4          = _mm_loadu_pd(ptrA+6);\
    _t5          = _mm_load_sd(ptrA+8);\
    _t6          = _mm_loadu_pd(ptrB);\
    _t7          = _mm_loadu_pd(ptrB+2);\
    _t8          = _mm_loadu_pd(ptrB+4);\
    _t9          = _mm_loadu_pd(ptrB+6);\
    _t10         = _mm_load_sd(ptrB+8);\
    _tA          = _mm_unpacklo_pd(_x1,_y1);\
    _tB          = _mm_unpackhi_pd(_x1,_y1);\
    _tC          = _mm_unpacklo_pd(_z1,_x2);\
    _tD          = _mm_unpackhi_pd(_z1,_x2);\
    _tE          = _mm_unpacklo_pd(_y2,_z2);\
    _tF          = _mm_unpackhi_pd(_y2,_z2);\
    _tG          = _mm_unpacklo_pd(_x3,_y3);\
    _tH          = _mm_unpackhi_pd(_x3,_y3);\
    _tI          = _mm_unpackhi_pd(_z3,_z3);\
    _t1          = _mm_sub_pd(_t1,_tA);\
    _t2          = _mm_sub_pd(_t2,_tC);\
    _t3          = _mm_sub_pd(_t3,_tE);\
    _t4          = _mm_sub_pd(_t4,_tG);\
    _t5          = _mm_sub_sd(_t5,_z3);\
    _t6          = _mm_sub_pd(_t6,_tB);\
    _t7          = _mm_sub_pd(_t7,_tD);\
    _t8          = _mm_sub_pd(_t8,_tF);\
    _t9          = _mm_sub_pd(_t9,_tH);\
    _t10         = _mm_sub_sd(_t10,_tI);\
    _mm_storeu_pd(ptrA,_t1);\
    _mm_storeu_pd(ptrA+2,_t2);\
    _mm_storeu_pd(ptrA+4,_t3);\
    _mm_storeu_pd(ptrA+6,_t4);\
    _mm_store_sd(ptrA+8,_t5);\
    _mm_storeu_pd(ptrB,_t6);\
    _mm_storeu_pd(ptrB+2,_t7);\
    _mm_storeu_pd(ptrB+4,_t8);\
    _mm_storeu_pd(ptrB+6,_t9);\
    _mm_store_sd(ptrB+8,_t10);\
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3)
{
    __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
    __m128d tA,tB,tC,tD,tE,tF,tG,tH,tI;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_load_sd(ptrA+8);
    t6          = _mm_loadu_pd(ptrB);
    t7          = _mm_loadu_pd(ptrB+2);
    t8          = _mm_loadu_pd(ptrB+4);
    t9          = _mm_loadu_pd(ptrB+6);
    t10         = _mm_load_sd(ptrB+8);

    tA          = _mm_unpacklo_pd(x1,y1);
    tB          = _mm_unpackhi_pd(x1,y1);
    tC          = _mm_unpacklo_pd(z1,x2);
    tD          = _mm_unpackhi_pd(z1,x2);
    tE          = _mm_unpacklo_pd(y2,z2);
    tF          = _mm_unpackhi_pd(y2,z2);
    tG          = _mm_unpacklo_pd(x3,y3);
    tH          = _mm_unpackhi_pd(x3,y3);
    tI          = _mm_unpackhi_pd(z3,z3);

    t1          = _mm_sub_pd(t1,tA);
    t2          = _mm_sub_pd(t2,tC);
    t3          = _mm_sub_pd(t3,tE);
    t4          = _mm_sub_pd(t4,tG);
    t5          = _mm_sub_sd(t5,z3);

    t6          = _mm_sub_pd(t6,tB);
    t7          = _mm_sub_pd(t7,tD);
    t8          = _mm_sub_pd(t8,tF);
    t9          = _mm_sub_pd(t9,tH);
    t10         = _mm_sub_sd(t10,tI);

    _mm_storeu_pd(ptrA,t1);
    _mm_storeu_pd(ptrA+2,t2);
    _mm_storeu_pd(ptrA+4,t3);
    _mm_storeu_pd(ptrA+6,t4);
    _mm_store_sd(ptrA+8,t5);
    _mm_storeu_pd(ptrB,t6);
    _mm_storeu_pd(ptrB+2,t7);
    _mm_storeu_pd(ptrB+4,t8);
    _mm_storeu_pd(ptrB+6,t9);
    _mm_store_sd(ptrB+8,t10);
}
#endif


#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA,ptrB,_x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
{\
    __m128d _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12;\
    __m128d _tA,_tB,_tC,_tD,_tE,_tF,_tG,_tH,_tI,_tJ,_tK,_tL;\
    _t1          = _mm_loadu_pd(ptrA);\
    _t2          = _mm_loadu_pd(ptrA+2);\
    _t3          = _mm_loadu_pd(ptrA+4);\
    _t4          = _mm_loadu_pd(ptrA+6);\
    _t5          = _mm_loadu_pd(ptrA+8);\
    _t6          = _mm_loadu_pd(ptrA+10);\
    _t7          = _mm_loadu_pd(ptrB);\
    _t8          = _mm_loadu_pd(ptrB+2);\
    _t9          = _mm_loadu_pd(ptrB+4);\
    _t10         = _mm_loadu_pd(ptrB+6);\
    _t11         = _mm_loadu_pd(ptrB+8);\
    _t12         = _mm_loadu_pd(ptrB+10);\
    _tA          = _mm_unpacklo_pd(_x1,_y1);\
    _tB          = _mm_unpackhi_pd(_x1,_y1);\
    _tC          = _mm_unpacklo_pd(_z1,_x2);\
    _tD          = _mm_unpackhi_pd(_z1,_x2);\
    _tE          = _mm_unpacklo_pd(_y2,_z2);\
    _tF          = _mm_unpackhi_pd(_y2,_z2);\
    _tG          = _mm_unpacklo_pd(_x3,_y3);\
    _tH          = _mm_unpackhi_pd(_x3,_y3);\
    _tI          = _mm_unpacklo_pd(_z3,_x4);\
    _tJ          = _mm_unpackhi_pd(_z3,_x4);\
    _tK          = _mm_unpacklo_pd(_y4,_z4);\
    _tL          = _mm_unpackhi_pd(_y4,_z4);\
    _t1          = _mm_sub_pd(_t1,_tA);\
    _t2          = _mm_sub_pd(_t2,_tC);\
    _t3          = _mm_sub_pd(_t3,_tE);\
    _t4          = _mm_sub_pd(_t4,_tG);\
    _t5          = _mm_sub_pd(_t5,_tI);\
    _t6          = _mm_sub_pd(_t6,_tK);\
    _t7          = _mm_sub_pd(_t7,_tB);\
    _t8          = _mm_sub_pd(_t8,_tD);\
    _t9          = _mm_sub_pd(_t9,_tF);\
    _t10         = _mm_sub_pd(_t10,_tH);\
    _t11         = _mm_sub_pd(_t11,_tJ);\
    _t12         = _mm_sub_pd(_t12,_tL);\
    _mm_storeu_pd(ptrA,  _t1);\
    _mm_storeu_pd(ptrA+2,_t2);\
    _mm_storeu_pd(ptrA+4,_t3);\
    _mm_storeu_pd(ptrA+6,_t4);\
    _mm_storeu_pd(ptrA+8,_t5);\
    _mm_storeu_pd(ptrA+10,_t6);\
    _mm_storeu_pd(ptrB,  _t7);\
    _mm_storeu_pd(ptrB+2,_t8);\
    _mm_storeu_pd(ptrB+4,_t9);\
    _mm_storeu_pd(ptrB+6,_t10);\
    _mm_storeu_pd(ptrB+8,_t11);\
    _mm_storeu_pd(ptrB+10,_t12);\
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                       __m128d x1, __m128d y1, __m128d z1,
                                       __m128d x2, __m128d y2, __m128d z2,
                                       __m128d x3, __m128d y3, __m128d z3,
                                       __m128d x4, __m128d y4, __m128d z4)
{
    __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
    __m128d tA,tB,tC,tD,tE,tF,tG,tH,tI,tJ,tK,tL;

    t1          = _mm_loadu_pd(ptrA);
    t2          = _mm_loadu_pd(ptrA+2);
    t3          = _mm_loadu_pd(ptrA+4);
    t4          = _mm_loadu_pd(ptrA+6);
    t5          = _mm_loadu_pd(ptrA+8);
    t6          = _mm_loadu_pd(ptrA+10);
    t7          = _mm_loadu_pd(ptrB);
    t8          = _mm_loadu_pd(ptrB+2);
    t9          = _mm_loadu_pd(ptrB+4);
    t10         = _mm_loadu_pd(ptrB+6);
    t11         = _mm_loadu_pd(ptrB+8);
    t12         = _mm_loadu_pd(ptrB+10);

    tA          = _mm_unpacklo_pd(x1,y1);
    tB          = _mm_unpackhi_pd(x1,y1);
    tC          = _mm_unpacklo_pd(z1,x2);
    tD          = _mm_unpackhi_pd(z1,x2);
    tE          = _mm_unpacklo_pd(y2,z2);
    tF          = _mm_unpackhi_pd(y2,z2);
    tG          = _mm_unpacklo_pd(x3,y3);
    tH          = _mm_unpackhi_pd(x3,y3);
    tI          = _mm_unpacklo_pd(z3,x4);
    tJ          = _mm_unpackhi_pd(z3,x4);
    tK          = _mm_unpacklo_pd(y4,z4);
    tL          = _mm_unpackhi_pd(y4,z4);

    t1          = _mm_sub_pd(t1,tA);
    t2          = _mm_sub_pd(t2,tC);
    t3          = _mm_sub_pd(t3,tE);
    t4          = _mm_sub_pd(t4,tG);
    t5          = _mm_sub_pd(t5,tI);
    t6          = _mm_sub_pd(t6,tK);

    t7          = _mm_sub_pd(t7,tB);
    t8          = _mm_sub_pd(t8,tD);
    t9          = _mm_sub_pd(t9,tF);
    t10         = _mm_sub_pd(t10,tH);
    t11         = _mm_sub_pd(t11,tJ);
    t12         = _mm_sub_pd(t12,tL);

    _mm_storeu_pd(ptrA,  t1);
    _mm_storeu_pd(ptrA+2,t2);
    _mm_storeu_pd(ptrA+4,t3);
    _mm_storeu_pd(ptrA+6,t4);
    _mm_storeu_pd(ptrA+8,t5);
    _mm_storeu_pd(ptrA+10,t6);
    _mm_storeu_pd(ptrB,  t7);
    _mm_storeu_pd(ptrB+2,t8);
    _mm_storeu_pd(ptrB+4,t9);
    _mm_storeu_pd(ptrB+6,t10);
    _mm_storeu_pd(ptrB+8,t11);
    _mm_storeu_pd(ptrB+10,t12);
}
#endif




static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1;

    /* transpose data */
    t1 = fix1;
    fix1 = _mm_unpacklo_pd(fix1,fiy1); /* y0 x0 */
    fiy1 = _mm_unpackhi_pd(t1,fiy1);   /* y1 x1 */

    fix1 = _mm_add_pd(fix1,fiy1);
    fiz1 = _mm_add_sd( fiz1, _mm_unpackhi_pd(fiz1,fiz1 ));

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 ));

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
                                              fptr,fshiftptr) \
{\
    __m128d _t1,_t2;\
    GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
    GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
    GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
    _t1 = fix3;\
    fix3 = _mm_unpacklo_pd(fix3,fiy3);\
    fiy3 = _mm_unpackhi_pd(_t1,fiy3);\
    fix1 = _mm_add_pd(fix1,fiy1);\
    fiz1 = _mm_add_pd(fiz1,fix2);\
    fiy2 = _mm_add_pd(fiy2,fiz2);\
    fix3 = _mm_add_pd(fix3,fiy3);\
    fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3,fiz3));\
    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));\
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));\
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));\
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));\
    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));\
    fix1 = _mm_add_pd(fix1,fix3);\
    _t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
    fix1 = _mm_add_pd(fix1,_t1);\
    _t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));\
    fiz1 = _mm_add_sd(fiz1,fiz3);\
    fiz1 = _mm_add_sd(fiz1,_t2);\
    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1,t2;

    /* transpose data */
    GMX_MM_TRANSPOSE2_PD(fix1,fiy1);
    GMX_MM_TRANSPOSE2_PD(fiz1,fix2);
    GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);
    t1 = fix3;
    fix3 = _mm_unpacklo_pd(fix3,fiy3); /* y0 x0 */
    fiy3 = _mm_unpackhi_pd(t1,fiy3);   /* y1 x1 */

    fix1 = _mm_add_pd(fix1,fiy1);
    fiz1 = _mm_add_pd(fiz1,fix2);
    fiy2 = _mm_add_pd(fiy2,fiz2);

    fix3 = _mm_add_pd(fix3,fiy3);
    fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3,fiz3));

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 ));
    _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 ));

    fix1 = _mm_add_pd(fix1,fix3);
    t1   = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
    fix1 = _mm_add_pd(fix1,t1); /* x and y sums */

    t2   = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1));
    fiz1 = _mm_add_sd(fiz1,fiz3);
    fiz1 = _mm_add_sd(fiz1,t2); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif


#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
                                              fptr,fshiftptr) \
{\
    __m128d _t1,_t2;\
    GMX_MM_TRANSPOSE2_PD(fix1,fiy1);\
    GMX_MM_TRANSPOSE2_PD(fiz1,fix2);\
    GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);\
    GMX_MM_TRANSPOSE2_PD(fix3,fiy3);\
    GMX_MM_TRANSPOSE2_PD(fiz3,fix4);\
    GMX_MM_TRANSPOSE2_PD(fiy4,fiz4);\
    fix1 = _mm_add_pd(fix1,fiy1);\
    fiz1 = _mm_add_pd(fiz1,fix2);\
    fiy2 = _mm_add_pd(fiy2,fiz2);\
    fix3 = _mm_add_pd(fix3,fiy3);\
    fiz3 = _mm_add_pd(fiz3,fix4);\
    fiy4 = _mm_add_pd(fiy4,fiz4);\
    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));\
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));\
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));\
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));\
    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));\
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));\
    _t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));\
    fix1 = _mm_add_pd(fix1,_t1);\
    _t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));\
    fix3 = _mm_add_pd(fix3,_t2);\
    fix1 = _mm_add_pd(fix1,fix3);\
    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));\
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));\
    fiz1 = _mm_add_sd(fiz1,fiz3);\
    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));\
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));\
}
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                      __m128d fix2, __m128d fiy2, __m128d fiz2,
                                      __m128d fix3, __m128d fiy3, __m128d fiz3,
                                      __m128d fix4, __m128d fiy4, __m128d fiz4,
                                      double * gmx_restrict fptr,
                                      double * gmx_restrict fshiftptr)
{
    __m128d t1,t2;

    /* transpose data */
    GMX_MM_TRANSPOSE2_PD(fix1,fiy1);
    GMX_MM_TRANSPOSE2_PD(fiz1,fix2);
    GMX_MM_TRANSPOSE2_PD(fiy2,fiz2);
    GMX_MM_TRANSPOSE2_PD(fix3,fiy3);
    GMX_MM_TRANSPOSE2_PD(fiz3,fix4);
    GMX_MM_TRANSPOSE2_PD(fiy4,fiz4);

    fix1 = _mm_add_pd(fix1,fiy1);
    fiz1 = _mm_add_pd(fiz1,fix2);
    fiy2 = _mm_add_pd(fiy2,fiz2);
    fix3 = _mm_add_pd(fix3,fiy3);
    fiz3 = _mm_add_pd(fiz3,fix4);
    fiy4 = _mm_add_pd(fiy4,fiz4);

    _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr),       fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2),   fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4),   fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6),   fix3 ));
    _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8),   fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 ));

    t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1));
    fix1 = _mm_add_pd(fix1,t1);
    t2 = _mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1));
    fix3 = _mm_add_pd(fix3,t2);
    fix1 = _mm_add_pd(fix1,fix3); /* x and y sums */

    fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2));
    fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4));
    fiz1 = _mm_add_sd(fiz1,fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
    _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
#endif


static gmx_inline void
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
    pot1 = _mm_add_pd(pot1, _mm_unpackhi_pd(pot1,pot1));
    _mm_store_sd(ptrA,_mm_add_sd(pot1,_mm_load_sd(ptrA)));
}

static gmx_inline void
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
                      __m128d pot2, double * gmx_restrict ptrB)
{
    GMX_MM_TRANSPOSE2_PD(pot1,pot2);
    pot1 = _mm_add_pd(pot1,pot2);
    pot2 = _mm_unpackhi_pd(pot1,pot1);

    _mm_store_sd(ptrA,_mm_add_sd(pot1,_mm_load_sd(ptrA)));
    _mm_store_sd(ptrB,_mm_add_sd(pot2,_mm_load_sd(ptrB)));
}
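
/* Illustrative sketch (hypothetical kernel epilogue, not part of the original
 * kernels): after the j loop, a single-site i particle flushes its force
 * accumulators and the two energy accumulators. The pointer names are
 * assumptions for this example.
 */
static gmx_inline void
gmx_mm_example_kernel_epilogue_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
                                  __m128d velecsum, __m128d vvdwsum,
                                  double * gmx_restrict f,
                                  double * gmx_restrict fshift,
                                  double * gmx_restrict velec,
                                  double * gmx_restrict vvdw)
{
    /* Reduce the two lanes of each force component, add to f and fshift */
    gmx_mm_update_iforce_1atom_swizzle_pd(fix1,fiy1,fiz1,f,fshift);

    /* Reduce the two lanes of each energy register and accumulate */
    gmx_mm_update_1pot_pd(velecsum,velec);
    gmx_mm_update_1pot_pd(vvdwsum,vvdw);
}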


#endif /* _kernelutil_x86_sse2_double_h_ */