1 /*
2  *                This source code is part of
3  *
4  *                 G   R   O   M   A   C   S
5  *
6  * Copyright (c) 2011-2012, The GROMACS Development Team
7  *
8  * Gromacs is a library for molecular simulation and trajectory analysis,
9  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
10  * a full list of developers and information, check out http://www.gromacs.org
11  *
12  * This program is free software; you can redistribute it and/or modify it under 
13  * the terms of the GNU Lesser General Public License as published by the Free 
14  * Software Foundation; either version 2 of the License, or (at your option) any 
15  * later version.
16  * As a special exception, you may use this file as part of a free software
17  * library without restriction.  Specifically, if other files instantiate
18  * templates or use macros or inline functions from this file, or you compile
19  * this file and link it with other files to produce an executable, this
20  * file does not by itself cause the resulting executable to be covered by
21  * the GNU Lesser General Public License.  
22  *
23  * In plain-speak: do not worry about classes/macros/templates either - only
24  * changes to the library have to be LGPL, not an application linking with it.
25  *
26  * To help fund GROMACS development, we humbly ask that you cite
27  * the papers people have written on it - you can find them on the website!
28  */
29 #ifndef _kernelutil_x86_sse2_single_h_
30 #define _kernelutil_x86_sse2_single_h_
31
32 /* We require SSE2 now! */
33
34 #include <math.h> 
35
36 #include "gmx_x86_sse2.h"
37
38
39 /* Normal sum of four xmm registers */
40 #define gmx_mm_sum4_ps(t0,t1,t2,t3)  _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
41
42 static gmx_inline __m128
43 gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
44 {
45     return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx), _mm_mul_ps(dy,dy) ), _mm_mul_ps(dz,dz) );
46 }
47
48 static int
49 gmx_mm_any_lt(__m128 a, __m128 b)
50 {
51     return _mm_movemask_ps(_mm_cmplt_ps(a,b));
52 }
53
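/* Illustrative usage sketch (not compiled into this header): computing four
 * squared distances and testing them against a cutoff with the helpers above.
 * The i/j coordinate registers and rcutsq are hypothetical inputs, not
 * defined in this file.
 */
#if 0
static int
example_any_pair_within_cutoff(__m128 ix, __m128 iy, __m128 iz,
                               __m128 jx, __m128 jy, __m128 jz, __m128 rcutsq)
{
    __m128 dx  = _mm_sub_ps(ix, jx);
    __m128 dy  = _mm_sub_ps(iy, jy);
    __m128 dz  = _mm_sub_ps(iz, jz);
    __m128 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);

    /* Nonzero if any of the four distances is below the cutoff */
    return gmx_mm_any_lt(rsq, rcutsq);
}
#endif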
54 /* Load a single value from 1-4 places, merge into xmm register */
55
56 static __m128
57 gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
58                              const float * gmx_restrict ptrB,
59                              const float * gmx_restrict ptrC,
60                              const float * gmx_restrict ptrD)
61 {
62     __m128 t1,t2;
63
64     t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
65     t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
66     return _mm_unpacklo_ps(t1,t2);
67 }
68
69 static void
70 gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
71                               float * gmx_restrict ptrB,
72                               float * gmx_restrict ptrC,
73                               float * gmx_restrict ptrD,
74                               __m128 xmm1)
75 {
76     __m128 t2,t3,t4;
77
78     t3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);
79     t2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
80     t4       = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
81     _mm_store_ss(ptrA,xmm1);
82     _mm_store_ss(ptrB,t2);
83     _mm_store_ss(ptrC,t3);
84     _mm_store_ss(ptrD,t4);
85 }
86
87 /* Similar to the store above, but increments the values already in memory */
88 static void
89 gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
90                                   float * gmx_restrict ptrB,
91                                   float * gmx_restrict ptrC,
92                                   float * gmx_restrict ptrD, __m128 xmm1)
93 {
94     __m128 tmp;
95
96     tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
97     tmp = _mm_add_ps(tmp,xmm1);
98     gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
99 }
100
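/* Illustrative usage sketch for the swizzled single-value routines above.
 * It is not compiled into the header; charge[] and the jnrA..jnrD indices
 * are hypothetical names, not part of this file.
 */
#if 0
static __m128
example_gather_j_charges(const float *charge, int jnrA, int jnrB, int jnrC, int jnrD)
{
    /* Pick up one float per j atom and merge them into a single register */
    return gmx_mm_load_4real_swizzle_ps(charge+jnrA, charge+jnrB,
                                        charge+jnrC, charge+jnrD);
}
#endif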
101
102 static void
103 gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
104                              const float * gmx_restrict p2,
105                              const float * gmx_restrict p3,
106                              const float * gmx_restrict p4,
107                              __m128 * gmx_restrict c6,
108                              __m128 * gmx_restrict c12)
109 {
110     __m128 t1,t2,t3,t4;
111
112     t1   = _mm_castpd_ps(_mm_load_sd((const double *)p1));
113     t2   = _mm_castpd_ps(_mm_load_sd((const double *)p2));
114     t3   = _mm_castpd_ps(_mm_load_sd((const double *)p3));
115     t4   = _mm_castpd_ps(_mm_load_sd((const double *)p4));
116     t1   = _mm_unpacklo_ps(t1,t2);
117     t2   = _mm_unpacklo_ps(t3,t4);
118     *c6  = _mm_movelh_ps(t1,t2);
119     *c12 = _mm_movehl_ps(t2,t1);
120 }
121
122 /* Routines to load 1-4 rvecs from 1 or 4 places.
123  * We mainly use these to load coordinates. The extra routines are very
124  * efficient for the water-water loops, since we e.g. know that a TIP4P
125  * water has 4 atoms, so we can load 12 floats and shuffle (see the sketch below).
126  */
127
128
129 static void
130 gmx_mm_load_1rvec_broadcast_ps(float *ptrA, __m128 *x, __m128 *y, __m128 *z)
131 {
132     __m128 t1;
133
134     t1 = _mm_loadu_ps(ptrA);
135
136     *x = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
137     *y = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
138     *z = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
139 }
140
141 static void
142 gmx_mm_load_3rvec_broadcast_ps(float *ptrA,
143                                __m128 *x1, __m128 *y1, __m128 *z1,
144                                __m128 *x2, __m128 *y2, __m128 *z2,
145                                __m128 *x3, __m128 *y3, __m128 *z3)
146 {
147     __m128 t1,t2,t3;
148
149     t1 = _mm_loadu_ps(ptrA);
150     t2 = _mm_loadu_ps(ptrA+4);
151
152     *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
153     *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
154     *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
155     *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
156     *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
157     *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
158     *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
159     *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
160
161     t3  = _mm_load_ss(ptrA+8);
162     *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
163 }
164
165 static void
166 gmx_mm_load_4rvec_broadcast_ps(float *ptrA,
167                                __m128 *x1, __m128 *y1, __m128 *z1,
168                                __m128 *x2, __m128 *y2, __m128 *z2,
169                                __m128 *x3, __m128 *y3, __m128 *z3,
170                                __m128 *x4, __m128 *y4, __m128 *z4)
171 {
172     __m128 t1,t2,t3;
174
175     t1 = _mm_loadu_ps(ptrA);
176     t2 = _mm_loadu_ps(ptrA+4);
177     t3 = _mm_loadu_ps(ptrA+8);
178
179     *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
180     *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
181     *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
182     *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
183     *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
184     *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
185     *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
186     *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
187     *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
188     *x4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
189     *y4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(2,2,2,2));
190     *z4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(3,3,3,3));
191 }
192
193
194 static void
195 gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
196                                   const float * gmx_restrict ptrB,
197                                   const float * gmx_restrict ptrC,
198                                   const float * gmx_restrict ptrD,
199                                   __m128 *      gmx_restrict x1,
200                                   __m128 *      gmx_restrict y1,
201                                   __m128 *      gmx_restrict z1)
202 {
203     __m128 t1,t2,t3,t4,t5,t6,t7,t8;
204     t1   = _mm_castpd_ps(_mm_load_sd((const double *)ptrA));
205     t2   = _mm_castpd_ps(_mm_load_sd((const double *)ptrB));
206     t3   = _mm_castpd_ps(_mm_load_sd((const double *)ptrC));
207     t4   = _mm_castpd_ps(_mm_load_sd((const double *)ptrD));
208     t5 = _mm_load_ss(ptrA+2);
209     t6 = _mm_load_ss(ptrB+2);
210     t7 = _mm_load_ss(ptrC+2);
211     t8 = _mm_load_ss(ptrD+2);
212     t1 = _mm_unpacklo_ps(t1,t2);
213     t3 = _mm_unpacklo_ps(t3,t4);
214     *x1 = _mm_movelh_ps(t1,t3);
215     *y1 = _mm_movehl_ps(t3,t1);
216     t5  = _mm_unpacklo_ps(t5,t6);
217     t7  = _mm_unpacklo_ps(t7,t8);
218     *z1 = _mm_movelh_ps(t5,t7);
219 }
220
221
222 static void
223 gmx_mm_load_2rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
224                                   __m128 *x1, __m128 *y1, __m128 *z1,
225                                   __m128 *x2, __m128 *y2, __m128 *z2)
226 {
227     __m128 t1,t2,t3,t4;
228     t1            = _mm_loadu_ps(ptrA);
229     t2            = _mm_loadu_ps(ptrB);
230     t3            = _mm_loadu_ps(ptrC);
231     t4            = _mm_loadu_ps(ptrD);
232     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
233     *x1           = t1;
234     *y1           = t2;
235     *z1           = t3;
236     *x2           = t4;
237     t1            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrA+4));
238     t2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrB+4));
239     t3            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrC+4));
240     t4            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrD+4));
241     t1            = _mm_unpacklo_ps(t1,t3);
242     t2            = _mm_unpacklo_ps(t2,t4);
243     *y2           = _mm_unpacklo_ps(t1,t2);
244     *z2           = _mm_unpackhi_ps(t1,t2);
245 }
246
247
248 static void
249 gmx_mm_load_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
250                                   __m128 *x1, __m128 *y1, __m128 *z1,
251                                   __m128 *x2, __m128 *y2, __m128 *z2,
252                                   __m128 *x3, __m128 *y3, __m128 *z3) 
253 {
254     __m128 t1,t2,t3,t4;
255     t1            = _mm_loadu_ps(ptrA);
256     t2            = _mm_loadu_ps(ptrB);
257     t3            = _mm_loadu_ps(ptrC);
258     t4            = _mm_loadu_ps(ptrD);
259     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
260     *x1           = t1;
261     *y1           = t2;
262     *z1           = t3;
263     *x2           = t4;
264     t1            = _mm_loadu_ps(ptrA+4);
265     t2            = _mm_loadu_ps(ptrB+4);
266     t3            = _mm_loadu_ps(ptrC+4);
267     t4            = _mm_loadu_ps(ptrD+4);
268     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
269     *y2           = t1;
270     *z2           = t2;
271     *x3           = t3;
272     *y3           = t4;
273     t1            = _mm_load_ss(ptrA+8);
274     t2            = _mm_load_ss(ptrB+8);
275     t3            = _mm_load_ss(ptrC+8);
276     t4            = _mm_load_ss(ptrD+8);
277     t1            = _mm_unpacklo_ps(t1,t3);
278     t3            = _mm_unpacklo_ps(t2,t4);
279     *z3           = _mm_unpacklo_ps(t1,t3);
280 }
281
282
283 static void
284 gmx_mm_load_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
285                                   __m128 *x1, __m128 *y1, __m128 *z1,
286                                   __m128 *x2, __m128 *y2, __m128 *z2,
287                                   __m128 *x3, __m128 *y3, __m128 *z3,
288                                   __m128 *x4, __m128 *y4, __m128 *z4) 
289 {
290     __m128 t1,t2,t3,t4;
291     t1            = _mm_loadu_ps(ptrA);
292     t2            = _mm_loadu_ps(ptrB);
293     t3            = _mm_loadu_ps(ptrC);
294     t4            = _mm_loadu_ps(ptrD);
295     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
296     *x1           = t1;
297     *y1           = t2;
298     *z1           = t3;
299     *x2           = t4;
300     t1            = _mm_loadu_ps(ptrA+4);
301     t2            = _mm_loadu_ps(ptrB+4);
302     t3            = _mm_loadu_ps(ptrC+4);
303     t4            = _mm_loadu_ps(ptrD+4);
304     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
305     *y2           = t1;
306     *z2           = t2;
307     *x3           = t3;
308     *y3           = t4;
309     t1            = _mm_loadu_ps(ptrA+8);
310     t2            = _mm_loadu_ps(ptrB+8);
311     t3            = _mm_loadu_ps(ptrC+8);
312     t4            = _mm_loadu_ps(ptrD+8);
313     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
314     *z3           = t1;
315     *x4           = t2;
316     *y4           = t3;
317     *z4           = t4;
318 }
319
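/* Illustrative usage sketch (not compiled): loading all 4 atoms of four
 * TIP4P-like j waters with the routine above. xA..xD are hypothetical
 * pointers to the first coordinate of each water; the 12 coordinates of a
 * water are assumed to be stored consecutively.
 */
#if 0
static void
example_load_four_j_waters(float *xA, float *xB, float *xC, float *xD)
{
    __m128 jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4;

    gmx_mm_load_4rvec_4ptr_swizzle_ps(xA, xB, xC, xD,
                                      &jx1, &jy1, &jz1, &jx2, &jy2, &jz2,
                                      &jx3, &jy3, &jz3, &jx4, &jy4, &jz4);
    /* jx1..jz4 now hold each coordinate of the four waters, one water per lane */
}
#endif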
320
321 /* Routines to increment rvecs in memory, typically used for j particle force updates */
322 static void
323 gmx_mm_increment_1rvec_1ptr_noswizzle_ps(float *ptrA, __m128 xyz)
324 {
325     __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
326     __m128 t1;
327
328     t1  = _mm_loadu_ps(ptrA);
329     xyz = _mm_and_ps(mask,xyz);
330     t1  = _mm_add_ps(t1,xyz);
331     _mm_storeu_ps(ptrA,t1);
332 }
333
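/* Illustrative usage sketch (not compiled): adding one packed (x,y,z,-) force
 * to the force array of a single atom. f[] and jnr are hypothetical; forces
 * are assumed to be stored as 3 consecutive floats per atom.
 */
#if 0
static void
example_add_force_to_one_atom(float *f, int jnr, __m128 fxyz)
{
    /* Only the three lowest elements of fxyz are added; the fourth is masked out */
    gmx_mm_increment_1rvec_1ptr_noswizzle_ps(f + 3*jnr, fxyz);
}
#endif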
334
335 static void
336 gmx_mm_increment_3rvec_1ptr_noswizzle_ps(float *ptrA, 
337                                          __m128 xyz1, __m128 xyz2, __m128 xyz3)
338 {
339     __m128 t1,t2,t3,t4;
340     __m128 tA,tB,tC;
341
342     tA   = _mm_loadu_ps(ptrA);
343     tB   = _mm_loadu_ps(ptrA+4);
344     tC   = _mm_load_ss(ptrA+8);
345
346     t1   = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
347     t2   = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
348
349     t3   = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
350     t3   = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
351
352     t4   = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
353
354     tA   = _mm_add_ps(tA,t3);
355     tB   = _mm_add_ps(tB,t4);
356     tC   = _mm_add_ss(tC,t2);
357
358     _mm_storeu_ps(ptrA,tA);
359     _mm_storeu_ps(ptrA+4,tB);
360     _mm_store_ss(ptrA+8,tC);
361 }
362
363 static void
364 gmx_mm_increment_4rvec_1ptr_noswizzle_ps(float *ptrA, 
365                                          __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
366 {
367     __m128 t1,t2,t3,t4,t5;
368     __m128 tA,tB,tC;
369
370     tA   = _mm_loadu_ps(ptrA);
371     tB   = _mm_loadu_ps(ptrA+4);
372     tC   = _mm_loadu_ps(ptrA+8);
373
374     t1   = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
375     t2   = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
376
377     t3   = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
378     t3   = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
379
380     t4   = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
381     t5   = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 - */
382
383     t2   = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(1,1,0,0));  /*  x4 x4 z3 z3 */
384     t5   = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(3,2,2,0)); /* z4 y4 x4 z3 */
385
386     tA   = _mm_add_ps(tA,t3);
387     tB   = _mm_add_ps(tB,t4);
388     tC   = _mm_add_ps(tC,t5);
389
390     _mm_storeu_ps(ptrA,tA);
391     _mm_storeu_ps(ptrA+4,tB);
392     _mm_storeu_ps(ptrA+8,tC);
393
394 }
395
396
397
398 static void
399 gmx_mm_increment_1rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC,float *ptrD,
400                                        __m128 x1, __m128 y1, __m128 z1) 
401 {
402     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
403     t5          = _mm_unpacklo_ps(y1,z1);
404     t6          = _mm_unpackhi_ps(y1,z1);
405     t7          = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
406     t8          = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
407     t9          = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
408     t10         = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
409     t1          = _mm_load_ss(ptrA);
410     t1          = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
411     t1          = _mm_add_ps(t1,t7);
412     _mm_store_ss(ptrA,t1);
413     _mm_storeh_pi((__m64 *)(ptrA+1),t1);
414     t2          = _mm_load_ss(ptrB);
415     t2          = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
416     t2          = _mm_add_ps(t2,t8);
417     _mm_store_ss(ptrB,t2);
418     _mm_storeh_pi((__m64 *)(ptrB+1),t2);
419     t3          = _mm_load_ss(ptrC);
420     t3          = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
421     t3          = _mm_add_ps(t3,t9);
422     _mm_store_ss(ptrC,t3);
423     _mm_storeh_pi((__m64 *)(ptrC+1),t3);
424     t4          = _mm_load_ss(ptrD);
425     t4          = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
426     t4          = _mm_add_ps(t4,t10);
427     _mm_store_ss(ptrD,t4);
428     _mm_storeh_pi((__m64 *)(ptrD+1),t4);
429 }
430
431
432
433
434 static void
435 gmx_mm_increment_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
436                                        __m128 x1, __m128 y1, __m128 z1,
437                                        __m128 x2, __m128 y2, __m128 z2,
438                                        __m128 x3, __m128 y3, __m128 z3) 
439 {
440     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
441     __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
442     __m128 t20,t21,t22,t23,t24,t25;
443
444     t13         = _mm_unpackhi_ps(x1,y1);
445     x1          = _mm_unpacklo_ps(x1,y1);
446     t14         = _mm_unpackhi_ps(z1,x2);
447     z1          = _mm_unpacklo_ps(z1,x2);
448     t15         = _mm_unpackhi_ps(y2,z2);
449     y2          = _mm_unpacklo_ps(y2,z2);
450     t16         = _mm_unpackhi_ps(x3,y3);
451     x3          = _mm_unpacklo_ps(x3,y3);
452     t17         = _mm_shuffle_ps(z3,z3,_MM_SHUFFLE(0,0,0,1));
453     t18         = _mm_movehl_ps(z3,z3);
454     t19         = _mm_shuffle_ps(t18,t18,_MM_SHUFFLE(0,0,0,1));
455     t20         = _mm_movelh_ps(x1,z1);
456     t21         = _mm_movehl_ps(z1,x1);
457     t22         = _mm_movelh_ps(t13,t14);
458     t14         = _mm_movehl_ps(t14,t13);
459     t23         = _mm_movelh_ps(y2,x3);
460     t24         = _mm_movehl_ps(x3,y2);
461     t25         = _mm_movelh_ps(t15,t16);
462     t16         = _mm_movehl_ps(t16,t15);
463     t1          = _mm_loadu_ps(ptrA);
464     t2          = _mm_loadu_ps(ptrA+4);
465     t3          = _mm_load_ss(ptrA+8);
466     t1          = _mm_add_ps(t1,t20);
467     t2          = _mm_add_ps(t2,t23);
468     t3          = _mm_add_ss(t3,z3);
469     _mm_storeu_ps(ptrA,t1);
470     _mm_storeu_ps(ptrA+4,t2);
471     _mm_store_ss(ptrA+8,t3);
472     t4          = _mm_loadu_ps(ptrB);
473     t5          = _mm_loadu_ps(ptrB+4);
474     t6          = _mm_load_ss(ptrB+8);
475     t4          = _mm_add_ps(t4,t21);
476     t5          = _mm_add_ps(t5,t24);
477     t6          = _mm_add_ss(t6,t17);
478     _mm_storeu_ps(ptrB,t4);
479     _mm_storeu_ps(ptrB+4,t5);
480     _mm_store_ss(ptrB+8,t6);
481     t7          = _mm_loadu_ps(ptrC);
482     t8          = _mm_loadu_ps(ptrC+4);
483     t9          = _mm_load_ss(ptrC+8);
484     t7          = _mm_add_ps(t7,t22);
485     t8          = _mm_add_ps(t8,t25);
486     t9          = _mm_add_ss(t9,t18);
487     _mm_storeu_ps(ptrC,t7);
488     _mm_storeu_ps(ptrC+4,t8);
489     _mm_store_ss(ptrC+8,t9);
490     t10         = _mm_loadu_ps(ptrD);
491     t11         = _mm_loadu_ps(ptrD+4);
492     t12         = _mm_load_ss(ptrD+8);
493     t10         = _mm_add_ps(t10,t14);
494     t11         = _mm_add_ps(t11,t16);
495     t12         = _mm_add_ss(t12,t19);
496     _mm_storeu_ps(ptrD,t10);
497     _mm_storeu_ps(ptrD+4,t11);
498     _mm_store_ss(ptrD+8,t12);
499 }
500
501
502 static void
503 gmx_mm_increment_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
504                                        __m128 x1, __m128 y1, __m128 z1,
505                                        __m128 x2, __m128 y2, __m128 z2,
506                                        __m128 x3, __m128 y3, __m128 z3,
507                                        __m128 x4, __m128 y4, __m128 z4) 
508 {
509     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
510     __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
511     __m128 t23,t24;
512     t13         = _mm_unpackhi_ps(x1,y1);
513     x1          = _mm_unpacklo_ps(x1,y1);
514     t14         = _mm_unpackhi_ps(z1,x2);
515     z1          = _mm_unpacklo_ps(z1,x2);
516     t15         = _mm_unpackhi_ps(y2,z2);
517     y2          = _mm_unpacklo_ps(y2,z2);
518     t16         = _mm_unpackhi_ps(x3,y3);
519     x3          = _mm_unpacklo_ps(x3,y3);
520     t17         = _mm_unpackhi_ps(z3,x4);
521     z3          = _mm_unpacklo_ps(z3,x4);
522     t18         = _mm_unpackhi_ps(y4,z4);
523     y4          = _mm_unpacklo_ps(y4,z4);
524     t19         = _mm_movelh_ps(x1,z1);
525     z1          = _mm_movehl_ps(z1,x1);
526     t20         = _mm_movelh_ps(t13,t14);
527     t14         = _mm_movehl_ps(t14,t13);
528     t21         = _mm_movelh_ps(y2,x3);
529     x3          = _mm_movehl_ps(x3,y2);
530     t22         = _mm_movelh_ps(t15,t16);
531     t16         = _mm_movehl_ps(t16,t15);
532     t23         = _mm_movelh_ps(z3,y4);
533     y4          = _mm_movehl_ps(y4,z3);
534     t24         = _mm_movelh_ps(t17,t18);
535     t18         = _mm_movehl_ps(t18,t17);
536     t1          = _mm_loadu_ps(ptrA);
537     t2          = _mm_loadu_ps(ptrA+4);
538     t3          = _mm_loadu_ps(ptrA+8);
539     t1          = _mm_add_ps(t1,t19);
540     t2          = _mm_add_ps(t2,t21);
541     t3          = _mm_add_ps(t3,t23);
542     _mm_storeu_ps(ptrA,t1);
543     _mm_storeu_ps(ptrA+4,t2);
544     _mm_storeu_ps(ptrA+8,t3);
545     t4          = _mm_loadu_ps(ptrB);
546     t5          = _mm_loadu_ps(ptrB+4);
547     t6          = _mm_loadu_ps(ptrB+8);
548     t4          = _mm_add_ps(t4,z1);
549     t5          = _mm_add_ps(t5,x3);
550     t6          = _mm_add_ps(t6,y4);
551     _mm_storeu_ps(ptrB,t4);
552     _mm_storeu_ps(ptrB+4,t5);
553     _mm_storeu_ps(ptrB+8,t6);
554     t7          = _mm_loadu_ps(ptrC);
555     t8          = _mm_loadu_ps(ptrC+4);
556     t9          = _mm_loadu_ps(ptrC+8);
557     t7          = _mm_add_ps(t7,t20);
558     t8          = _mm_add_ps(t8,t22);
559     t9          = _mm_add_ps(t9,t24);
560     _mm_storeu_ps(ptrC,t7);
561     _mm_storeu_ps(ptrC+4,t8);
562     _mm_storeu_ps(ptrC+8,t9);
563     t10         = _mm_loadu_ps(ptrD);
564     t11         = _mm_loadu_ps(ptrD+4);
565     t12         = _mm_loadu_ps(ptrD+8);
566     t10         = _mm_add_ps(t10,t14);
567     t11         = _mm_add_ps(t11,t16);
568     t12         = _mm_add_ps(t12,t18);
569     _mm_storeu_ps(ptrD,t10);
570     _mm_storeu_ps(ptrD+4,t11);
571     _mm_storeu_ps(ptrD+8,t12);
572 }
573
574
575 /* Routines to decrement rvecs in memory, typically used for j particle force updates */
576 static void
577 gmx_mm_decrement_1rvec_1ptr_noswizzle_ps(float *ptrA, __m128 xyz)
578 {
579     __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
580     __m128 t1;
581
582     t1  = _mm_loadu_ps(ptrA);
583     xyz = _mm_and_ps(mask,xyz);
584     t1  = _mm_sub_ps(t1,xyz);
585     _mm_storeu_ps(ptrA,t1);
586 }
587
588
589 static void
590 gmx_mm_decrement_3rvec_1ptr_noswizzle_ps(float *ptrA, 
591                                          __m128 xyz1, __m128 xyz2, __m128 xyz3)
592 {
593     __m128 t1,t2,t3,t4;
594     __m128 tA,tB,tC;
595
596     tA   = _mm_loadu_ps(ptrA);
597     tB   = _mm_loadu_ps(ptrA+4);
598     tC   = _mm_load_ss(ptrA+8);
599
600     t1   = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
601     t2   = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
602
603     t3   = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
604     t3   = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
605
606     t4   = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
607
608     tA   = _mm_sub_ps(tA,t3);
609     tB   = _mm_sub_ps(tB,t4);
610     tC   = _mm_sub_ss(tC,t2);
611
612     _mm_storeu_ps(ptrA,tA);
613     _mm_storeu_ps(ptrA+4,tB);
614     _mm_store_ss(ptrA+8,tC);
615 }
616
617 static void
618 gmx_mm_decrement_4rvec_1ptr_noswizzle_ps(float *ptrA, 
619                                          __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
620 {
621     __m128 t1,t2,t3,t4,t5;
622     __m128 tA,tB,tC;
623
624     tA   = _mm_loadu_ps(ptrA);
625     tB   = _mm_loadu_ps(ptrA+4);
626     tC   = _mm_loadu_ps(ptrA+8);
627
628     t1   = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
629     t2   = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
630
631     t3   = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
632     t3   = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
633
634     t4   = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
635
636     t5   = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 - */
637     t2   = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(1,1,0,0));  /*  x4 x4 z3 z3 */
638     t5   = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(3,2,2,0)); /* z4 y4 x4 z3 */
639
640     tA   = _mm_sub_ps(tA,t3);
641     tB   = _mm_sub_ps(tB,t4);
642     tC   = _mm_sub_ps(tC,t5);
643
644     _mm_storeu_ps(ptrA,tA);
645     _mm_storeu_ps(ptrA+4,tB);
646     _mm_storeu_ps(ptrA+8,tC);
647
648 }
649
650
651 static void
652 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
653                                        float * gmx_restrict ptrB,
654                                        float * gmx_restrict ptrC,
655                                        float * gmx_restrict ptrD,
656                                        __m128 x1, __m128 y1, __m128 z1)
657 {
658     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
659     t5          = _mm_unpacklo_ps(y1,z1);
660     t6          = _mm_unpackhi_ps(y1,z1);
661     t7          = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
662     t8          = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
663     t9          = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
664     t10         = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
665     t1          = _mm_load_ss(ptrA);
666     t1          = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
667     t1          = _mm_sub_ps(t1,t7);
668     _mm_store_ss(ptrA,t1);
669     _mm_storeh_pi((__m64 *)(ptrA+1),t1);
670     t2          = _mm_load_ss(ptrB);
671     t2          = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
672     t2          = _mm_sub_ps(t2,t8);
673     _mm_store_ss(ptrB,t2);
674     _mm_storeh_pi((__m64 *)(ptrB+1),t2);
675     t3          = _mm_load_ss(ptrC);
676     t3          = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
677     t3          = _mm_sub_ps(t3,t9);
678     _mm_store_ss(ptrC,t3);
679     _mm_storeh_pi((__m64 *)(ptrC+1),t3);
680     t4          = _mm_load_ss(ptrD);
681     t4          = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
682     t4          = _mm_sub_ps(t4,t10);
683     _mm_store_ss(ptrD,t4);
684     _mm_storeh_pi((__m64 *)(ptrD+1),t4);
685 }
686
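/* Illustrative usage sketch (not compiled): subtracting the accumulated forces
 * on four j atoms, one atom per SIMD lane. f[] and jnrA..jnrD are hypothetical;
 * forces are assumed to be stored as 3 consecutive floats per atom.
 */
#if 0
static void
example_subtract_j_forces(float *f, int jnrA, int jnrB, int jnrC, int jnrD,
                          __m128 fjx1, __m128 fjy1, __m128 fjz1)
{
    gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f + 3*jnrA, f + 3*jnrB,
                                           f + 3*jnrC, f + 3*jnrD,
                                           fjx1, fjy1, fjz1);
}
#endif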
687
688
689 static void
690 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
691                                        __m128 x1, __m128 y1, __m128 z1,
692                                        __m128 x2, __m128 y2, __m128 z2,
693                                        __m128 x3, __m128 y3, __m128 z3) 
694 {
695     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
696     __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
697     __m128 t20,t21,t22,t23,t24,t25;
698
699     t13         = _mm_unpackhi_ps(x1,y1);
700     x1          = _mm_unpacklo_ps(x1,y1);
701     t14         = _mm_unpackhi_ps(z1,x2);
702     z1          = _mm_unpacklo_ps(z1,x2);
703     t15         = _mm_unpackhi_ps(y2,z2);
704     y2          = _mm_unpacklo_ps(y2,z2);
705     t16         = _mm_unpackhi_ps(x3,y3);
706     x3          = _mm_unpacklo_ps(x3,y3);
707     t17         = _mm_shuffle_ps(z3,z3,_MM_SHUFFLE(0,0,0,1));
708     t18         = _mm_movehl_ps(z3,z3);
709     t19         = _mm_shuffle_ps(t18,t18,_MM_SHUFFLE(0,0,0,1));
710     t20         = _mm_movelh_ps(x1,z1);
711     t21         = _mm_movehl_ps(z1,x1);
712     t22         = _mm_movelh_ps(t13,t14);
713     t14         = _mm_movehl_ps(t14,t13);
714     t23         = _mm_movelh_ps(y2,x3);
715     t24         = _mm_movehl_ps(x3,y2);
716     t25         = _mm_movelh_ps(t15,t16);
717     t16         = _mm_movehl_ps(t16,t15);
718     t1          = _mm_loadu_ps(ptrA);
719     t2          = _mm_loadu_ps(ptrA+4);
720     t3          = _mm_load_ss(ptrA+8);
721     t1          = _mm_sub_ps(t1,t20);
722     t2          = _mm_sub_ps(t2,t23);
723     t3          = _mm_sub_ss(t3,z3);
724     _mm_storeu_ps(ptrA,t1);
725     _mm_storeu_ps(ptrA+4,t2);
726     _mm_store_ss(ptrA+8,t3);
727     t4          = _mm_loadu_ps(ptrB);
728     t5          = _mm_loadu_ps(ptrB+4);
729     t6          = _mm_load_ss(ptrB+8);
730     t4          = _mm_sub_ps(t4,t21);
731     t5          = _mm_sub_ps(t5,t24);
732     t6          = _mm_sub_ss(t6,t17);
733     _mm_storeu_ps(ptrB,t4);
734     _mm_storeu_ps(ptrB+4,t5);
735     _mm_store_ss(ptrB+8,t6);
736     t7          = _mm_loadu_ps(ptrC);
737     t8          = _mm_loadu_ps(ptrC+4);
738     t9          = _mm_load_ss(ptrC+8);
739     t7          = _mm_sub_ps(t7,t22);
740     t8          = _mm_sub_ps(t8,t25);
741     t9          = _mm_sub_ss(t9,t18);
742     _mm_storeu_ps(ptrC,t7);
743     _mm_storeu_ps(ptrC+4,t8);
744     _mm_store_ss(ptrC+8,t9);
745     t10         = _mm_loadu_ps(ptrD);
746     t11         = _mm_loadu_ps(ptrD+4);
747     t12         = _mm_load_ss(ptrD+8);
748     t10         = _mm_sub_ps(t10,t14);
749     t11         = _mm_sub_ps(t11,t16);
750     t12         = _mm_sub_ss(t12,t19);
751     _mm_storeu_ps(ptrD,t10);
752     _mm_storeu_ps(ptrD+4,t11);
753     _mm_store_ss(ptrD+8,t12);
754 }
755
756
757 static void
758 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
759                                        __m128 x1, __m128 y1, __m128 z1,
760                                        __m128 x2, __m128 y2, __m128 z2,
761                                        __m128 x3, __m128 y3, __m128 z3,
762                                        __m128 x4, __m128 y4, __m128 z4) 
763 {
764     __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
765     __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
766     __m128 t23,t24;
767     t13         = _mm_unpackhi_ps(x1,y1);
768     x1          = _mm_unpacklo_ps(x1,y1);
769     t14         = _mm_unpackhi_ps(z1,x2);
770     z1          = _mm_unpacklo_ps(z1,x2);
771     t15         = _mm_unpackhi_ps(y2,z2);
772     y2          = _mm_unpacklo_ps(y2,z2);
773     t16         = _mm_unpackhi_ps(x3,y3);
774     x3          = _mm_unpacklo_ps(x3,y3);
775     t17         = _mm_unpackhi_ps(z3,x4);
776     z3          = _mm_unpacklo_ps(z3,x4);
777     t18         = _mm_unpackhi_ps(y4,z4);
778     y4          = _mm_unpacklo_ps(y4,z4);
779     t19         = _mm_movelh_ps(x1,z1);
780     z1          = _mm_movehl_ps(z1,x1);
781     t20         = _mm_movelh_ps(t13,t14);
782     t14         = _mm_movehl_ps(t14,t13);
783     t21         = _mm_movelh_ps(y2,x3);
784     x3          = _mm_movehl_ps(x3,y2);
785     t22         = _mm_movelh_ps(t15,t16);
786     t16         = _mm_movehl_ps(t16,t15);
787     t23         = _mm_movelh_ps(z3,y4);
788     y4          = _mm_movehl_ps(y4,z3);
789     t24         = _mm_movelh_ps(t17,t18);
790     t18         = _mm_movehl_ps(t18,t17);
791     t1          = _mm_loadu_ps(ptrA);
792     t2          = _mm_loadu_ps(ptrA+4);
793     t3          = _mm_loadu_ps(ptrA+8);
794     t1          = _mm_sub_ps(t1,t19);
795     t2          = _mm_sub_ps(t2,t21);
796     t3          = _mm_sub_ps(t3,t23);
797     _mm_storeu_ps(ptrA,t1);
798     _mm_storeu_ps(ptrA+4,t2);
799     _mm_storeu_ps(ptrA+8,t3);
800     t4          = _mm_loadu_ps(ptrB);
801     t5          = _mm_loadu_ps(ptrB+4);
802     t6          = _mm_loadu_ps(ptrB+8);
803     t4          = _mm_sub_ps(t4,z1);
804     t5          = _mm_sub_ps(t5,x3);
805     t6          = _mm_sub_ps(t6,y4);
806     _mm_storeu_ps(ptrB,t4);
807     _mm_storeu_ps(ptrB+4,t5);
808     _mm_storeu_ps(ptrB+8,t6);
809     t7          = _mm_loadu_ps(ptrC);
810     t8          = _mm_loadu_ps(ptrC+4);
811     t9          = _mm_loadu_ps(ptrC+8);
812     t7          = _mm_sub_ps(t7,t20);
813     t8          = _mm_sub_ps(t8,t22);
814     t9          = _mm_sub_ps(t9,t24);
815     _mm_storeu_ps(ptrC,t7);
816     _mm_storeu_ps(ptrC+4,t8);
817     _mm_storeu_ps(ptrC+8,t9);
818     t10         = _mm_loadu_ps(ptrD);
819     t11         = _mm_loadu_ps(ptrD+4);
820     t12         = _mm_loadu_ps(ptrD+8);
821     t10         = _mm_sub_ps(t10,t14);
822     t11         = _mm_sub_ps(t11,t16);
823     t12         = _mm_sub_ps(t12,t18);
824     _mm_storeu_ps(ptrD,t10);
825     _mm_storeu_ps(ptrD+4,t11);
826     _mm_storeu_ps(ptrD+8,t12);
827 }
828
829
830
831 static gmx_inline void
832 gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
833                                       float *fptr,
834                                       float *fshiftptr)
835 {
836     __m128 t1,t2,t3;
837
838     /* transpose data */
839     t1 = fix1;
840     _MM_TRANSPOSE4_PS(fix1,t1,fiy1,fiz1);
841     fix1 = _mm_add_ps(_mm_add_ps(fix1,t1), _mm_add_ps(fiy1,fiz1));
842
843     t2 = _mm_load_ss(fptr);
844     t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
845     t3 = _mm_load_ss(fshiftptr);
846     t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
847
848     t2 = _mm_add_ps(t2,fix1);
849     t3 = _mm_add_ps(t3,fix1);
850
851     _mm_store_ss(fptr,t2);
852     _mm_storeh_pi((__m64 *)(fptr+1),t2);
853     _mm_store_ss(fshiftptr,t3);
854     _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
855 }
856
857 static gmx_inline void
858 gmx_mm_update_iforce_2atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
859                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
860                                       float *fptr,
861                                       float *fshiftptr)
862 {
863     __m128 t1,t2,t4;
864
865     /* transpose data */
866     _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
867     t1 = _mm_unpacklo_ps(fiy2,fiz2);
868     t2 = _mm_unpackhi_ps(fiy2,fiz2);
869
870     fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
871     t1   = _mm_add_ps(t1,t2);
872     t2   = _mm_movehl_ps(t2,t1);
873     fiy2 = _mm_add_ps(t1,t2);
874
875     _mm_storeu_ps(fptr,   _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
876     t1 = _mm_loadl_pi(t1,(__m64 *)(fptr+4));
877     _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));
878
879     t4 = _mm_load_ss(fshiftptr+2);
880     t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
881
882     t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2));   /* fiy2  -   fix2 fiz1 */
883     t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));       /* fiy2 fix2  -   fiz1 */
884     t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1));   /* fiy1 fix1  -   fiz2 */
885
886     t1 = _mm_add_ps(t1,t2);
887     t1 = _mm_add_ps(t1,t4); /* y x - z */
888
889     _mm_store_ss(fshiftptr+2,t1);
890     _mm_storeh_pi((__m64 *)(fshiftptr),t1);
891 }
892
893
894
895 static gmx_inline void
896 gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
897                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
898                                       __m128 fix3, __m128 fiy3, __m128 fiz3,
899                                       float *fptr,
900                                       float *fshiftptr)
901 {
902     __m128 t1,t2,t3,t4;
903
904     /* transpose data */
905     _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
906     _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
907     t2   = _mm_movehl_ps(_mm_setzero_ps(),fiz3);
908     t1   = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));
909     t3   = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,1));
910
911     fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
912     fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
913     fiz3 = _mm_add_ss(_mm_add_ps(fiz3,t1)  , _mm_add_ps(t2,t3));
914
915     _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
916     _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
917     _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
918
919     t4 = _mm_load_ss(fshiftptr+2);
920     t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
921
922     t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
923     t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
924     t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
925     t3 = _mm_shuffle_ps(t3  ,t3  ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2  -   fiz2 */
926
927     t1 = _mm_add_ps(t1,t2);
928     t3 = _mm_add_ps(t3,t4);
929     t1 = _mm_add_ps(t1,t3); /* y x - z */
930
931     _mm_store_ss(fshiftptr+2,t1);
932     _mm_storeh_pi((__m64 *)(fshiftptr),t1);
933 }
934
935
936 static gmx_inline void
937 gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
938                                       __m128 fix2, __m128 fiy2, __m128 fiz2,
939                                       __m128 fix3, __m128 fiy3, __m128 fiz3,
940                                       __m128 fix4, __m128 fiy4, __m128 fiz4,
941                                       float *fptr,
942                                       float *fshiftptr)
943 {
944     __m128 t1,t2,t3,t4,t5;
945
946     /* transpose data */
947     _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
948     _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
949     _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);
950
951     fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
952     fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
953     fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));
954
955     _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
956     _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
957     _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
958
959     t5 = _mm_load_ss(fshiftptr+2);
960     t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
961
962     t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
963     t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
964     t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
965     t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
966     t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));
967
968     t1 = _mm_add_ps(t1,t2);
969     t3 = _mm_add_ps(t3,t4);
970     t1 = _mm_add_ps(t1,t3);
971     t5 = _mm_add_ps(t5,t1);
972
973     _mm_store_ss(fshiftptr+2,t5);
974     _mm_storeh_pi((__m64 *)(fshiftptr),t5);
975 }
976
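/* Illustrative usage sketch (not compiled): reducing the per-lane i-atom force
 * accumulators of a 3-atom water at the end of an inner loop and adding the
 * result to the force and shift-force arrays. fptr and fshiftptr would point
 * at the first force component of the i water and of its shift vector; the
 * names used here are hypothetical.
 */
#if 0
static void
example_reduce_i_forces(__m128 fix1, __m128 fiy1, __m128 fiz1,
                        __m128 fix2, __m128 fiy2, __m128 fiz2,
                        __m128 fix3, __m128 fiy3, __m128 fiz3,
                        float *fptr, float *fshiftptr)
{
    gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1,
                                          fix2, fiy2, fiz2,
                                          fix3, fiy3, fiz3,
                                          fptr, fshiftptr);
}
#endif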
977
978
979 static void
980 gmx_mm_update_1pot_ps(__m128 pot1, float *ptrA)
981 {
982     pot1 = _mm_add_ps(pot1,_mm_movehl_ps(_mm_setzero_ps(),pot1));
983     pot1 = _mm_add_ps(pot1,_mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)));
984     _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
985 }
986
987 static void
988 gmx_mm_update_2pot_ps(__m128 pot1, float *ptrA,
989                       __m128 pot2, float *ptrB)
990 {
991     __m128 t1,t2;
992     t1   = _mm_movehl_ps(pot2,pot1);
993     t2   = _mm_movelh_ps(pot1,pot2);
994     t1   = _mm_add_ps(t1,t2);
995     t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
996     pot1 = _mm_add_ps(t1,t2);
997     pot2 = _mm_movehl_ps(t2,pot1);
998     _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
999     _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
1000 }
1001
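/* Illustrative usage sketch (not compiled): reducing the Coulomb and VdW
 * energy accumulators of an inner loop and adding the totals to the
 * energy-group buffers. velecsum, vvdwsum, Vc, Vvdw and ggid are
 * hypothetical names, not part of this file.
 */
#if 0
static void
example_update_energies(__m128 velecsum, __m128 vvdwsum,
                        float *Vc, float *Vvdw, int ggid)
{
    gmx_mm_update_2pot_ps(velecsum, Vc + ggid, vvdwsum, Vvdw + ggid);
}
#endif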
1002
1003 static void
1004 gmx_mm_update_4pot_ps(__m128 pot1, float *ptrA,
1005                       __m128 pot2, float *ptrB,
1006                       __m128 pot3, float *ptrC,
1007                       __m128 pot4, float *ptrD)
1008 {
1009     _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
1010     pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
1011     pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
1012     pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
1013     pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
1014     _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
1015     _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
1016     _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
1017     _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
1018 }
1019
1020
1021 #endif /* _kernelutil_x86_sse2_single_h_ */