/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_sse2_single_h_
#define _kernelutil_x86_sse2_single_h_

/* We require SSE2 now! */

#include <math.h>
#include <emmintrin.h> /* SSE2 intrinsics (__m128, _mm_*); gmx_inline and gmx_restrict are defined elsewhere in GROMACS */

#define gmx_mm_castsi128_ps   _mm_castsi128_ps

#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))


/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3)  _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))

static gmx_inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy) ), _mm_mul_ps(dz, dz) );
}

static int
gmx_mm_any_lt(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a, b));
}
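
/* Illustrative sketch (not part of the original header): how the two helpers
 * above are typically combined in a kernel inner loop to test four j-atoms
 * against a cutoff. The function name and the arguments (ix..jz, rcutoff2)
 * are hypothetical.
 */
static gmx_inline int
gmx_mm_example_any_within_cutoff(__m128 ix, __m128 iy, __m128 iz,
                                 __m128 jx, __m128 jy, __m128 jz,
                                 __m128 rcutoff2)
{
    __m128 dx, dy, dz, rsq;

    dx  = _mm_sub_ps(ix, jx);
    dy  = _mm_sub_ps(iy, jy);
    dz  = _mm_sub_ps(iz, jz);
    rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);

    /* Nonzero if any of the four pair distances is inside the cutoff */
    return gmx_mm_any_lt(rsq, rcutoff2);
}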

/* Load a single value from 1-4 places, merge into xmm register */

static __m128
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    __m128 t1, t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA), _mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB), _mm_load_ss(ptrD));
    return _mm_unpacklo_ps(t1, t2);
}

static void
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD,
                              __m128               xmm1)
{
    __m128 t2, t3, t4;

    t3       = _mm_movehl_ps(_mm_setzero_ps(), xmm1);
    t2       = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(1, 1, 1, 1));
    t4       = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(1, 1, 1, 1));
    _mm_store_ss(ptrA, xmm1);
    _mm_store_ss(ptrB, t2);
    _mm_store_ss(ptrC, t3);
    _mm_store_ss(ptrD, t4);
}

/* Similar to store, but increments value in memory */
static void
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 tmp;

    tmp = gmx_mm_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
    tmp = _mm_add_ps(tmp, xmm1);
    gmx_mm_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, tmp);
}
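
/* Illustrative sketch (not part of the original header): gather one scalar
 * parameter (e.g. a charge) for four j-atoms with the load routine above,
 * use it, and accumulate a per-atom result back to memory with the increment
 * routine. All names (chargeA..D, iq, rinv, vA..vD) are hypothetical, and the
 * electrostatic prefactor is omitted.
 */
static gmx_inline void
gmx_mm_example_accumulate_coulomb_ps(const float * gmx_restrict chargeA,
                                     const float * gmx_restrict chargeB,
                                     const float * gmx_restrict chargeC,
                                     const float * gmx_restrict chargeD,
                                     __m128 iq, __m128 rinv,
                                     float * gmx_restrict vA,
                                     float * gmx_restrict vB,
                                     float * gmx_restrict vC,
                                     float * gmx_restrict vD)
{
    __m128 jq, vcoul;

    jq    = gmx_mm_load_4real_swizzle_ps(chargeA, chargeB, chargeC, chargeD);
    vcoul = _mm_mul_ps(_mm_mul_ps(iq, jq), rinv);  /* qi*qj/r for the four pairs */
    gmx_mm_increment_4real_swizzle_ps(vA, vB, vC, vD, vcoul);
}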


static void
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict      c6,
                             __m128 * gmx_restrict      c12)
{
    __m128 t1, t2, t3, t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p1);   /* - - c12a  c6a */
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p2);   /* - - c12b  c6b */
    t3   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p3);   /* - - c12c  c6c */
    t4   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p4);   /* - - c12d  c6d */
    t1   = _mm_unpacklo_ps(t1, t2);
    t2   = _mm_unpacklo_ps(t3, t4);
    *c6  = _mm_movelh_ps(t1, t2);
    *c12 = _mm_movehl_ps(t2, t1);
}
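
/* Illustrative sketch (not part of the original header): a minimal
 * Lennard-Jones 12-6 energy evaluation for four pairs, using c6/c12 pair
 * parameters as loaded by the routine above. The function name is
 * hypothetical and rinvsq is assumed to hold 1/r^2 for the four pairs.
 */
static gmx_inline __m128
gmx_mm_example_lj_energy_ps(__m128 c6, __m128 c12, __m128 rinvsq)
{
    __m128 rinvsix, vvdw6, vvdw12;

    rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq, rinvsq), rinvsq);   /* 1/r^6    */
    vvdw6   = _mm_mul_ps(c6, rinvsix);                          /* C6/r^6   */
    vvdw12  = _mm_mul_ps(c12, _mm_mul_ps(rinvsix, rinvsix));    /* C12/r^12 */

    return _mm_sub_ps(vvdw12, vvdw6);                           /* C12/r^12 - C6/r^6 */
}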

/* Routines to load 1-4 rvec from 4 places.
 * We mainly use these to load coordinates. The extra routines
 * are very efficient for the water-water loops, since we know that,
 * e.g., a TIP4P water has 4 atoms, so we can load 12 floats and shuffle.
 */


static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict      x1,
                                         __m128 * gmx_restrict      y1,
                                         __m128 * gmx_restrict      z1)
{
    __m128 t1, t2, t3, t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz);
    t3   = _mm_load_ss(xyz_shift+2);
    t4   = _mm_load_ss(xyz+2);
    t1   = _mm_add_ps(t1, t2);
    t3   = _mm_add_ss(t3, t4);

    *x1  = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 0, 0));
}


static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB   = _mm_load_ss(xyz_shift+2);

    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_load_ss(xyz+8);

    tA   = _mm_movelh_ps(tA, tB);
    t4   = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5   = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6   = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1   = _mm_add_ps(t1, t4);
    t2   = _mm_add_ps(t2, t5);
    t3   = _mm_add_ss(t3, t6);

    *x1  = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2  = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2  = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2  = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3  = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3  = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3  = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 0, 0));
}


static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA   = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
    tB   = _mm_load_ss(xyz_shift+2);

    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_loadu_ps(xyz+8);

    tA   = _mm_movelh_ps(tA, tB);
    t4   = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5   = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6   = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1   = _mm_add_ps(t1, t4);
    t2   = _mm_add_ps(t2, t5);
    t3   = _mm_add_ps(t3, t6);

    *x1  = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2  = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2  = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2  = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3  = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3  = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3  = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 0, 0));
    *x4  = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(1, 1, 1, 1));
    *y4  = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2, 2, 2, 2));
    *z4  = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3, 3, 3, 3));
}


static void
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                  const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC,
                                  const float * gmx_restrict ptrD,
                                  __m128 *      gmx_restrict x1,
                                  __m128 *      gmx_restrict y1,
                                  __m128 *      gmx_restrict z1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8;
    t1   = _mm_castpd_ps(_mm_load_sd((const double *)ptrA));
    t2   = _mm_castpd_ps(_mm_load_sd((const double *)ptrB));
    t3   = _mm_castpd_ps(_mm_load_sd((const double *)ptrC));
    t4   = _mm_castpd_ps(_mm_load_sd((const double *)ptrD));
    t5   = _mm_load_ss(ptrA+2);
    t6   = _mm_load_ss(ptrB+2);
    t7   = _mm_load_ss(ptrC+2);
    t8   = _mm_load_ss(ptrD+2);
    t1   = _mm_unpacklo_ps(t1, t2);
    t3   = _mm_unpacklo_ps(t3, t4);
    *x1  = _mm_movelh_ps(t1, t3);
    *y1  = _mm_movehl_ps(t3, t1);
    t5   = _mm_unpacklo_ps(t5, t6);
    t7   = _mm_unpacklo_ps(t7, t8);
    *z1  = _mm_movelh_ps(t5, t7);
}
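
/* Illustrative sketch (not part of the original header): load one rvec from
 * four j-atom pointers with the routine above and form the four squared i-j
 * distances against broadcast i coordinates. The function name and the
 * arguments are hypothetical.
 */
static gmx_inline __m128
gmx_mm_example_rsq_4j_ps(const float * gmx_restrict ptrA,
                         const float * gmx_restrict ptrB,
                         const float * gmx_restrict ptrC,
                         const float * gmx_restrict ptrD,
                         __m128 ix, __m128 iy, __m128 iz)
{
    __m128 jx, jy, jz;

    gmx_mm_load_1rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, &jx, &jy, &jz);

    return gmx_mm_calc_rsq_ps(_mm_sub_ps(ix, jx),
                              _mm_sub_ps(iy, jy),
                              _mm_sub_ps(iz, jz));
}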


static void
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                  const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC,
                                  const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 t1, t2, t3, t4;
    t1            = _mm_loadu_ps(ptrA);
    t2            = _mm_loadu_ps(ptrB);
    t3            = _mm_loadu_ps(ptrC);
    t4            = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1           = t1;
    *y1           = t2;
    *z1           = t3;
    *x2           = t4;
    t1            = _mm_loadu_ps(ptrA+4);
    t2            = _mm_loadu_ps(ptrB+4);
    t3            = _mm_loadu_ps(ptrC+4);
    t4            = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2           = t1;
    *z2           = t2;
    *x3           = t3;
    *y3           = t4;
    t1            = _mm_load_ss(ptrA+8);
    t2            = _mm_load_ss(ptrB+8);
    t3            = _mm_load_ss(ptrC+8);
    t4            = _mm_load_ss(ptrD+8);
    t1            = _mm_unpacklo_ps(t1, t3);
    t3            = _mm_unpacklo_ps(t2, t4);
    *z3           = _mm_unpacklo_ps(t1, t3);
}


static void
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                  const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC,
                                  const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 t1, t2, t3, t4;
    t1            = _mm_loadu_ps(ptrA);
    t2            = _mm_loadu_ps(ptrB);
    t3            = _mm_loadu_ps(ptrC);
    t4            = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1           = t1;
    *y1           = t2;
    *z1           = t3;
    *x2           = t4;
    t1            = _mm_loadu_ps(ptrA+4);
    t2            = _mm_loadu_ps(ptrB+4);
    t3            = _mm_loadu_ps(ptrC+4);
    t4            = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2           = t1;
    *z2           = t2;
    *x3           = t3;
    *y3           = t4;
    t1            = _mm_loadu_ps(ptrA+8);
    t2            = _mm_loadu_ps(ptrB+8);
    t3            = _mm_loadu_ps(ptrC+8);
    t4            = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *z3           = t1;
    *x4           = t2;
    *y4           = t3;
    *z4           = t4;
}


static void
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
                                       float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC,
                                       float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    t5          = _mm_unpacklo_ps(y1, z1);
    t6          = _mm_unpackhi_ps(y1, z1);
    t7          = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(1, 0, 0, 0));
    t8          = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(3, 2, 0, 1));
    t9          = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(1, 0, 0, 2));
    t10         = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(3, 2, 0, 3));
    t1          = _mm_load_ss(ptrA);
    t1          = _mm_loadh_pi(t1, (__m64 *)(ptrA+1));
    t1          = _mm_sub_ps(t1, t7);
    _mm_store_ss(ptrA, t1);
    _mm_storeh_pi((__m64 *)(ptrA+1), t1);
    t2          = _mm_load_ss(ptrB);
    t2          = _mm_loadh_pi(t2, (__m64 *)(ptrB+1));
    t2          = _mm_sub_ps(t2, t8);
    _mm_store_ss(ptrB, t2);
    _mm_storeh_pi((__m64 *)(ptrB+1), t2);
    t3          = _mm_load_ss(ptrC);
    t3          = _mm_loadh_pi(t3, (__m64 *)(ptrC+1));
    t3          = _mm_sub_ps(t3, t9);
    _mm_store_ss(ptrC, t3);
    _mm_storeh_pi((__m64 *)(ptrC+1), t3);
    t4          = _mm_load_ss(ptrD);
    t4          = _mm_loadh_pi(t4, (__m64 *)(ptrD+1));
    t4          = _mm_sub_ps(t4, t10);
    _mm_store_ss(ptrD, t4);
    _mm_storeh_pi((__m64 *)(ptrD+1), t4);
}
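
/* Illustrative sketch (not part of the original header): spread one scalar
 * force over the distance vector for four j-atoms, accumulate it on the
 * i-atom registers (to be reduced later with gmx_mm_update_iforce_1atom_swizzle_ps,
 * defined further below), and subtract it from the four j atoms with the
 * decrement routine above. All names (fscal, fix1, fjptrA, ...) are hypothetical.
 */
static gmx_inline void
gmx_mm_example_spread_force_ps(__m128 fscal, __m128 dx, __m128 dy, __m128 dz,
                               __m128 * gmx_restrict fix1,
                               __m128 * gmx_restrict fiy1,
                               __m128 * gmx_restrict fiz1,
                               float * gmx_restrict fjptrA,
                               float * gmx_restrict fjptrB,
                               float * gmx_restrict fjptrC,
                               float * gmx_restrict fjptrD)
{
    __m128 tx, ty, tz;

    tx    = _mm_mul_ps(fscal, dx);
    ty    = _mm_mul_ps(fscal, dy);
    tz    = _mm_mul_ps(fscal, dz);

    /* Accumulate on the i-atom registers ... */
    *fix1 = _mm_add_ps(*fix1, tx);
    *fiy1 = _mm_add_ps(*fiy1, ty);
    *fiz1 = _mm_add_ps(*fiz1, tz);

    /* ... and subtract the same contributions from the four j atoms */
    gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA, fjptrB, fjptrC, fjptrD, tx, ty, tz);
}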



#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
        __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
        __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
        _t13         = _mm_unpackhi_ps(_x1, _y1); \
        _x1          = _mm_unpacklo_ps(_x1, _y1); \
        _t14         = _mm_unpackhi_ps(_z1, _x2); \
        _z1          = _mm_unpacklo_ps(_z1, _x2); \
        _t15         = _mm_unpackhi_ps(_y2, _z2); \
        _y2          = _mm_unpacklo_ps(_y2, _z2); \
        _t16         = _mm_unpackhi_ps(_x3, _y3); \
        _x3          = _mm_unpacklo_ps(_x3, _y3); \
        _t17         = _mm_shuffle_ps(_z3, _z3, _MM_SHUFFLE(0, 0, 0, 1)); \
        _t18         = _mm_movehl_ps(_z3, _z3); \
        _t19         = _mm_shuffle_ps(_t18, _t18, _MM_SHUFFLE(0, 0, 0, 1)); \
        _t20         = _mm_movelh_ps(_x1, _z1); \
        _t21         = _mm_movehl_ps(_z1, _x1); \
        _t22         = _mm_movelh_ps(_t13, _t14); \
        _t14         = _mm_movehl_ps(_t14, _t13); \
        _t23         = _mm_movelh_ps(_y2, _x3); \
        _t24         = _mm_movehl_ps(_x3, _y2); \
        _t25         = _mm_movelh_ps(_t15, _t16); \
        _t16         = _mm_movehl_ps(_t16, _t15); \
        _t1          = _mm_loadu_ps(ptrA); \
        _t2          = _mm_loadu_ps(ptrA+4); \
        _t3          = _mm_load_ss(ptrA+8); \
        _t1          = _mm_sub_ps(_t1, _t20); \
        _t2          = _mm_sub_ps(_t2, _t23); \
        _t3          = _mm_sub_ss(_t3, _z3); \
        _mm_storeu_ps(ptrA, _t1); \
        _mm_storeu_ps(ptrA+4, _t2); \
        _mm_store_ss(ptrA+8, _t3); \
        _t4          = _mm_loadu_ps(ptrB); \
        _t5          = _mm_loadu_ps(ptrB+4); \
        _t6          = _mm_load_ss(ptrB+8); \
        _t4          = _mm_sub_ps(_t4, _t21); \
        _t5          = _mm_sub_ps(_t5, _t24); \
        _t6          = _mm_sub_ss(_t6, _t17); \
        _mm_storeu_ps(ptrB, _t4); \
        _mm_storeu_ps(ptrB+4, _t5); \
        _mm_store_ss(ptrB+8, _t6); \
        _t7          = _mm_loadu_ps(ptrC); \
        _t8          = _mm_loadu_ps(ptrC+4); \
        _t9          = _mm_load_ss(ptrC+8); \
        _t7          = _mm_sub_ps(_t7, _t22); \
        _t8          = _mm_sub_ps(_t8, _t25); \
        _t9          = _mm_sub_ss(_t9, _t18); \
        _mm_storeu_ps(ptrC, _t7); \
        _mm_storeu_ps(ptrC+4, _t8); \
        _mm_store_ss(ptrC+8, _t9); \
        _t10         = _mm_loadu_ps(ptrD); \
        _t11         = _mm_loadu_ps(ptrD+4); \
        _t12         = _mm_load_ss(ptrD+8); \
        _t10         = _mm_sub_ps(_t10, _t14); \
        _t11         = _mm_sub_ps(_t11, _t16); \
        _t12         = _mm_sub_ss(_t12, _t19); \
        _mm_storeu_ps(ptrD, _t10); \
        _mm_storeu_ps(ptrD+4, _t11); \
        _mm_store_ss(ptrD+8, _t12); \
    }
#else
/* Real function for sane compilers */
static void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128 t11, t12, t13, t14, t15, t16, t17, t18, t19;
    __m128 t20, t21, t22, t23, t24, t25;

    t13         = _mm_unpackhi_ps(x1, y1);
    x1          = _mm_unpacklo_ps(x1, y1);
    t14         = _mm_unpackhi_ps(z1, x2);
    z1          = _mm_unpacklo_ps(z1, x2);
    t15         = _mm_unpackhi_ps(y2, z2);
    y2          = _mm_unpacklo_ps(y2, z2);
    t16         = _mm_unpackhi_ps(x3, y3);
    x3          = _mm_unpacklo_ps(x3, y3);
    t17         = _mm_shuffle_ps(z3, z3, _MM_SHUFFLE(0, 0, 0, 1));
    t18         = _mm_movehl_ps(z3, z3);
    t19         = _mm_shuffle_ps(t18, t18, _MM_SHUFFLE(0, 0, 0, 1));
    t20         = _mm_movelh_ps(x1, z1);
    t21         = _mm_movehl_ps(z1, x1);
    t22         = _mm_movelh_ps(t13, t14);
    t14         = _mm_movehl_ps(t14, t13);
    t23         = _mm_movelh_ps(y2, x3);
    t24         = _mm_movehl_ps(x3, y2);
    t25         = _mm_movelh_ps(t15, t16);
    t16         = _mm_movehl_ps(t16, t15);
    t1          = _mm_loadu_ps(ptrA);
    t2          = _mm_loadu_ps(ptrA+4);
    t3          = _mm_load_ss(ptrA+8);
    t1          = _mm_sub_ps(t1, t20);
    t2          = _mm_sub_ps(t2, t23);
    t3          = _mm_sub_ss(t3, z3);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_store_ss(ptrA+8, t3);
    t4          = _mm_loadu_ps(ptrB);
    t5          = _mm_loadu_ps(ptrB+4);
    t6          = _mm_load_ss(ptrB+8);
    t4          = _mm_sub_ps(t4, t21);
    t5          = _mm_sub_ps(t5, t24);
    t6          = _mm_sub_ss(t6, t17);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_store_ss(ptrB+8, t6);
    t7          = _mm_loadu_ps(ptrC);
    t8          = _mm_loadu_ps(ptrC+4);
    t9          = _mm_load_ss(ptrC+8);
    t7          = _mm_sub_ps(t7, t22);
    t8          = _mm_sub_ps(t8, t25);
    t9          = _mm_sub_ss(t9, t18);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_store_ss(ptrC+8, t9);
    t10         = _mm_loadu_ps(ptrD);
    t11         = _mm_loadu_ps(ptrD+4);
    t12         = _mm_load_ss(ptrD+8);
    t10         = _mm_sub_ps(t10, t14);
    t11         = _mm_sub_ps(t11, t16);
    t12         = _mm_sub_ss(t12, t19);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_store_ss(ptrD+8, t12);
}
#endif


#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
        __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
        __m128 _t23, _t24; \
        _t13         = _mm_unpackhi_ps(_x1, _y1); \
        _x1          = _mm_unpacklo_ps(_x1, _y1); \
        _t14         = _mm_unpackhi_ps(_z1, _x2); \
        _z1          = _mm_unpacklo_ps(_z1, _x2); \
        _t15         = _mm_unpackhi_ps(_y2, _z2); \
        _y2          = _mm_unpacklo_ps(_y2, _z2); \
        _t16         = _mm_unpackhi_ps(_x3, _y3); \
        _x3          = _mm_unpacklo_ps(_x3, _y3); \
        _t17         = _mm_unpackhi_ps(_z3, _x4); \
        _z3          = _mm_unpacklo_ps(_z3, _x4); \
        _t18         = _mm_unpackhi_ps(_y4, _z4); \
        _y4          = _mm_unpacklo_ps(_y4, _z4); \
        _t19         = _mm_movelh_ps(_x1, _z1); \
        _z1          = _mm_movehl_ps(_z1, _x1); \
        _t20         = _mm_movelh_ps(_t13, _t14); \
        _t14         = _mm_movehl_ps(_t14, _t13); \
        _t21         = _mm_movelh_ps(_y2, _x3); \
        _x3          = _mm_movehl_ps(_x3, _y2); \
        _t22         = _mm_movelh_ps(_t15, _t16); \
        _t16         = _mm_movehl_ps(_t16, _t15); \
        _t23         = _mm_movelh_ps(_z3, _y4); \
        _y4          = _mm_movehl_ps(_y4, _z3); \
        _t24         = _mm_movelh_ps(_t17, _t18); \
        _t18         = _mm_movehl_ps(_t18, _t17); \
        _t1          = _mm_loadu_ps(ptrA); \
        _t2          = _mm_loadu_ps(ptrA+4); \
        _t3          = _mm_loadu_ps(ptrA+8); \
        _t1          = _mm_sub_ps(_t1, _t19); \
        _t2          = _mm_sub_ps(_t2, _t21); \
        _t3          = _mm_sub_ps(_t3, _t23); \
        _mm_storeu_ps(ptrA, _t1); \
        _mm_storeu_ps(ptrA+4, _t2); \
        _mm_storeu_ps(ptrA+8, _t3); \
        _t4          = _mm_loadu_ps(ptrB); \
        _t5          = _mm_loadu_ps(ptrB+4); \
        _t6          = _mm_loadu_ps(ptrB+8); \
        _t4          = _mm_sub_ps(_t4, _z1); \
        _t5          = _mm_sub_ps(_t5, _x3); \
        _t6          = _mm_sub_ps(_t6, _y4); \
        _mm_storeu_ps(ptrB, _t4); \
        _mm_storeu_ps(ptrB+4, _t5); \
        _mm_storeu_ps(ptrB+8, _t6); \
        _t7          = _mm_loadu_ps(ptrC); \
        _t8          = _mm_loadu_ps(ptrC+4); \
        _t9          = _mm_loadu_ps(ptrC+8); \
        _t7          = _mm_sub_ps(_t7, _t20); \
        _t8          = _mm_sub_ps(_t8, _t22); \
        _t9          = _mm_sub_ps(_t9, _t24); \
        _mm_storeu_ps(ptrC, _t7); \
        _mm_storeu_ps(ptrC+4, _t8); \
        _mm_storeu_ps(ptrC+8, _t9); \
        _t10         = _mm_loadu_ps(ptrD); \
        _t11         = _mm_loadu_ps(ptrD+4); \
        _t12         = _mm_loadu_ps(ptrD+8); \
        _t10         = _mm_sub_ps(_t10, _t14); \
        _t11         = _mm_sub_ps(_t11, _t16); \
        _t12         = _mm_sub_ps(_t12, _t18); \
        _mm_storeu_ps(ptrD, _t10); \
        _mm_storeu_ps(ptrD+4, _t11); \
        _mm_storeu_ps(ptrD+8, _t12); \
    }
#else
/* Real function for sane compilers */
static void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
    __m128 t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22;
    __m128 t23, t24;
    t13         = _mm_unpackhi_ps(x1, y1);
    x1          = _mm_unpacklo_ps(x1, y1);
    t14         = _mm_unpackhi_ps(z1, x2);
    z1          = _mm_unpacklo_ps(z1, x2);
    t15         = _mm_unpackhi_ps(y2, z2);
    y2          = _mm_unpacklo_ps(y2, z2);
    t16         = _mm_unpackhi_ps(x3, y3);
    x3          = _mm_unpacklo_ps(x3, y3);
    t17         = _mm_unpackhi_ps(z3, x4);
    z3          = _mm_unpacklo_ps(z3, x4);
    t18         = _mm_unpackhi_ps(y4, z4);
    y4          = _mm_unpacklo_ps(y4, z4);
    t19         = _mm_movelh_ps(x1, z1);
    z1          = _mm_movehl_ps(z1, x1);
    t20         = _mm_movelh_ps(t13, t14);
    t14         = _mm_movehl_ps(t14, t13);
    t21         = _mm_movelh_ps(y2, x3);
    x3          = _mm_movehl_ps(x3, y2);
    t22         = _mm_movelh_ps(t15, t16);
    t16         = _mm_movehl_ps(t16, t15);
    t23         = _mm_movelh_ps(z3, y4);
    y4          = _mm_movehl_ps(y4, z3);
    t24         = _mm_movelh_ps(t17, t18);
    t18         = _mm_movehl_ps(t18, t17);
    t1          = _mm_loadu_ps(ptrA);
    t2          = _mm_loadu_ps(ptrA+4);
    t3          = _mm_loadu_ps(ptrA+8);
    t1          = _mm_sub_ps(t1, t19);
    t2          = _mm_sub_ps(t2, t21);
    t3          = _mm_sub_ps(t3, t23);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_storeu_ps(ptrA+8, t3);
    t4          = _mm_loadu_ps(ptrB);
    t5          = _mm_loadu_ps(ptrB+4);
    t6          = _mm_loadu_ps(ptrB+8);
    t4          = _mm_sub_ps(t4, z1);
    t5          = _mm_sub_ps(t5, x3);
    t6          = _mm_sub_ps(t6, y4);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_storeu_ps(ptrB+8, t6);
    t7          = _mm_loadu_ps(ptrC);
    t8          = _mm_loadu_ps(ptrC+4);
    t9          = _mm_loadu_ps(ptrC+8);
    t7          = _mm_sub_ps(t7, t20);
    t8          = _mm_sub_ps(t8, t22);
    t9          = _mm_sub_ps(t9, t24);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_storeu_ps(ptrC+8, t9);
    t10         = _mm_loadu_ps(ptrD);
    t11         = _mm_loadu_ps(ptrD+4);
    t12         = _mm_loadu_ps(ptrD+8);
    t10         = _mm_sub_ps(t10, t14);
    t11         = _mm_sub_ps(t11, t16);
    t12         = _mm_sub_ps(t12, t18);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_storeu_ps(ptrD+8, t12);
}
#endif


static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3;

    /* transpose data */
    t1 = fix1;
    _MM_TRANSPOSE4_PS(fix1, t1, fiy1, fiz1);
    fix1 = _mm_add_ps(_mm_add_ps(fix1, t1), _mm_add_ps(fiy1, fiz1));

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2, (__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3, (__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2, fix1);
    t3 = _mm_add_ps(t3, fix1);

    _mm_store_ss(fptr, t2);
    _mm_storeh_pi((__m64 *)(fptr+1), t2);
    _mm_store_ss(fshiftptr, t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                              fptr, fshiftptr) \
    { \
        __m128 _t1, _t2, _t3, _t4; \
\
        _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2); \
        _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3); \
        _t2   = _mm_movehl_ps(_mm_setzero_ps(), fiz3); \
        _t1   = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(0, 0, 0, 1)); \
        _t3   = _mm_shuffle_ps(_t2, _t2, _MM_SHUFFLE(0, 0, 0, 1)); \
        fix1  = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2)); \
        fiy2  = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3)); \
        fiz3  = _mm_add_ss(_mm_add_ps(fiz3, _t1), _mm_add_ps(_t2, _t3)); \
        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
        _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
        _t4 = _mm_load_ss(fshiftptr+2); \
        _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
        _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
        _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
        _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
        _t3 = _mm_shuffle_ps(_t3, _t3, _MM_SHUFFLE(1, 2, 0, 0)); \
        _t1 = _mm_add_ps(_t1, _t2); \
        _t3 = _mm_add_ps(_t3, _t4); \
        _t1 = _mm_add_ps(_t1, _t3); \
        _mm_store_ss(fshiftptr+2, _t1); \
        _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4;

    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2);
    _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3);
    t2   = _mm_movehl_ps(_mm_setzero_ps(), fiz3);
    t1   = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(0, 0, 0, 1));
    t3   = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 0, 0, 1));

    fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3));
    fiz3 = _mm_add_ss(_mm_add_ps(fiz3, t1), _mm_add_ps(t2, t3));

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) ));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(1, 2, 0, 0));     /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3); /* y x - z */

    _mm_store_ss(fshiftptr+2, t1);
    _mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                              fptr, fshiftptr) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5; \
        _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2); \
        _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3); \
        _MM_TRANSPOSE4_PS(fiz3, fix4, fiy4, fiz4); \
        fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2)); \
        fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3)); \
        fiz3 = _mm_add_ps(_mm_add_ps(fiz3, fix4), _mm_add_ps(fiy4, fiz4)); \
        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
        _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
        _t5 = _mm_load_ss(fshiftptr+2); \
        _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
        _t1 = _mm_shuffle_ps(fix1, fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
        _t2 = _mm_shuffle_ps(fiy2, fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
        _t3 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
        _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
        _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
        _t1 = _mm_add_ps(_t1, _t2); \
        _t3 = _mm_add_ps(_t3, _t4); \
        _t1 = _mm_add_ps(_t1, _t3); \
        _t5 = _mm_add_ps(_t5, _t1); \
        _mm_store_ss(fshiftptr+2, _t5); \
        _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4, t5;

    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2);
    _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3);
    _MM_TRANSPOSE4_PS(fiz3, fix4, fiy4, fiz4);

    fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3));
    fiz3 = _mm_add_ps(_mm_add_ps(fiz3, fix4), _mm_add_ps(fiy4, fiz4));

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1, fix1, _MM_SHUFFLE(1, 0, 2, 2));
    t2 = _mm_shuffle_ps(fiy2, fiy2, _MM_SHUFFLE(3, 2, 1, 1));
    t3 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(2, 1, 0, 0));
    t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3));
    t4 = _mm_shuffle_ps(fiz3, t4, _MM_SHUFFLE(2, 0, 3, 3));

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3);
    t5 = _mm_add_ps(t5, t1);

    _mm_store_ss(fshiftptr+2, t5);
    _mm_storeh_pi((__m64 *)(fshiftptr), t5);
}
#endif


static void
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
    pot1 = _mm_add_ps(pot1, _mm_movehl_ps(_mm_setzero_ps(), pot1));
    pot1 = _mm_add_ps(pot1, _mm_shuffle_ps(pot1, pot1, _MM_SHUFFLE(0, 0, 0, 1)));
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}

static void
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB)
{
    __m128 t1, t2;
    t1   = _mm_movehl_ps(pot2, pot1);
    t2   = _mm_movelh_ps(pot1, pot2);
    t1   = _mm_add_ps(t1, t2);
    t2   = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 1, 1));
    pot1 = _mm_add_ps(t1, t2);
    pot2 = _mm_movehl_ps(t2, pot1);
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
    _mm_store_ss(ptrB, _mm_add_ss(pot2, _mm_load_ss(ptrB)));
}


#endif /* _kernelutil_x86_sse2_single_h_ */