/*
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifndef _kernelutil_x86_avx_128_fma_single_h_
#define _kernelutil_x86_avx_128_fma_single_h_


#include <math.h>

#include "gmx_x86_avx_128_fma.h"

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3)  _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))

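/* Return nonzero if any element of a is smaller than the corresponding element of b */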
static gmx_inline int
gmx_mm_any_lt(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a, b));
}

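/* Compute dx*dx+dy*dy+dz*dz for four interactions at once, using fused multiply-add */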
static gmx_inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_macc_ps(dx, dx, _mm_macc_ps(dy, dy, _mm_mul_ps(dz, dz)));
}

/* Load a single value from 1-4 places, merge into xmm register */

static gmx_inline __m128
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    __m128 t1, t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA), _mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB), _mm_load_ss(ptrD));
    return _mm_unpacklo_ps(t1, t2);
}

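/* Store the four elements of xmm1 to four separate single-precision addresses */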
static gmx_inline void
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 t2, t3, t4;

    t2       = _mm_permute_ps(xmm1, _MM_SHUFFLE(1, 1, 1, 1));
    t3       = _mm_permute_ps(xmm1, _MM_SHUFFLE(2, 2, 2, 2));
    t4       = _mm_permute_ps(xmm1, _MM_SHUFFLE(3, 3, 3, 3));
    _mm_store_ss(ptrA, xmm1);
    _mm_store_ss(ptrB, t2);
    _mm_store_ss(ptrC, t3);
    _mm_store_ss(ptrD, t4);
}

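/* Add the four elements of xmm1 to the single-precision values at ptrA-ptrD */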
static gmx_inline void
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 tmp;

    tmp = gmx_mm_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
    tmp = _mm_add_ps(tmp, xmm1);
    gmx_mm_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, tmp);
}

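/* Load c6/c12 parameter pairs from four addresses and transpose them into one c6 and one c12 register */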
static gmx_inline void
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict c6, __m128 * gmx_restrict c12)
{
    __m128 t1, t2, t3, t4;
    t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p1);
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p2);
    t3   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p3);
    t4   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p4);
    t1   = _mm_unpacklo_ps(t1, t3);
    t2   = _mm_unpacklo_ps(t2, t4);
    *c6  = _mm_unpacklo_ps(t1, t2);
    *c12 = _mm_unpackhi_ps(t1, t2);
}


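/* Load one xyz coordinate, add the shift vector, and broadcast each component to a full register */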
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict      x1,
                                         __m128 * gmx_restrict      y1,
                                         __m128 * gmx_restrict      z1)
{
    __m128 t1, t2, t3, t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz);
    t3   = _mm_load_ss(xyz_shift+2);
    t4   = _mm_load_ss(xyz+2);
    t1   = _mm_add_ps(t1, t2);
    t3   = _mm_add_ss(t3, t4);

    *x1  = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
}

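/* Load three consecutive xyz coordinates, add the shift vector, and broadcast each
 * component to a full register. The four-atom variant below works the same way.
 */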
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB   = _mm_load_ss(xyz_shift+2);

    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_load_ss(xyz+8);

    tA   = _mm_movelh_ps(tA, tB);
    t4   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5   = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6   = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1   = _mm_add_ps(t1, t4);
    t2   = _mm_add_ps(t2, t5);
    t3   = _mm_add_ss(t3, t6);

    *x1  = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2  = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2  = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2  = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3  = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3  = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3  = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
}


static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB   = _mm_load_ss(xyz_shift+2);

    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_loadu_ps(xyz+8);

    tA   = _mm_movelh_ps(tA, tB);
    t4   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5   = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6   = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1   = _mm_add_ps(t1, t4);
    t2   = _mm_add_ps(t2, t5);
    t3   = _mm_add_ps(t3, t6);

    *x1  = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2  = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2  = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2  = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3  = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3  = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3  = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
    *x4  = _mm_permute_ps(t3, _MM_SHUFFLE(1, 1, 1, 1));
    *y4  = _mm_permute_ps(t3, _MM_SHUFFLE(2, 2, 2, 2));
    *z4  = _mm_permute_ps(t3, _MM_SHUFFLE(3, 3, 3, 3));
}

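/* Gather a single xyz coordinate from each of four pointers and transpose into separate x/y/z registers */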
static gmx_inline void
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1)
{
    __m128  t1, t2, t3, t4;
    __m128i mask = _mm_set_epi32(0, -1, -1, -1);
    t1             = gmx_mm_maskload_ps(ptrA, mask);
    t2             = gmx_mm_maskload_ps(ptrB, mask);
    t3             = gmx_mm_maskload_ps(ptrC, mask);
    t4             = gmx_mm_maskload_ps(ptrD, mask);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1           = t1;
    *y1           = t2;
    *z1           = t3;
}


static gmx_inline void
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 t1, t2, t3, t4;
    t1            = _mm_loadu_ps(ptrA);
    t2            = _mm_loadu_ps(ptrB);
    t3            = _mm_loadu_ps(ptrC);
    t4            = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1           = t1;
    *y1           = t2;
    *z1           = t3;
    *x2           = t4;
    t1            = _mm_loadu_ps(ptrA+4);
    t2            = _mm_loadu_ps(ptrB+4);
    t3            = _mm_loadu_ps(ptrC+4);
    t4            = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2           = t1;
    *z2           = t2;
    *x3           = t3;
    *y3           = t4;
    t1            = _mm_load_ss(ptrA+8);
    t2            = _mm_load_ss(ptrB+8);
    t3            = _mm_load_ss(ptrC+8);
    t4            = _mm_load_ss(ptrD+8);
    t1            = _mm_unpacklo_ps(t1, t3);
    t3            = _mm_unpacklo_ps(t2, t4);
    *z3           = _mm_unpacklo_ps(t1, t3);
}


static gmx_inline void
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 t1, t2, t3, t4;
    t1            = _mm_loadu_ps(ptrA);
    t2            = _mm_loadu_ps(ptrB);
    t3            = _mm_loadu_ps(ptrC);
    t4            = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1           = t1;
    *y1           = t2;
    *z1           = t3;
    *x2           = t4;
    t1            = _mm_loadu_ps(ptrA+4);
    t2            = _mm_loadu_ps(ptrB+4);
    t3            = _mm_loadu_ps(ptrC+4);
    t4            = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2           = t1;
    *z2           = t2;
    *x3           = t3;
    *y3           = t4;
    t1            = _mm_loadu_ps(ptrA+8);
    t2            = _mm_loadu_ps(ptrB+8);
    t3            = _mm_loadu_ps(ptrC+8);
    t4            = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *z3           = t1;
    *x4           = t2;
    *y4           = t3;
    *z4           = t4;
}

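/* Subtract one xyz force per pointer (one SIMD lane each) from the memory at ptrA-ptrD */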
static gmx_inline void
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    t5          = _mm_unpacklo_ps(y1, z1);
    t6          = _mm_unpackhi_ps(y1, z1);
    t7          = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(1, 0, 0, 0));
    t8          = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(3, 2, 0, 1));
    t9          = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(1, 0, 0, 2));
    t10         = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(3, 2, 0, 3));
    t1          = _mm_load_ss(ptrA);
    t1          = _mm_loadh_pi(t1, (__m64 *)(ptrA+1));
    t1          = _mm_sub_ps(t1, t7);
    _mm_store_ss(ptrA, t1);
    _mm_storeh_pi((__m64 *)(ptrA+1), t1);
    t2          = _mm_load_ss(ptrB);
    t2          = _mm_loadh_pi(t2, (__m64 *)(ptrB+1));
    t2          = _mm_sub_ps(t2, t8);
    _mm_store_ss(ptrB, t2);
    _mm_storeh_pi((__m64 *)(ptrB+1), t2);
    t3          = _mm_load_ss(ptrC);
    t3          = _mm_loadh_pi(t3, (__m64 *)(ptrC+1));
    t3          = _mm_sub_ps(t3, t9);
    _mm_store_ss(ptrC, t3);
    _mm_storeh_pi((__m64 *)(ptrC+1), t3);
    t4          = _mm_load_ss(ptrD);
    t4          = _mm_loadh_pi(t4, (__m64 *)(ptrD+1));
    t4          = _mm_sub_ps(t4, t10);
    _mm_store_ss(ptrD, t4);
    _mm_storeh_pi((__m64 *)(ptrD+1), t4);
}

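/* The decrement routines below subtract three (and, further down, four) xyz force
 * triplets per pointer. For 32-bit MSVC they are provided as macros, since that
 * compiler cannot handle more than three xmm/ymm parameters.
 */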
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
        __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
        __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
        _t13         = _mm_unpackhi_ps(_x1, _y1); \
        _x1          = _mm_unpacklo_ps(_x1, _y1); \
        _t14         = _mm_unpackhi_ps(_z1, _x2); \
        _z1          = _mm_unpacklo_ps(_z1, _x2); \
        _t15         = _mm_unpackhi_ps(_y2, _z2); \
        _y2          = _mm_unpacklo_ps(_y2, _z2); \
        _t16         = _mm_unpackhi_ps(_x3, _y3); \
        _x3          = _mm_unpacklo_ps(_x3, _y3); \
        _t17         = _mm_permute_ps(_z3, _MM_SHUFFLE(0, 0, 0, 1)); \
        _t18         = _mm_movehl_ps(_z3, _z3); \
        _t19         = _mm_permute_ps(_t18, _MM_SHUFFLE(0, 0, 0, 1)); \
        _t20         = _mm_movelh_ps(_x1, _z1); \
        _t21         = _mm_movehl_ps(_z1, _x1); \
        _t22         = _mm_movelh_ps(_t13, _t14); \
        _t14         = _mm_movehl_ps(_t14, _t13); \
        _t23         = _mm_movelh_ps(_y2, _x3); \
        _t24         = _mm_movehl_ps(_x3, _y2); \
        _t25         = _mm_movelh_ps(_t15, _t16); \
        _t16         = _mm_movehl_ps(_t16, _t15); \
        _t1          = _mm_loadu_ps(ptrA); \
        _t2          = _mm_loadu_ps(ptrA+4); \
        _t3          = _mm_load_ss(ptrA+8); \
        _t1          = _mm_sub_ps(_t1, _t20); \
        _t2          = _mm_sub_ps(_t2, _t23); \
        _t3          = _mm_sub_ss(_t3, _z3); \
        _mm_storeu_ps(ptrA, _t1); \
        _mm_storeu_ps(ptrA+4, _t2); \
        _mm_store_ss(ptrA+8, _t3); \
        _t4          = _mm_loadu_ps(ptrB); \
        _t5          = _mm_loadu_ps(ptrB+4); \
        _t6          = _mm_load_ss(ptrB+8); \
        _t4          = _mm_sub_ps(_t4, _t21); \
        _t5          = _mm_sub_ps(_t5, _t24); \
        _t6          = _mm_sub_ss(_t6, _t17); \
        _mm_storeu_ps(ptrB, _t4); \
        _mm_storeu_ps(ptrB+4, _t5); \
        _mm_store_ss(ptrB+8, _t6); \
        _t7          = _mm_loadu_ps(ptrC); \
        _t8          = _mm_loadu_ps(ptrC+4); \
        _t9          = _mm_load_ss(ptrC+8); \
        _t7          = _mm_sub_ps(_t7, _t22); \
        _t8          = _mm_sub_ps(_t8, _t25); \
        _t9          = _mm_sub_ss(_t9, _t18); \
        _mm_storeu_ps(ptrC, _t7); \
        _mm_storeu_ps(ptrC+4, _t8); \
        _mm_store_ss(ptrC+8, _t9); \
        _t10         = _mm_loadu_ps(ptrD); \
        _t11         = _mm_loadu_ps(ptrD+4); \
        _t12         = _mm_load_ss(ptrD+8); \
        _t10         = _mm_sub_ps(_t10, _t14); \
        _t11         = _mm_sub_ps(_t11, _t16); \
        _t12         = _mm_sub_ss(_t12, _t19); \
        _mm_storeu_ps(ptrD, _t10); \
        _mm_storeu_ps(ptrD+4, _t11); \
        _mm_store_ss(ptrD+8, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128 t11, t12, t13, t14, t15, t16, t17, t18, t19;
    __m128 t20, t21, t22, t23, t24, t25;
    t13         = _mm_unpackhi_ps(x1, y1);
    x1          = _mm_unpacklo_ps(x1, y1);
    t14         = _mm_unpackhi_ps(z1, x2);
    z1          = _mm_unpacklo_ps(z1, x2);
    t15         = _mm_unpackhi_ps(y2, z2);
    y2          = _mm_unpacklo_ps(y2, z2);
    t16         = _mm_unpackhi_ps(x3, y3);
    x3          = _mm_unpacklo_ps(x3, y3);
    t17         = _mm_permute_ps(z3, _MM_SHUFFLE(0, 0, 0, 1));
    t18         = _mm_movehl_ps(z3, z3);
    t19         = _mm_permute_ps(t18, _MM_SHUFFLE(0, 0, 0, 1));
    t20         = _mm_movelh_ps(x1, z1);
    t21         = _mm_movehl_ps(z1, x1);
    t22         = _mm_movelh_ps(t13, t14);
    t14         = _mm_movehl_ps(t14, t13);
    t23         = _mm_movelh_ps(y2, x3);
    t24         = _mm_movehl_ps(x3, y2);
    t25         = _mm_movelh_ps(t15, t16);
    t16         = _mm_movehl_ps(t16, t15);
    t1          = _mm_loadu_ps(ptrA);
    t2          = _mm_loadu_ps(ptrA+4);
    t3          = _mm_load_ss(ptrA+8);
    t1          = _mm_sub_ps(t1, t20);
    t2          = _mm_sub_ps(t2, t23);
    t3          = _mm_sub_ss(t3, z3);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_store_ss(ptrA+8, t3);
    t4          = _mm_loadu_ps(ptrB);
    t5          = _mm_loadu_ps(ptrB+4);
    t6          = _mm_load_ss(ptrB+8);
    t4          = _mm_sub_ps(t4, t21);
    t5          = _mm_sub_ps(t5, t24);
    t6          = _mm_sub_ss(t6, t17);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_store_ss(ptrB+8, t6);
    t7          = _mm_loadu_ps(ptrC);
    t8          = _mm_loadu_ps(ptrC+4);
    t9          = _mm_load_ss(ptrC+8);
    t7          = _mm_sub_ps(t7, t22);
    t8          = _mm_sub_ps(t8, t25);
    t9          = _mm_sub_ss(t9, t18);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_store_ss(ptrC+8, t9);
    t10         = _mm_loadu_ps(ptrD);
    t11         = _mm_loadu_ps(ptrD+4);
    t12         = _mm_load_ss(ptrD+8);
    t10         = _mm_sub_ps(t10, t14);
    t11         = _mm_sub_ps(t11, t16);
    t12         = _mm_sub_ss(t12, t19);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_store_ss(ptrD+8, t12);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
        __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
        __m128 _t23, _t24; \
        _t13         = _mm_unpackhi_ps(_x1, _y1); \
        _x1          = _mm_unpacklo_ps(_x1, _y1); \
        _t14         = _mm_unpackhi_ps(_z1, _x2); \
        _z1          = _mm_unpacklo_ps(_z1, _x2); \
        _t15         = _mm_unpackhi_ps(_y2, _z2); \
        _y2          = _mm_unpacklo_ps(_y2, _z2); \
        _t16         = _mm_unpackhi_ps(_x3, _y3); \
        _x3          = _mm_unpacklo_ps(_x3, _y3); \
        _t17         = _mm_unpackhi_ps(_z3, _x4); \
        _z3          = _mm_unpacklo_ps(_z3, _x4); \
        _t18         = _mm_unpackhi_ps(_y4, _z4); \
        _y4          = _mm_unpacklo_ps(_y4, _z4); \
        _t19         = _mm_movelh_ps(_x1, _z1); \
        _z1          = _mm_movehl_ps(_z1, _x1); \
        _t20         = _mm_movelh_ps(_t13, _t14); \
        _t14         = _mm_movehl_ps(_t14, _t13); \
        _t21         = _mm_movelh_ps(_y2, _x3); \
        _x3          = _mm_movehl_ps(_x3, _y2); \
        _t22         = _mm_movelh_ps(_t15, _t16); \
        _t16         = _mm_movehl_ps(_t16, _t15); \
        _t23         = _mm_movelh_ps(_z3, _y4); \
        _y4          = _mm_movehl_ps(_y4, _z3); \
        _t24         = _mm_movelh_ps(_t17, _t18); \
        _t18         = _mm_movehl_ps(_t18, _t17); \
        _t1          = _mm_loadu_ps(ptrA); \
        _t2          = _mm_loadu_ps(ptrA+4); \
        _t3          = _mm_loadu_ps(ptrA+8); \
        _t1          = _mm_sub_ps(_t1, _t19); \
        _t2          = _mm_sub_ps(_t2, _t21); \
        _t3          = _mm_sub_ps(_t3, _t23); \
        _mm_storeu_ps(ptrA, _t1); \
        _mm_storeu_ps(ptrA+4, _t2); \
        _mm_storeu_ps(ptrA+8, _t3); \
        _t4          = _mm_loadu_ps(ptrB); \
        _t5          = _mm_loadu_ps(ptrB+4); \
        _t6          = _mm_loadu_ps(ptrB+8); \
        _t4          = _mm_sub_ps(_t4, _z1); \
        _t5          = _mm_sub_ps(_t5, _x3); \
        _t6          = _mm_sub_ps(_t6, _y4); \
        _mm_storeu_ps(ptrB, _t4); \
        _mm_storeu_ps(ptrB+4, _t5); \
        _mm_storeu_ps(ptrB+8, _t6); \
        _t7          = _mm_loadu_ps(ptrC); \
        _t8          = _mm_loadu_ps(ptrC+4); \
        _t9          = _mm_loadu_ps(ptrC+8); \
        _t7          = _mm_sub_ps(_t7, _t20); \
        _t8          = _mm_sub_ps(_t8, _t22); \
        _t9          = _mm_sub_ps(_t9, _t24); \
        _mm_storeu_ps(ptrC, _t7); \
        _mm_storeu_ps(ptrC+4, _t8); \
        _mm_storeu_ps(ptrC+8, _t9); \
        _t10         = _mm_loadu_ps(ptrD); \
        _t11         = _mm_loadu_ps(ptrD+4); \
        _t12         = _mm_loadu_ps(ptrD+8); \
        _t10         = _mm_sub_ps(_t10, _t14); \
        _t11         = _mm_sub_ps(_t11, _t16); \
        _t12         = _mm_sub_ps(_t12, _t18); \
        _mm_storeu_ps(ptrD, _t10); \
        _mm_storeu_ps(ptrD+4, _t11); \
        _mm_storeu_ps(ptrD+8, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
    __m128 t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22;
    __m128 t23, t24;
    t13         = _mm_unpackhi_ps(x1, y1);
    x1          = _mm_unpacklo_ps(x1, y1);
    t14         = _mm_unpackhi_ps(z1, x2);
    z1          = _mm_unpacklo_ps(z1, x2);
    t15         = _mm_unpackhi_ps(y2, z2);
    y2          = _mm_unpacklo_ps(y2, z2);
    t16         = _mm_unpackhi_ps(x3, y3);
    x3          = _mm_unpacklo_ps(x3, y3);
    t17         = _mm_unpackhi_ps(z3, x4);
    z3          = _mm_unpacklo_ps(z3, x4);
    t18         = _mm_unpackhi_ps(y4, z4);
    y4          = _mm_unpacklo_ps(y4, z4);
    t19         = _mm_movelh_ps(x1, z1);
    z1          = _mm_movehl_ps(z1, x1);
    t20         = _mm_movelh_ps(t13, t14);
    t14         = _mm_movehl_ps(t14, t13);
    t21         = _mm_movelh_ps(y2, x3);
    x3          = _mm_movehl_ps(x3, y2);
    t22         = _mm_movelh_ps(t15, t16);
    t16         = _mm_movehl_ps(t16, t15);
    t23         = _mm_movelh_ps(z3, y4);
    y4          = _mm_movehl_ps(y4, z3);
    t24         = _mm_movelh_ps(t17, t18);
    t18         = _mm_movehl_ps(t18, t17);
    t1          = _mm_loadu_ps(ptrA);
    t2          = _mm_loadu_ps(ptrA+4);
    t3          = _mm_loadu_ps(ptrA+8);
    t1          = _mm_sub_ps(t1, t19);
    t2          = _mm_sub_ps(t2, t21);
    t3          = _mm_sub_ps(t3, t23);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_storeu_ps(ptrA+8, t3);
    t4          = _mm_loadu_ps(ptrB);
    t5          = _mm_loadu_ps(ptrB+4);
    t6          = _mm_loadu_ps(ptrB+8);
    t4          = _mm_sub_ps(t4, z1);
    t5          = _mm_sub_ps(t5, x3);
    t6          = _mm_sub_ps(t6, y4);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_storeu_ps(ptrB+8, t6);
    t7          = _mm_loadu_ps(ptrC);
    t8          = _mm_loadu_ps(ptrC+4);
    t9          = _mm_loadu_ps(ptrC+8);
    t7          = _mm_sub_ps(t7, t20);
    t8          = _mm_sub_ps(t8, t22);
    t9          = _mm_sub_ps(t9, t24);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_storeu_ps(ptrC+8, t9);
    t10         = _mm_loadu_ps(ptrD);
    t11         = _mm_loadu_ps(ptrD+4);
    t12         = _mm_loadu_ps(ptrD+8);
    t10         = _mm_sub_ps(t10, t14);
    t11         = _mm_sub_ps(t11, t16);
    t12         = _mm_sub_ps(t12, t18);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_storeu_ps(ptrD+8, t12);
}
#endif

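/* Reduce the four partial forces for a single i atom and add the result to both
 * the force array fptr and the shift force fshiftptr. The three- and four-atom
 * versions below reduce a whole group of i atoms in one call.
 */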
static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t2, t3;

    fix1 = _mm_hadd_ps(fix1, fix1);
    fiy1 = _mm_hadd_ps(fiy1, fiz1);

    fix1 = _mm_hadd_ps(fix1, fiy1); /* fiz1 fiy1 fix1 fix1 */

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2, (__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3, (__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2, fix1);
    t3 = _mm_add_ps(t3, fix1);

    _mm_store_ss(fptr, t2);
    _mm_storeh_pi((__m64 *)(fptr+1), t2);
    _mm_store_ss(fshiftptr, t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                              fptr, fshiftptr) \
    { \
        __m128 _t1, _t2, _t3, _t4; \
\
        fix1 = _mm_hadd_ps(fix1, fiy1); \
        fiz1 = _mm_hadd_ps(fiz1, fix2); \
        fiy2 = _mm_hadd_ps(fiy2, fiz2); \
        fix3 = _mm_hadd_ps(fix3, fiy3); \
        fiz3 = _mm_hadd_ps(fiz3, fiz3); \
        fix1 = _mm_hadd_ps(fix1, fiz1); \
        fiy2 = _mm_hadd_ps(fiy2, fix3); \
        fiz3 = _mm_hadd_ps(fiz3, fiz3); \
        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
        _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
        _t4 = _mm_load_ss(fshiftptr+2); \
        _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
        _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
        _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
        _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
        _t3 = _mm_permute_ps(_t3, _MM_SHUFFLE(1, 2, 0, 0)); \
        _t1 = _mm_add_ps(_t1, _t2); \
        _t3 = _mm_add_ps(_t3, _t4); \
        _t1 = _mm_add_ps(_t1, _t3); \
        _mm_store_ss(fshiftptr+2, _t1); \
        _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4;

    fix1 = _mm_hadd_ps(fix1, fiy1);
    fiz1 = _mm_hadd_ps(fiz1, fix2);
    fiy2 = _mm_hadd_ps(fiy2, fiz2);
    fix3 = _mm_hadd_ps(fix3, fiy3);
    fiz3 = _mm_hadd_ps(fiz3, fiz3);

    fix1 = _mm_hadd_ps(fix1, fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2, fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3, fiz3); /*  -    -    -   fiz3 */

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) ));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_permute_ps(t3, _MM_SHUFFLE(1, 2, 0, 0));         /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3); /* y x - z */

    _mm_store_ss(fshiftptr+2, t1);
    _mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                              fptr, fshiftptr) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5; \
\
        fix1 = _mm_hadd_ps(fix1, fiy1); \
        fiz1 = _mm_hadd_ps(fiz1, fix2); \
        fiy2 = _mm_hadd_ps(fiy2, fiz2); \
        fix3 = _mm_hadd_ps(fix3, fiy3); \
        fiz3 = _mm_hadd_ps(fiz3, fix4); \
        fiy4 = _mm_hadd_ps(fiy4, fiz4); \
        fix1 = _mm_hadd_ps(fix1, fiz1); \
        fiy2 = _mm_hadd_ps(fiy2, fix3); \
        fiz3 = _mm_hadd_ps(fiz3, fiy4); \
        _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
        _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
        _t5 = _mm_load_ss(fshiftptr+2); \
        _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
        _t1 = _mm_permute_ps(fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
        _t2 = _mm_permute_ps(fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
        _t3 = _mm_permute_ps(fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
        _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
        _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
        _t1 = _mm_add_ps(_t1, _t2); \
        _t3 = _mm_add_ps(_t3, _t4); \
        _t1 = _mm_add_ps(_t1, _t3); \
        _t5 = _mm_add_ps(_t5, _t1); \
        _mm_store_ss(fshiftptr+2, _t5); \
        _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4, t5;

    fix1 = _mm_hadd_ps(fix1, fiy1);
    fiz1 = _mm_hadd_ps(fiz1, fix2);
    fiy2 = _mm_hadd_ps(fiy2, fiz2);
    fix3 = _mm_hadd_ps(fix3, fiy3);
    fiz3 = _mm_hadd_ps(fiz3, fix4);
    fiy4 = _mm_hadd_ps(fiy4, fiz4);

    fix1 = _mm_hadd_ps(fix1, fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2, fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3, fiy4); /* fiz4 fiy4 fix4 fiz3 */

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5, (__m64 *)(fshiftptr));

    t1 = _mm_permute_ps(fix1, _MM_SHUFFLE(1, 0, 2, 2));
    t2 = _mm_permute_ps(fiy2, _MM_SHUFFLE(3, 2, 1, 1));
    t3 = _mm_permute_ps(fiz3, _MM_SHUFFLE(2, 1, 0, 0));
    t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3));
    t4 = _mm_shuffle_ps(fiz3, t4, _MM_SHUFFLE(2, 0, 3, 3));

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3);
    t5 = _mm_add_ps(t5, t1);

    _mm_store_ss(fshiftptr+2, t5);
    _mm_storeh_pi((__m64 *)(fshiftptr), t5);
}
#endif

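/* Reduce one or two potential accumulators over all SIMD lanes and add the totals to the scalars in memory */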
static gmx_inline void
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_ps(pot1, pot1);
    pot1 = _mm_hadd_ps(pot1, pot1);
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}

static gmx_inline void
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_ps(pot1, pot2);
    pot1 = _mm_hadd_ps(pot1, pot1);
    pot2 = _mm_permute_ps(pot1, _MM_SHUFFLE(0, 0, 0, 1));
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
    _mm_store_ss(ptrB, _mm_add_ss(pot2, _mm_load_ss(ptrB)));
}


#endif /* _kernelutil_x86_avx_128_fma_single_h_ */