/*
 * This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifndef _kernelutil_x86_sse2_single_h_
#define _kernelutil_x86_sse2_single_h_

/* We require SSE2 now! */

#include <math.h>

#include "gmx_x86_sse2.h"


/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0,t1,t2,t3)  _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))

static gmx_inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx), _mm_mul_ps(dy,dy) ), _mm_mul_ps(dz,dz) );
}

static gmx_inline int
gmx_mm_any_lt(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a,b));
}

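
/* Usage sketch (illustrative only; the helper below is hypothetical and not
 * part of the GROMACS API): combine gmx_mm_calc_rsq_ps() and gmx_mm_any_lt()
 * to compute four squared i-j distances and test whether any SIMD lane is
 * inside the cutoff, e.g. to skip an all-outside block of j atoms.
 */
static gmx_inline int
gmx_mm_example_within_cutoff(__m128 ix, __m128 iy, __m128 iz,
                             __m128 jx, __m128 jy, __m128 jz,
                             float rcut)
{
    __m128 dx  = _mm_sub_ps(ix,jx);
    __m128 dy  = _mm_sub_ps(iy,jy);
    __m128 dz  = _mm_sub_ps(iz,jz);
    __m128 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);

    /* nonzero if any of the four distances is below the cutoff */
    return gmx_mm_any_lt(rsq,_mm_set1_ps(rcut*rcut));
}
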
/* Load a single value from 1-4 places, merge into xmm register */

static gmx_inline __m128
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    __m128 t1,t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
    return _mm_unpacklo_ps(t1,t2);
}

static gmx_inline void
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD,
                              __m128 xmm1)
{
    __m128 t2,t3,t4;

    t3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1);
    t2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
    t4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
    _mm_store_ss(ptrA,xmm1);
    _mm_store_ss(ptrB,t2);
    _mm_store_ss(ptrC,t3);
    _mm_store_ss(ptrD,t4);
}

/* Similar to store, but increments the value in memory */
static gmx_inline void
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 tmp;

    tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
    tmp = _mm_add_ps(tmp,xmm1);
    gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
}

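
/* Usage sketch (illustrative only; helper and names are hypothetical):
 * scatter-accumulate one value per SIMD lane, e.g. adding four per-atom
 * contributions held in one register to four scattered array elements.
 */
static gmx_inline void
gmx_mm_example_scatter_add(float *v, int iA, int iB, int iC, int iD, __m128 vsum)
{
    gmx_mm_increment_4real_swizzle_ps(v+iA,v+iB,v+iC,v+iD,vsum);
}
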
static gmx_inline void
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict c6,
                             __m128 * gmx_restrict c12)
{
    __m128 t1,t2,t3,t4;

    t1   = _mm_castpd_ps(_mm_load_sd((const double *)p1));
    t2   = _mm_castpd_ps(_mm_load_sd((const double *)p2));
    t3   = _mm_castpd_ps(_mm_load_sd((const double *)p3));
    t4   = _mm_castpd_ps(_mm_load_sd((const double *)p4));
    t1   = _mm_unpacklo_ps(t1,t2);
    t2   = _mm_unpacklo_ps(t3,t4);
    *c6  = _mm_movelh_ps(t1,t2);
    *c12 = _mm_movehl_ps(t2,t1);
}

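
/* Usage sketch (illustrative only; the parameter table layout and all names
 * are hypothetical): gather the {c6,c12} Lennard-Jones pairs of four j atoms
 * from a table stored as consecutive {c6,c12} pairs indexed by atom type.
 */
static gmx_inline void
gmx_mm_example_load_lj_params(const float *vdwparam, int tA, int tB, int tC, int tD,
                              __m128 *c6, __m128 *c12)
{
    gmx_mm_load_4pair_swizzle_ps(vdwparam+2*tA,vdwparam+2*tB,
                                 vdwparam+2*tC,vdwparam+2*tD,c6,c12);
}
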
/* Routines to load 1-4 rvec from 4 places.
 * We mainly use these to load coordinates. The extra routines
 * are very efficient for the water-water loops, since we e.g.
 * know that a TIP4P water has 4 atoms, so we should load 12 floats+shuffle.
 */
static gmx_inline void
gmx_mm_load_1rvec_broadcast_ps(float *ptrA, __m128 *x, __m128 *y, __m128 *z)
{
    __m128 t1;

    /* Note: loads 4 floats, i.e. one element past the rvec */
    t1 = _mm_loadu_ps(ptrA);

    *x = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
    *y = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
    *z = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
}

static gmx_inline void
gmx_mm_load_3rvec_broadcast_ps(float *ptrA,
                               __m128 *x1, __m128 *y1, __m128 *z1,
                               __m128 *x2, __m128 *y2, __m128 *z2,
                               __m128 *x3, __m128 *y3, __m128 *z3)
{
    __m128 t1,t2,t3;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrA+4);

    *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
    *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
    *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
    *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
    *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
    *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
    *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
    *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));

    t3  = _mm_load_ss(ptrA+8);
    /* broadcast from t3, which holds the ninth float (z3) */
    *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
}

static gmx_inline void
gmx_mm_load_4rvec_broadcast_ps(float *ptrA,
                               __m128 *x1, __m128 *y1, __m128 *z1,
                               __m128 *x2, __m128 *y2, __m128 *z2,
                               __m128 *x3, __m128 *y3, __m128 *z3,
                               __m128 *x4, __m128 *y4, __m128 *z4)
{
    __m128 t1,t2,t3;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrA+4);
    t3 = _mm_loadu_ps(ptrA+8);

    *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
    *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
    *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
    *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
    *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
    *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
    *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
    *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
    *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
    *x4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
    *y4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(2,2,2,2));
    *z4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(3,3,3,3));
}

static gmx_inline void
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                  const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC,
                                  const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1,
                                  __m128 * gmx_restrict y1,
                                  __m128 * gmx_restrict z1)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8;

    t1 = _mm_castpd_ps(_mm_load_sd((const double *)ptrA));
    t2 = _mm_castpd_ps(_mm_load_sd((const double *)ptrB));
    t3 = _mm_castpd_ps(_mm_load_sd((const double *)ptrC));
    t4 = _mm_castpd_ps(_mm_load_sd((const double *)ptrD));
    t5 = _mm_load_ss(ptrA+2);
    t6 = _mm_load_ss(ptrB+2);
    t7 = _mm_load_ss(ptrC+2);
    t8 = _mm_load_ss(ptrD+2);
    t1 = _mm_unpacklo_ps(t1,t2);
    t3 = _mm_unpacklo_ps(t3,t4);
    *x1 = _mm_movelh_ps(t1,t3);
    *y1 = _mm_movehl_ps(t3,t1);
    t5 = _mm_unpacklo_ps(t5,t6);
    t7 = _mm_unpacklo_ps(t7,t8);
    *z1 = _mm_movelh_ps(t5,t7);
}

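
/* Usage sketch (illustrative only; helper and names are hypothetical): load
 * the coordinates of four j atoms from an xyz-packed array and form the four
 * squared distances to a broadcast i atom position.
 */
static gmx_inline __m128
gmx_mm_example_rsq_to_4j(const float *x, int jA, int jB, int jC, int jD,
                         __m128 ix, __m128 iy, __m128 iz)
{
    __m128 jx,jy,jz;

    gmx_mm_load_1rvec_4ptr_swizzle_ps(x+3*jA,x+3*jB,x+3*jC,x+3*jD,&jx,&jy,&jz);
    return gmx_mm_calc_rsq_ps(_mm_sub_ps(ix,jx),_mm_sub_ps(iy,jy),_mm_sub_ps(iz,jz));
}
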
static gmx_inline void
gmx_mm_load_2rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
                                  __m128 *x1, __m128 *y1, __m128 *z1,
                                  __m128 *x2, __m128 *y2, __m128 *z2)
{
    __m128 t1,t2,t3,t4;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrB);
    t3 = _mm_loadu_ps(ptrC);
    t4 = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrA+4));
    t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrB+4));
    t3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrC+4));
    t4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrD+4));
    t1 = _mm_unpacklo_ps(t1,t3);
    t2 = _mm_unpacklo_ps(t2,t4);
    *y2 = _mm_unpacklo_ps(t1,t2);
    *z2 = _mm_unpackhi_ps(t1,t2);
}

static gmx_inline void
gmx_mm_load_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
                                  __m128 *x1, __m128 *y1, __m128 *z1,
                                  __m128 *x2, __m128 *y2, __m128 *z2,
                                  __m128 *x3, __m128 *y3, __m128 *z3)
{
    __m128 t1,t2,t3,t4;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrB);
    t3 = _mm_loadu_ps(ptrC);
    t4 = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1 = _mm_loadu_ps(ptrA+4);
    t2 = _mm_loadu_ps(ptrB+4);
    t3 = _mm_loadu_ps(ptrC+4);
    t4 = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1 = _mm_load_ss(ptrA+8);
    t2 = _mm_load_ss(ptrB+8);
    t3 = _mm_load_ss(ptrC+8);
    t4 = _mm_load_ss(ptrD+8);
    t1 = _mm_unpacklo_ps(t1,t3);
    t3 = _mm_unpacklo_ps(t2,t4);
    *z3 = _mm_unpacklo_ps(t1,t3);
}

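
/* Usage sketch (illustrative only; helper and names are hypothetical): load
 * all nine coordinates of four 3-site waters, one water per SIMD lane, given
 * the indices of their first atoms in an xyz-packed coordinate array.
 */
static gmx_inline void
gmx_mm_example_load_4waters(float *x, int jA, int jB, int jC, int jD,
                            __m128 *jx1, __m128 *jy1, __m128 *jz1,
                            __m128 *jx2, __m128 *jy2, __m128 *jz2,
                            __m128 *jx3, __m128 *jy3, __m128 *jz3)
{
    /* each water occupies 9 consecutive floats starting at x+3*j */
    gmx_mm_load_3rvec_4ptr_swizzle_ps(x+3*jA,x+3*jB,x+3*jC,x+3*jD,
                                      jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3);
}
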
static gmx_inline void
gmx_mm_load_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
                                  __m128 *x1, __m128 *y1, __m128 *z1,
                                  __m128 *x2, __m128 *y2, __m128 *z2,
                                  __m128 *x3, __m128 *y3, __m128 *z3,
                                  __m128 *x4, __m128 *y4, __m128 *z4)
{
    __m128 t1,t2,t3,t4;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrB);
    t3 = _mm_loadu_ps(ptrC);
    t4 = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1 = _mm_loadu_ps(ptrA+4);
    t2 = _mm_loadu_ps(ptrB+4);
    t3 = _mm_loadu_ps(ptrC+4);
    t4 = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1 = _mm_loadu_ps(ptrA+8);
    t2 = _mm_loadu_ps(ptrB+8);
    t3 = _mm_loadu_ps(ptrC+8);
    t4 = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *z3 = t1;
    *x4 = t2;
    *y4 = t3;
    *z4 = t4;
}

/* Routines to increment rvec in memory, typically used for j particle force updates */
static gmx_inline void
gmx_mm_increment_1rvec_1ptr_noswizzle_ps(float *ptrA, __m128 xyz)
{
    __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
    __m128 t1;

    t1  = _mm_loadu_ps(ptrA);
    /* mask off the fourth element so only x/y/z are modified */
    xyz = _mm_and_ps(mask,xyz);
    t1  = _mm_add_ps(t1,xyz);
    _mm_storeu_ps(ptrA,t1);
}

static gmx_inline void
gmx_mm_increment_3rvec_1ptr_noswizzle_ps(float *ptrA,
                                         __m128 xyz1, __m128 xyz2, __m128 xyz3)
{
    __m128 t1,t2,t3,t4;
    __m128 tA,tB,tC;

    tA = _mm_loadu_ps(ptrA);
    tB = _mm_loadu_ps(ptrA+4);
    tC = _mm_load_ss(ptrA+8);

    t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 -  z2 y2 */
    t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 -  z3 */

    t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
    t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0));   /* x2 z1 y1 x1 */

    t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0));     /* y3 x3 z2 y2 */

    tA = _mm_add_ps(tA,t3);
    tB = _mm_add_ps(tB,t4);
    tC = _mm_add_ss(tC,t2);

    _mm_storeu_ps(ptrA,tA);
    _mm_storeu_ps(ptrA+4,tB);
    _mm_store_ss(ptrA+8,tC);
}

static gmx_inline void
gmx_mm_increment_4rvec_1ptr_noswizzle_ps(float *ptrA,
                                         __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
{
    __m128 t1,t2,t3,t4,t5;
    __m128 tA,tB,tC;

    tA = _mm_loadu_ps(ptrA);
    tB = _mm_loadu_ps(ptrA+4);
    tC = _mm_loadu_ps(ptrA+8);

    t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 -  z2 y2 */
    t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 -  z3 */

    t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
    t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0));   /* x2 z1 y1 x1 */

    t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0));     /* y3 x3 z2 y2 */
    t5 = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 -  */

    t2 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(1,1,0,0));     /* x4 x4 z3 z3 */
    t5 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(3,2,2,0));     /* z4 y4 x4 z3 */

    tA = _mm_add_ps(tA,t3);
    tB = _mm_add_ps(tB,t4);
    tC = _mm_add_ps(tC,t5);

    _mm_storeu_ps(ptrA,tA);
    _mm_storeu_ps(ptrA+4,tB);
    _mm_storeu_ps(ptrA+8,tC);
}

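
/* Usage sketch (illustrative only; helper and names are hypothetical): add
 * the accumulated force on one 4-site j water back to memory. fxyz1..fxyz4
 * hold xyz-packed forces for the four sites, and the water's first atom
 * starts at f+3*j in the xyz-packed force array.
 */
static gmx_inline void
gmx_mm_example_increment_water_force(float *f, int j,
                                     __m128 fxyz1, __m128 fxyz2,
                                     __m128 fxyz3, __m128 fxyz4)
{
    gmx_mm_increment_4rvec_1ptr_noswizzle_ps(f+3*j,fxyz1,fxyz2,fxyz3,fxyz4);
}
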
static gmx_inline void
gmx_mm_increment_1rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;

    t5  = _mm_unpacklo_ps(y1,z1);
    t6  = _mm_unpackhi_ps(y1,z1);
    t7  = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
    t8  = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
    t9  = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
    t10 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
    t1  = _mm_load_ss(ptrA);
    t1  = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
    t1  = _mm_add_ps(t1,t7);
    _mm_store_ss(ptrA,t1);
    _mm_storeh_pi((__m64 *)(ptrA+1),t1);
    t2  = _mm_load_ss(ptrB);
    t2  = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
    t2  = _mm_add_ps(t2,t8);
    _mm_store_ss(ptrB,t2);
    _mm_storeh_pi((__m64 *)(ptrB+1),t2);
    t3  = _mm_load_ss(ptrC);
    t3  = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
    t3  = _mm_add_ps(t3,t9);
    _mm_store_ss(ptrC,t3);
    _mm_storeh_pi((__m64 *)(ptrC+1),t3);
    t4  = _mm_load_ss(ptrD);
    t4  = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
    t4  = _mm_add_ps(t4,t10);
    _mm_store_ss(ptrD,t4);
    _mm_storeh_pi((__m64 *)(ptrD+1),t4);
}

static gmx_inline void
gmx_mm_increment_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
    __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
    __m128 t20,t21,t22,t23,t24,t25;

    t13 = _mm_unpackhi_ps(x1,y1);
    x1  = _mm_unpacklo_ps(x1,y1);
    t14 = _mm_unpackhi_ps(z1,x2);
    z1  = _mm_unpacklo_ps(z1,x2);
    t15 = _mm_unpackhi_ps(y2,z2);
    y2  = _mm_unpacklo_ps(y2,z2);
    t16 = _mm_unpackhi_ps(x3,y3);
    x3  = _mm_unpacklo_ps(x3,y3);
    t17 = _mm_shuffle_ps(z3,z3,_MM_SHUFFLE(0,0,0,1));
    t18 = _mm_movehl_ps(z3,z3);
    t19 = _mm_shuffle_ps(t18,t18,_MM_SHUFFLE(0,0,0,1));
    t20 = _mm_movelh_ps(x1,z1);
    t21 = _mm_movehl_ps(z1,x1);
    t22 = _mm_movelh_ps(t13,t14);
    t14 = _mm_movehl_ps(t14,t13);
    t23 = _mm_movelh_ps(y2,x3);
    t24 = _mm_movehl_ps(x3,y2);
    t25 = _mm_movelh_ps(t15,t16);
    t16 = _mm_movehl_ps(t16,t15);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_load_ss(ptrA+8);
    t1  = _mm_add_ps(t1,t20);
    t2  = _mm_add_ps(t2,t23);
    t3  = _mm_add_ss(t3,z3);
    _mm_storeu_ps(ptrA,t1);
    _mm_storeu_ps(ptrA+4,t2);
    _mm_store_ss(ptrA+8,t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_load_ss(ptrB+8);
    t4  = _mm_add_ps(t4,t21);
    t5  = _mm_add_ps(t5,t24);
    t6  = _mm_add_ss(t6,t17);
    _mm_storeu_ps(ptrB,t4);
    _mm_storeu_ps(ptrB+4,t5);
    _mm_store_ss(ptrB+8,t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_load_ss(ptrC+8);
    t7  = _mm_add_ps(t7,t22);
    t8  = _mm_add_ps(t8,t25);
    t9  = _mm_add_ss(t9,t18);
    _mm_storeu_ps(ptrC,t7);
    _mm_storeu_ps(ptrC+4,t8);
    _mm_store_ss(ptrC+8,t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_load_ss(ptrD+8);
    t10 = _mm_add_ps(t10,t14);
    t11 = _mm_add_ps(t11,t16);
    t12 = _mm_add_ss(t12,t19);
    _mm_storeu_ps(ptrD,t10);
    _mm_storeu_ps(ptrD+4,t11);
    _mm_store_ss(ptrD+8,t12);
}

static gmx_inline void
gmx_mm_increment_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
    __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
    __m128 t23,t24;

    t13 = _mm_unpackhi_ps(x1,y1);
    x1  = _mm_unpacklo_ps(x1,y1);
    t14 = _mm_unpackhi_ps(z1,x2);
    z1  = _mm_unpacklo_ps(z1,x2);
    t15 = _mm_unpackhi_ps(y2,z2);
    y2  = _mm_unpacklo_ps(y2,z2);
    t16 = _mm_unpackhi_ps(x3,y3);
    x3  = _mm_unpacklo_ps(x3,y3);
    t17 = _mm_unpackhi_ps(z3,x4);
    z3  = _mm_unpacklo_ps(z3,x4);
    t18 = _mm_unpackhi_ps(y4,z4);
    y4  = _mm_unpacklo_ps(y4,z4);
    t19 = _mm_movelh_ps(x1,z1);
    z1  = _mm_movehl_ps(z1,x1);
    t20 = _mm_movelh_ps(t13,t14);
    t14 = _mm_movehl_ps(t14,t13);
    t21 = _mm_movelh_ps(y2,x3);
    x3  = _mm_movehl_ps(x3,y2);
    t22 = _mm_movelh_ps(t15,t16);
    t16 = _mm_movehl_ps(t16,t15);
    t23 = _mm_movelh_ps(z3,y4);
    y4  = _mm_movehl_ps(y4,z3);
    t24 = _mm_movelh_ps(t17,t18);
    t18 = _mm_movehl_ps(t18,t17);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_loadu_ps(ptrA+8);
    t1  = _mm_add_ps(t1,t19);
    t2  = _mm_add_ps(t2,t21);
    t3  = _mm_add_ps(t3,t23);
    _mm_storeu_ps(ptrA,t1);
    _mm_storeu_ps(ptrA+4,t2);
    _mm_storeu_ps(ptrA+8,t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_loadu_ps(ptrB+8);
    t4  = _mm_add_ps(t4,z1);
    t5  = _mm_add_ps(t5,x3);
    t6  = _mm_add_ps(t6,y4);
    _mm_storeu_ps(ptrB,t4);
    _mm_storeu_ps(ptrB+4,t5);
    _mm_storeu_ps(ptrB+8,t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_loadu_ps(ptrC+8);
    t7  = _mm_add_ps(t7,t20);
    t8  = _mm_add_ps(t8,t22);
    t9  = _mm_add_ps(t9,t24);
    _mm_storeu_ps(ptrC,t7);
    _mm_storeu_ps(ptrC+4,t8);
    _mm_storeu_ps(ptrC+8,t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_loadu_ps(ptrD+8);
    t10 = _mm_add_ps(t10,t14);
    t11 = _mm_add_ps(t11,t16);
    t12 = _mm_add_ps(t12,t18);
    _mm_storeu_ps(ptrD,t10);
    _mm_storeu_ps(ptrD+4,t11);
    _mm_storeu_ps(ptrD+8,t12);
}

/* Routines to decrement rvec in memory */
static gmx_inline void
gmx_mm_decrement_1rvec_1ptr_noswizzle_ps(float *ptrA, __m128 xyz)
{
    __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
    __m128 t1;

    t1  = _mm_loadu_ps(ptrA);
    /* mask off the fourth element so only x/y/z are modified */
    xyz = _mm_and_ps(mask,xyz);
    t1  = _mm_sub_ps(t1,xyz);
    _mm_storeu_ps(ptrA,t1);
}

static gmx_inline void
gmx_mm_decrement_3rvec_1ptr_noswizzle_ps(float *ptrA,
                                         __m128 xyz1, __m128 xyz2, __m128 xyz3)
{
    __m128 t1,t2,t3,t4;
    __m128 tA,tB,tC;

    tA = _mm_loadu_ps(ptrA);
    tB = _mm_loadu_ps(ptrA+4);
    tC = _mm_load_ss(ptrA+8);

    t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 -  z2 y2 */
    t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 -  z3 */

    t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
    t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0));   /* x2 z1 y1 x1 */

    t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0));     /* y3 x3 z2 y2 */

    tA = _mm_sub_ps(tA,t3);
    tB = _mm_sub_ps(tB,t4);
    tC = _mm_sub_ss(tC,t2);

    _mm_storeu_ps(ptrA,tA);
    _mm_storeu_ps(ptrA+4,tB);
    _mm_store_ss(ptrA+8,tC);
}

static gmx_inline void
gmx_mm_decrement_4rvec_1ptr_noswizzle_ps(float *ptrA,
                                         __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
{
    __m128 t1,t2,t3,t4,t5;
    __m128 tA,tB,tC;

    tA = _mm_loadu_ps(ptrA);
    tB = _mm_loadu_ps(ptrA+4);
    tC = _mm_loadu_ps(ptrA+8);

    t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 -  z2 y2 */
    t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 -  z3 */

    t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3));   /* z1 z1 x2 x2 */
    t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0));   /* x2 z1 y1 x1 */

    t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0));     /* y3 x3 z2 y2 */

    t5 = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 -  */
    t2 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(1,1,0,0));     /* x4 x4 z3 z3 */
    t5 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(3,2,2,0));     /* z4 y4 x4 z3 */

    tA = _mm_sub_ps(tA,t3);
    tB = _mm_sub_ps(tB,t4);
    tC = _mm_sub_ps(tC,t5);

    _mm_storeu_ps(ptrA,tA);
    _mm_storeu_ps(ptrA+4,tB);
    _mm_storeu_ps(ptrA+8,tC);
}

static gmx_inline void
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
                                       float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC,
                                       float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;

    t5  = _mm_unpacklo_ps(y1,z1);
    t6  = _mm_unpackhi_ps(y1,z1);
    t7  = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
    t8  = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
    t9  = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
    t10 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
    t1  = _mm_load_ss(ptrA);
    t1  = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
    t1  = _mm_sub_ps(t1,t7);
    _mm_store_ss(ptrA,t1);
    _mm_storeh_pi((__m64 *)(ptrA+1),t1);
    t2  = _mm_load_ss(ptrB);
    t2  = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
    t2  = _mm_sub_ps(t2,t8);
    _mm_store_ss(ptrB,t2);
    _mm_storeh_pi((__m64 *)(ptrB+1),t2);
    t3  = _mm_load_ss(ptrC);
    t3  = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
    t3  = _mm_sub_ps(t3,t9);
    _mm_store_ss(ptrC,t3);
    _mm_storeh_pi((__m64 *)(ptrC+1),t3);
    t4  = _mm_load_ss(ptrD);
    t4  = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
    t4  = _mm_sub_ps(t4,t10);
    _mm_store_ss(ptrD,t4);
    _mm_storeh_pi((__m64 *)(ptrD+1),t4);
}

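
/* Usage sketch (illustrative only; helper and names are hypothetical):
 * Newton's-third-law update of four j atom forces. fjx/fjy/fjz hold the pair
 * forces acting on the j atoms, one per lane, and jA..jD index an xyz-packed
 * force array.
 */
static gmx_inline void
gmx_mm_example_store_jforces(float *f, int jA, int jB, int jC, int jD,
                             __m128 fjx, __m128 fjy, __m128 fjz)
{
    gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+3*jA,f+3*jB,f+3*jC,f+3*jD,
                                           fjx,fjy,fjz);
}
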
static gmx_inline void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
    __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
    __m128 t20,t21,t22,t23,t24,t25;

    t13 = _mm_unpackhi_ps(x1,y1);
    x1  = _mm_unpacklo_ps(x1,y1);
    t14 = _mm_unpackhi_ps(z1,x2);
    z1  = _mm_unpacklo_ps(z1,x2);
    t15 = _mm_unpackhi_ps(y2,z2);
    y2  = _mm_unpacklo_ps(y2,z2);
    t16 = _mm_unpackhi_ps(x3,y3);
    x3  = _mm_unpacklo_ps(x3,y3);
    t17 = _mm_shuffle_ps(z3,z3,_MM_SHUFFLE(0,0,0,1));
    t18 = _mm_movehl_ps(z3,z3);
    t19 = _mm_shuffle_ps(t18,t18,_MM_SHUFFLE(0,0,0,1));
    t20 = _mm_movelh_ps(x1,z1);
    t21 = _mm_movehl_ps(z1,x1);
    t22 = _mm_movelh_ps(t13,t14);
    t14 = _mm_movehl_ps(t14,t13);
    t23 = _mm_movelh_ps(y2,x3);
    t24 = _mm_movehl_ps(x3,y2);
    t25 = _mm_movelh_ps(t15,t16);
    t16 = _mm_movehl_ps(t16,t15);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_load_ss(ptrA+8);
    t1  = _mm_sub_ps(t1,t20);
    t2  = _mm_sub_ps(t2,t23);
    t3  = _mm_sub_ss(t3,z3);
    _mm_storeu_ps(ptrA,t1);
    _mm_storeu_ps(ptrA+4,t2);
    _mm_store_ss(ptrA+8,t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_load_ss(ptrB+8);
    t4  = _mm_sub_ps(t4,t21);
    t5  = _mm_sub_ps(t5,t24);
    t6  = _mm_sub_ss(t6,t17);
    _mm_storeu_ps(ptrB,t4);
    _mm_storeu_ps(ptrB+4,t5);
    _mm_store_ss(ptrB+8,t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_load_ss(ptrC+8);
    t7  = _mm_sub_ps(t7,t22);
    t8  = _mm_sub_ps(t8,t25);
    t9  = _mm_sub_ss(t9,t18);
    _mm_storeu_ps(ptrC,t7);
    _mm_storeu_ps(ptrC+4,t8);
    _mm_store_ss(ptrC+8,t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_load_ss(ptrD+8);
    t10 = _mm_sub_ps(t10,t14);
    t11 = _mm_sub_ps(t11,t16);
    t12 = _mm_sub_ss(t12,t19);
    _mm_storeu_ps(ptrD,t10);
    _mm_storeu_ps(ptrD+4,t11);
    _mm_store_ss(ptrD+8,t12);
}

static gmx_inline void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
    __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
    __m128 t23,t24;

    t13 = _mm_unpackhi_ps(x1,y1);
    x1  = _mm_unpacklo_ps(x1,y1);
    t14 = _mm_unpackhi_ps(z1,x2);
    z1  = _mm_unpacklo_ps(z1,x2);
    t15 = _mm_unpackhi_ps(y2,z2);
    y2  = _mm_unpacklo_ps(y2,z2);
    t16 = _mm_unpackhi_ps(x3,y3);
    x3  = _mm_unpacklo_ps(x3,y3);
    t17 = _mm_unpackhi_ps(z3,x4);
    z3  = _mm_unpacklo_ps(z3,x4);
    t18 = _mm_unpackhi_ps(y4,z4);
    y4  = _mm_unpacklo_ps(y4,z4);
    t19 = _mm_movelh_ps(x1,z1);
    z1  = _mm_movehl_ps(z1,x1);
    t20 = _mm_movelh_ps(t13,t14);
    t14 = _mm_movehl_ps(t14,t13);
    t21 = _mm_movelh_ps(y2,x3);
    x3  = _mm_movehl_ps(x3,y2);
    t22 = _mm_movelh_ps(t15,t16);
    t16 = _mm_movehl_ps(t16,t15);
    t23 = _mm_movelh_ps(z3,y4);
    y4  = _mm_movehl_ps(y4,z3);
    t24 = _mm_movelh_ps(t17,t18);
    t18 = _mm_movehl_ps(t18,t17);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_loadu_ps(ptrA+8);
    t1  = _mm_sub_ps(t1,t19);
    t2  = _mm_sub_ps(t2,t21);
    t3  = _mm_sub_ps(t3,t23);
    _mm_storeu_ps(ptrA,t1);
    _mm_storeu_ps(ptrA+4,t2);
    _mm_storeu_ps(ptrA+8,t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_loadu_ps(ptrB+8);
    t4  = _mm_sub_ps(t4,z1);
    t5  = _mm_sub_ps(t5,x3);
    t6  = _mm_sub_ps(t6,y4);
    _mm_storeu_ps(ptrB,t4);
    _mm_storeu_ps(ptrB+4,t5);
    _mm_storeu_ps(ptrB+8,t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_loadu_ps(ptrC+8);
    t7  = _mm_sub_ps(t7,t20);
    t8  = _mm_sub_ps(t8,t22);
    t9  = _mm_sub_ps(t9,t24);
    _mm_storeu_ps(ptrC,t7);
    _mm_storeu_ps(ptrC+4,t8);
    _mm_storeu_ps(ptrC+8,t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_loadu_ps(ptrD+8);
    t10 = _mm_sub_ps(t10,t14);
    t11 = _mm_sub_ps(t11,t16);
    t12 = _mm_sub_ps(t12,t18);
    _mm_storeu_ps(ptrD,t10);
    _mm_storeu_ps(ptrD+4,t11);
    _mm_storeu_ps(ptrD+8,t12);
}

static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1,t2,t3;

    /* transpose data; t1 is only a scratch lane, so zero it to keep the
     * transpose reading defined data */
    t1   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(fix1,t1,fiy1,fiz1);
    fix1 = _mm_add_ps(_mm_add_ps(fix1,t1), _mm_add_ps(fiy1,fiz1));

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2,fix1);
    t3 = _mm_add_ps(t3,fix1);

    _mm_store_ss(fptr,t2);
    _mm_storeh_pi((__m64 *)(fptr+1),t2);
    _mm_store_ss(fshiftptr,t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}

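
/* Usage sketch (illustrative only; helper and names are hypothetical): at the
 * end of a kernel's j loop, fix1/fiy1/fiz1 hold partial i forces in each SIMD
 * lane; fold them and add the result to the i atom force and the shift force.
 */
static gmx_inline void
gmx_mm_example_update_iforce(__m128 fix1, __m128 fiy1, __m128 fiz1,
                             float *f, int ii, float *fshift, int is)
{
    gmx_mm_update_iforce_1atom_swizzle_ps(fix1,fiy1,fiz1,f+3*ii,fshift+3*is);
}
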
static gmx_inline void
gmx_mm_update_iforce_2atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1,t2,t4;

    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    t1 = _mm_unpacklo_ps(fiy2,fiz2);
    t2 = _mm_unpackhi_ps(fiy2,fiz2);

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    t1   = _mm_add_ps(t1,t2);
    t2   = _mm_movehl_ps(t2,t1);
    fiy2 = _mm_add_ps(t1,t2);

    _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
    t1 = _mm_loadl_pi(t1,(__m64 *)(fptr+4));
    _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2));   /* fiy2 -    fix2 fiz1 */
    t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));       /* fiy2 fix2 -    fiz1 */
    t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1));   /* fiy1 fix1 -    fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t1 = _mm_add_ps(t1,t4); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}

static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1,t2,t3,t4;

    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
    t2 = _mm_movehl_ps(_mm_setzero_ps(),fiz3);
    t1 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));
    t3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,1));

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
    fiz3 = _mm_add_ss(_mm_add_ps(fiz3,t1) ,  _mm_add_ps(t2,t3));

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1 -    fiz3 */
    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3 -    fiz1 */
    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_shuffle_ps(t3  ,t3  ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2 -    fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}

static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1,t2,t3,t4,t5;

    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
    _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
    fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));
    t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));
    t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));
    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
    t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3);
    t5 = _mm_add_ps(t5,t1);

    _mm_store_ss(fshiftptr+2,t5);
    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
}

static gmx_inline void
gmx_mm_update_1pot_ps(__m128 pot1, float *ptrA)
{
    pot1 = _mm_add_ps(pot1,_mm_movehl_ps(_mm_setzero_ps(),pot1));
    pot1 = _mm_add_ps(pot1,_mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)));
    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
}

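
/* Usage sketch (illustrative only; helper and names are hypothetical): fold
 * the four per-lane energy accumulators into the scalar for one energy group
 * at the end of a kernel.
 */
static gmx_inline void
gmx_mm_example_update_energy(__m128 vctot, float *Vc, int ggid)
{
    gmx_mm_update_1pot_ps(vctot,Vc+ggid);
}
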
static gmx_inline void
gmx_mm_update_2pot_ps(__m128 pot1, float *ptrA,
                      __m128 pot2, float *ptrB)
{
    __m128 t1,t2;

    t1 = _mm_movehl_ps(pot2,pot1);
    t2 = _mm_movelh_ps(pot1,pot2);
    t1 = _mm_add_ps(t1,t2);
    t2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
    pot1 = _mm_add_ps(t1,t2);
    pot2 = _mm_movehl_ps(t2,pot1);
    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
}

static gmx_inline void
gmx_mm_update_4pot_ps(__m128 pot1, float *ptrA,
                      __m128 pot2, float *ptrB,
                      __m128 pot3, float *ptrC,
                      __m128 pot4, float *ptrD)
{
    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
    _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
    _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
}

#endif /* _kernelutil_x86_sse2_single_h_ */