/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_sse2_single_h_
#define _kernelutil_x86_sse2_single_h_

/* We require SSE2 now! */

#include <math.h>

#include <emmintrin.h>

#define gmx_mm_castsi128_ps _mm_castsi128_ps

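/* Extract 32-bit element imm (0-3) of x: byte-shift it down to the low lane
 * and move it to a general-purpose register. SSE2 replacement for the
 * SSE4.1 intrinsic _mm_extract_epi32().
 */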
#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3) _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))

static gmx_inline __m128 gmx_simdcall
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy) ), _mm_mul_ps(dz, dz) );
}

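/* Return non-zero if any element of a is smaller than the corresponding element of b */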
static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a, b));
}

/* Load a single value from 1-4 places, merge into xmm register */

static gmx_inline __m128 gmx_simdcall
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    __m128 t1, t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA), _mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB), _mm_load_ss(ptrD));
    return _mm_unpacklo_ps(t1, t2);
}

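/* Scatter the four elements of xmm1 back to ptrA/ptrB/ptrC/ptrD (inverse of the load above) */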
static gmx_inline void gmx_simdcall
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD,
                              __m128 xmm1)
{
    __m128 t2, t3, t4;

    t3 = _mm_movehl_ps(_mm_setzero_ps(), xmm1);
    t2 = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(1, 1, 1, 1));
    t4 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(1, 1, 1, 1));
    _mm_store_ss(ptrA, xmm1);
    _mm_store_ss(ptrB, t2);
    _mm_store_ss(ptrC, t3);
    _mm_store_ss(ptrD, t4);
}

/* Similar to store, but increments value in memory */
static gmx_inline void gmx_simdcall
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 tmp;

    tmp = gmx_mm_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
    tmp = _mm_add_ps(tmp, xmm1);
    gmx_mm_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, tmp);
}

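/* Load a (c6, c12) parameter pair from each of four pointers and transpose,
 * so that *c6 and *c12 each hold the values for all four interactions.
 */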
static gmx_inline void gmx_simdcall
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict c6,
                             __m128 * gmx_restrict c12)
{
    __m128 t1, t2, t3, t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p1); /* - - c12a c6a */
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p2); /* - - c12b c6b */
    t3   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p3); /* - - c12c c6c */
    t4   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p4); /* - - c12d c6d */
    t1   = _mm_unpacklo_ps(t1, t2);
    t2   = _mm_unpacklo_ps(t3, t4);
    *c6  = _mm_movelh_ps(t1, t2);
    *c12 = _mm_movehl_ps(t2, t1);
}

/* Routines to load 1-4 rvec from 4 places.
 * We mainly use these to load coordinates. The extra routines
 * are very efficient for the water-water loops, since we e.g.
 * know that a TIP4P water has 4 atoms, so we should load 12 floats and shuffle.
 */
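/* Illustrative (hypothetical) usage from the outer loop of a 3-atom water
 * kernel; the variable names below are placeholders, not part of this header:
 *
 *     gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec + i_shift_offset,
 *                                              x + i_coord_offset,
 *                                              &ix1, &iy1, &iz1,
 *                                              &ix2, &iy2, &iz2,
 *                                              &ix3, &iy3, &iz3);
 */
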
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1,
                                         __m128 * gmx_restrict y1,
                                         __m128 * gmx_restrict z1)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    t2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz);
    t3 = _mm_load_ss(xyz_shift+2);
    t4 = _mm_load_ss(xyz+2);
    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ss(t3, t4);

    *x1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 0, 0));
}

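/* Add the shift vector to three rvecs (9 consecutive floats at xyz) and
 * broadcast each shifted component to all four SIMD lanes.
 */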
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_load_ss(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ss(t3, t6);

    *x1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 0, 0));
}

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_loadu_ps(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ps(t3, t6);

    *x1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 0, 0));
    *x4 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(1, 1, 1, 1));
    *y4 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2, 2, 2, 2));
    *z4 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3, 3, 3, 3));
}

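/* Load one rvec from each of four pointers and transpose, so that *x1, *y1
 * and *z1 each hold one coordinate component for all four particles.
 */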
static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                  const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC,
                                  const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1,
                                  __m128 * gmx_restrict y1,
                                  __m128 * gmx_restrict z1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8;

    t1  = _mm_castpd_ps(_mm_load_sd((const double *)ptrA));
    t2  = _mm_castpd_ps(_mm_load_sd((const double *)ptrB));
    t3  = _mm_castpd_ps(_mm_load_sd((const double *)ptrC));
    t4  = _mm_castpd_ps(_mm_load_sd((const double *)ptrD));
    t5  = _mm_load_ss(ptrA+2);
    t6  = _mm_load_ss(ptrB+2);
    t7  = _mm_load_ss(ptrC+2);
    t8  = _mm_load_ss(ptrD+2);
    t1  = _mm_unpacklo_ps(t1, t2);
    t3  = _mm_unpacklo_ps(t3, t4);
    *x1 = _mm_movelh_ps(t1, t3);
    *y1 = _mm_movehl_ps(t3, t1);
    t5  = _mm_unpacklo_ps(t5, t6);
    t7  = _mm_unpacklo_ps(t7, t8);
    *z1 = _mm_movelh_ps(t5, t7);
}

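/* Load three rvecs (9 floats) from each of four pointers and transpose into
 * nine registers, one per coordinate component.
 */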
static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                  const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC,
                                  const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 t1, t2, t3, t4;

    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrB);
    t3  = _mm_loadu_ps(ptrC);
    t4  = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1  = _mm_loadu_ps(ptrA+4);
    t2  = _mm_loadu_ps(ptrB+4);
    t3  = _mm_loadu_ps(ptrC+4);
    t4  = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1  = _mm_load_ss(ptrA+8);
    t2  = _mm_load_ss(ptrB+8);
    t3  = _mm_load_ss(ptrC+8);
    t4  = _mm_load_ss(ptrD+8);
    t1  = _mm_unpacklo_ps(t1, t3);
    t3  = _mm_unpacklo_ps(t2, t4);
    *z3 = _mm_unpacklo_ps(t1, t3);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                  const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC,
                                  const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 t1, t2, t3, t4;

    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrB);
    t3  = _mm_loadu_ps(ptrC);
    t4  = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1  = _mm_loadu_ps(ptrA+4);
    t2  = _mm_loadu_ps(ptrB+4);
    t3  = _mm_loadu_ps(ptrC+4);
    t4  = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1  = _mm_loadu_ps(ptrA+8);
    t2  = _mm_loadu_ps(ptrB+8);
    t3  = _mm_loadu_ps(ptrC+8);
    t4  = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *z3 = t1;
    *x4 = t2;
    *y4 = t3;
    *z4 = t4;
}

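/* Subtract one rvec per SIMD lane from the four pointers; the decrement
 * routines are used to spread accumulated j-particle forces back to memory.
 */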
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
                                       float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC,
                                       float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;

    t5  = _mm_unpacklo_ps(y1, z1);
    t6  = _mm_unpackhi_ps(y1, z1);
    t7  = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(1, 0, 0, 0));
    t8  = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(3, 2, 0, 1));
    t9  = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(1, 0, 0, 2));
    t10 = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(3, 2, 0, 3));
    t1  = _mm_load_ss(ptrA);
    t1  = _mm_loadh_pi(t1, (__m64 *)(ptrA+1));
    t1  = _mm_sub_ps(t1, t7);
    _mm_store_ss(ptrA, t1);
    _mm_storeh_pi((__m64 *)(ptrA+1), t1);
    t2  = _mm_load_ss(ptrB);
    t2  = _mm_loadh_pi(t2, (__m64 *)(ptrB+1));
    t2  = _mm_sub_ps(t2, t8);
    _mm_store_ss(ptrB, t2);
    _mm_storeh_pi((__m64 *)(ptrB+1), t2);
    t3  = _mm_load_ss(ptrC);
    t3  = _mm_loadh_pi(t3, (__m64 *)(ptrC+1));
    t3  = _mm_sub_ps(t3, t9);
    _mm_store_ss(ptrC, t3);
    _mm_storeh_pi((__m64 *)(ptrC+1), t3);
    t4  = _mm_load_ss(ptrD);
    t4  = _mm_loadh_pi(t4, (__m64 *)(ptrD+1));
    t4  = _mm_sub_ps(t4, t10);
    _mm_store_ss(ptrD, t4);
    _mm_storeh_pi((__m64 *)(ptrD+1), t4);
}

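/* As above, but subtract three rvecs per lane (nine force components) from
 * each of the four pointers.
 */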
static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128 t11, t12, t13, t14, t15, t16, t17, t18, t19;
    __m128 t20, t21, t22, t23, t24, t25;

    t13 = _mm_unpackhi_ps(x1, y1);
    x1  = _mm_unpacklo_ps(x1, y1);
    t14 = _mm_unpackhi_ps(z1, x2);
    z1  = _mm_unpacklo_ps(z1, x2);
    t15 = _mm_unpackhi_ps(y2, z2);
    y2  = _mm_unpacklo_ps(y2, z2);
    t16 = _mm_unpackhi_ps(x3, y3);
    x3  = _mm_unpacklo_ps(x3, y3);
    t17 = _mm_shuffle_ps(z3, z3, _MM_SHUFFLE(0, 0, 0, 1));
    t18 = _mm_movehl_ps(z3, z3);
    t19 = _mm_shuffle_ps(t18, t18, _MM_SHUFFLE(0, 0, 0, 1));
    t20 = _mm_movelh_ps(x1, z1);
    t21 = _mm_movehl_ps(z1, x1);
    t22 = _mm_movelh_ps(t13, t14);
    t14 = _mm_movehl_ps(t14, t13);
    t23 = _mm_movelh_ps(y2, x3);
    t24 = _mm_movehl_ps(x3, y2);
    t25 = _mm_movelh_ps(t15, t16);
    t16 = _mm_movehl_ps(t16, t15);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_load_ss(ptrA+8);
    t1  = _mm_sub_ps(t1, t20);
    t2  = _mm_sub_ps(t2, t23);
    t3  = _mm_sub_ss(t3, z3);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_store_ss(ptrA+8, t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_load_ss(ptrB+8);
    t4  = _mm_sub_ps(t4, t21);
    t5  = _mm_sub_ps(t5, t24);
    t6  = _mm_sub_ss(t6, t17);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_store_ss(ptrB+8, t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_load_ss(ptrC+8);
    t7  = _mm_sub_ps(t7, t22);
    t8  = _mm_sub_ps(t8, t25);
    t9  = _mm_sub_ss(t9, t18);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_store_ss(ptrC+8, t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_load_ss(ptrD+8);
    t10 = _mm_sub_ps(t10, t14);
    t11 = _mm_sub_ps(t11, t16);
    t12 = _mm_sub_ss(t12, t19);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_store_ss(ptrD+8, t12);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
    __m128 t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22;
    __m128 t23, t24;

    t13 = _mm_unpackhi_ps(x1, y1);
    x1  = _mm_unpacklo_ps(x1, y1);
    t14 = _mm_unpackhi_ps(z1, x2);
    z1  = _mm_unpacklo_ps(z1, x2);
    t15 = _mm_unpackhi_ps(y2, z2);
    y2  = _mm_unpacklo_ps(y2, z2);
    t16 = _mm_unpackhi_ps(x3, y3);
    x3  = _mm_unpacklo_ps(x3, y3);
    t17 = _mm_unpackhi_ps(z3, x4);
    z3  = _mm_unpacklo_ps(z3, x4);
    t18 = _mm_unpackhi_ps(y4, z4);
    y4  = _mm_unpacklo_ps(y4, z4);
    t19 = _mm_movelh_ps(x1, z1);
    z1  = _mm_movehl_ps(z1, x1);
    t20 = _mm_movelh_ps(t13, t14);
    t14 = _mm_movehl_ps(t14, t13);
    t21 = _mm_movelh_ps(y2, x3);
    x3  = _mm_movehl_ps(x3, y2);
    t22 = _mm_movelh_ps(t15, t16);
    t16 = _mm_movehl_ps(t16, t15);
    t23 = _mm_movelh_ps(z3, y4);
    y4  = _mm_movehl_ps(y4, z3);
    t24 = _mm_movelh_ps(t17, t18);
    t18 = _mm_movehl_ps(t18, t17);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_loadu_ps(ptrA+8);
    t1  = _mm_sub_ps(t1, t19);
    t2  = _mm_sub_ps(t2, t21);
    t3  = _mm_sub_ps(t3, t23);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_storeu_ps(ptrA+8, t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_loadu_ps(ptrB+8);
    t4  = _mm_sub_ps(t4, z1);
    t5  = _mm_sub_ps(t5, x3);
    t6  = _mm_sub_ps(t6, y4);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_storeu_ps(ptrB+8, t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_loadu_ps(ptrC+8);
    t7  = _mm_sub_ps(t7, t20);
    t8  = _mm_sub_ps(t8, t22);
    t9  = _mm_sub_ps(t9, t24);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_storeu_ps(ptrC+8, t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_loadu_ps(ptrD+8);
    t10 = _mm_sub_ps(t10, t14);
    t11 = _mm_sub_ps(t11, t16);
    t12 = _mm_sub_ps(t12, t18);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_storeu_ps(ptrD+8, t12);
}

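/* Reduce the four partial i-particle force accumulators fix1/fiy1/fiz1
 * across SIMD lanes and add the result to the force array at fptr and the
 * shift-force array at fshiftptr.
 */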
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3;

    /* transpose data */
    t1   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(fix1, t1, fiy1, fiz1);
    fix1 = _mm_add_ps(_mm_add_ps(fix1, t1), _mm_add_ps(fiy1, fiz1));

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2, (__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3, (__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2, fix1);
    t3 = _mm_add_ps(t3, fix1);

    _mm_store_ss(fptr, t2);
    _mm_storeh_pi((__m64 *)(fptr+1), t2);
    _mm_store_ss(fshiftptr, t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}

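/* Same reduction for three i-particles: nine consecutive force components
 * are accumulated at fptr, and their sum (one rvec) at fshiftptr.
 */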
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4;

    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2);
    _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3);
    t2 = _mm_movehl_ps(_mm_setzero_ps(), fiz3);
    t1 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(0, 0, 0, 1));
    t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 0, 0, 1));

    fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3));
    fiz3 = _mm_add_ss(_mm_add_ps(fiz3, t1), _mm_add_ps(t2, t3));

    _mm_storeu_ps(fptr,   _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) ));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(1, 2, 0, 0));     /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3); /* y x - z */

    _mm_store_ss(fshiftptr+2, t1);
    _mm_storeh_pi((__m64 *)(fshiftptr), t1);
}

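/* Same reduction for four i-particles (12 force components at fptr). */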
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4, t5;

    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2);
    _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3);
    _MM_TRANSPOSE4_PS(fiz3, fix4, fiy4, fiz4);

    fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3));
    fiz3 = _mm_add_ps(_mm_add_ps(fiz3, fix4), _mm_add_ps(fiy4, fiz4));

    _mm_storeu_ps(fptr,   _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1, fix1, _MM_SHUFFLE(1, 0, 2, 2));
    t2 = _mm_shuffle_ps(fiy2, fiy2, _MM_SHUFFLE(3, 2, 1, 1));
    t3 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(2, 1, 0, 0));
    t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3));
    t4 = _mm_shuffle_ps(fiz3, t4, _MM_SHUFFLE(2, 0, 3, 3));

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3);
    t5 = _mm_add_ps(t5, t1);

    _mm_store_ss(fshiftptr+2, t5);
    _mm_storeh_pi((__m64 *)(fshiftptr), t5);
}

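/* Horizontally sum the four elements of pot1 and add the result to the
 * energy at ptrA.
 */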
static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
    pot1 = _mm_add_ps(pot1, _mm_movehl_ps(_mm_setzero_ps(), pot1));
    pot1 = _mm_add_ps(pot1, _mm_shuffle_ps(pot1, pot1, _MM_SHUFFLE(0, 0, 0, 1)));
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}

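/* Horizontally sum two potentials at once, adding pot1 to *ptrA and pot2 to *ptrB */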
static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB)
{
    __m128 t1, t2;

    t1   = _mm_movehl_ps(pot2, pot1);
    t2   = _mm_movelh_ps(pot1, pot2);
    t1   = _mm_add_ps(t1, t2);
    t2   = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 1, 1));
    pot1 = _mm_add_ps(t1, t2);
    pot2 = _mm_movehl_ps(t2, pot1);
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
    _mm_store_ss(ptrB, _mm_add_ss(pot2, _mm_load_ss(ptrB)));
}

#endif /* _kernelutil_x86_sse2_single_h_ */