/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_sse4_1_single_h_
#define _kernelutil_x86_sse4_1_single_h_
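
/* In the full GROMACS tree the configuration and SIMD support headers are
 * included at this point; they provide gmx_inline, gmx_simdcall and
 * gmx_restrict. The SSE4.1 (and earlier) intrinsics used below are declared
 * in the compiler header included here.
 */
#include <smmintrin.h>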

#define gmx_mm_castsi128_ps  _mm_castsi128_ps
#define gmx_mm_extract_epi32 _mm_extract_epi32

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3) _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))

static gmx_inline __m128 gmx_simdcall
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy) ), _mm_mul_ps(dz, dz) );
}

static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a, b));
}

/* Load a single value from 1-4 places, merge into xmm register */

static gmx_inline __m128 gmx_simdcall
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    __m128 t1, t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA), _mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB), _mm_load_ss(ptrD));
    return _mm_unpacklo_ps(t1, t2);
}
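
/* Scatter the four lanes of xmm1 back to four separate scalar addresses
 * (the inverse of the swizzled load above).
 */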
static gmx_inline void gmx_simdcall
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD,
                              __m128 xmm1)
{
    __m128 t2, t3, t4;

    t3 = _mm_movehl_ps(_mm_setzero_ps(), xmm1);
    t2 = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(1, 1, 1, 1));
    t4 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(1, 1, 1, 1));
    _mm_store_ss(ptrA, xmm1);
    _mm_store_ss(ptrB, t2);
    _mm_store_ss(ptrC, t3);
    _mm_store_ss(ptrD, t4);
}

/* Similar to store, but increments value in memory */
static gmx_inline void gmx_simdcall
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 tmp;

    tmp = gmx_mm_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
    tmp = _mm_add_ps(tmp, xmm1);
    gmx_mm_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, tmp);
}
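
/* Load two consecutive floats (e.g. a c6/c12 Lennard-Jones parameter pair)
 * from each of four addresses and transpose them so that *c6 and *c12 each
 * hold one value per SIMD lane.
 */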
static gmx_inline void gmx_simdcall
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict c6,
                             __m128 * gmx_restrict c12)
{
    __m128 t1, t2, t3, t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p1); /* - - c12a c6a */
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p2); /* - - c12b c6b */
    t3   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p3); /* - - c12c c6c */
    t4   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p4); /* - - c12d c6d */
    t1   = _mm_unpacklo_ps(t1, t2);
    t2   = _mm_unpacklo_ps(t3, t4);
    *c6  = _mm_movelh_ps(t1, t2);
    *c12 = _mm_movehl_ps(t2, t1);
}
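
/* Add the shift vector xyz_shift to a single xyz coordinate and broadcast
 * each shifted component to all four SIMD lanes.
 */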
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1,
                                         __m128 * gmx_restrict y1,
                                         __m128 * gmx_restrict z1)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    t2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz);
    t3 = _mm_load_ss(xyz_shift+2);
    t4 = _mm_load_ss(xyz+2);
    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ss(t3, t4);

    *x1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 0, 0));
}
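
/* As above, but for three interaction sites stored consecutively in memory
 * as x1 y1 z1 x2 ... z3 (e.g. a three-site water molecule).
 */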
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_load_ss(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ss(t3, t6);

    *x1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 0, 0));
}
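
/* As above, but for four interaction sites (x1 y1 z1 ... z4, twelve floats). */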
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_loadu_ps(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_shuffle_ps(tA, tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ps(t3, t6);

    *x1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 0, 0));
    *x4 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(1, 1, 1, 1));
    *y4 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2, 2, 2, 2));
    *z4 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3, 3, 3, 3));
}
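
/* Load x, y and z of one site from four different particles and transpose
 * them into one SIMD register per component.
 */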
static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                  const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC,
                                  const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1,
                                  __m128 * gmx_restrict y1,
                                  __m128 * gmx_restrict z1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8;
    t1  = _mm_castpd_ps(_mm_load_sd((const double *)ptrA));
    t2  = _mm_castpd_ps(_mm_load_sd((const double *)ptrB));
    t3  = _mm_castpd_ps(_mm_load_sd((const double *)ptrC));
    t4  = _mm_castpd_ps(_mm_load_sd((const double *)ptrD));
    t5  = _mm_load_ss(ptrA+2);
    t6  = _mm_load_ss(ptrB+2);
    t7  = _mm_load_ss(ptrC+2);
    t8  = _mm_load_ss(ptrD+2);
    t1  = _mm_unpacklo_ps(t1, t2);
    t3  = _mm_unpacklo_ps(t3, t4);
    *x1 = _mm_movelh_ps(t1, t3);
    *y1 = _mm_movehl_ps(t3, t1);
    t5  = _mm_unpacklo_ps(t5, t6);
    t7  = _mm_unpacklo_ps(t7, t8);
    *z1 = _mm_movelh_ps(t5, t7);
}
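
/* Load three consecutive xyz triplets (nine floats) from each of four
 * particles; the 4x4 transposes convert the xyz layout in memory into one
 * SIMD register per component.
 */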
static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                  const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC,
                                  const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 t1, t2, t3, t4;
    t1  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)ptrA ) );
    t2  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)ptrB ) );
    t3  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)ptrC ) );
    t4  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)ptrD ) );
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrA+4) ) );
    t2  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrB+4) ) );
    t3  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrC+4) ) );
    t4  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrD+4) ) );
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1  = _mm_load_ss(ptrA+8);
    t2  = _mm_load_ss(ptrB+8);
    t3  = _mm_load_ss(ptrC+8);
    t4  = _mm_load_ss(ptrD+8);
    t1  = _mm_unpacklo_ps(t1, t3);
    t3  = _mm_unpacklo_ps(t2, t4);
    *z3 = _mm_unpacklo_ps(t1, t3);
}
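
/* As above, but loading four xyz triplets (twelve floats) per particle. */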
static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
                                  const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC,
                                  const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 t1, t2, t3, t4;
    t1  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrA) ) );
    t2  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrB) ) );
    t3  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrC) ) );
    t4  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrD) ) );
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrA+4) ) );
    t2  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrB+4) ) );
    t3  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrC+4) ) );
    t4  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrD+4) ) );
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrA+8) ) );
    t2  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrB+8) ) );
    t3  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrC+8) ) );
    t4  = gmx_mm_castsi128_ps( _mm_lddqu_si128( (void *)(ptrD+8) ) );
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *z3 = t1;
    *x4 = t2;
    *y4 = t3;
    *z4 = t4;
}
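
/* Subtract one xyz triplet per SIMD lane from the three floats stored at the
 * corresponding pointer (used to decrement force arrays).
 */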
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * ptrA,
                                       float * ptrB,
                                       float * ptrC,
                                       float * ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
    t5  = _mm_unpacklo_ps(y1, z1);
    t6  = _mm_unpackhi_ps(y1, z1);
    t7  = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(1, 0, 0, 0));
    t8  = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(3, 2, 0, 1));
    t9  = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(1, 0, 0, 2));
    t10 = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(3, 2, 0, 3));
    t1  = _mm_load_ss(ptrA);
    t1  = _mm_loadh_pi(t1, (__m64 *)(ptrA+1));
    t1  = _mm_sub_ps(t1, t7);
    _mm_store_ss(ptrA, t1);
    _mm_storeh_pi((__m64 *)(ptrA+1), t1);
    t2  = _mm_load_ss(ptrB);
    t2  = _mm_loadh_pi(t2, (__m64 *)(ptrB+1));
    t2  = _mm_sub_ps(t2, t8);
    _mm_store_ss(ptrB, t2);
    _mm_storeh_pi((__m64 *)(ptrB+1), t2);
    t3  = _mm_load_ss(ptrC);
    t3  = _mm_loadh_pi(t3, (__m64 *)(ptrC+1));
    t3  = _mm_sub_ps(t3, t9);
    _mm_store_ss(ptrC, t3);
    _mm_storeh_pi((__m64 *)(ptrC+1), t3);
    t4  = _mm_load_ss(ptrD);
    t4  = _mm_loadh_pi(t4, (__m64 *)(ptrD+1));
    t4  = _mm_sub_ps(t4, t10);
    _mm_store_ss(ptrD, t4);
    _mm_storeh_pi((__m64 *)(ptrD+1), t4);
}
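
/* Transpose three xyz triplets of per-lane values back to the memory layout
 * and subtract them from the nine floats stored at each of the four pointers.
 */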
static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128 t11, t12, t13, t14, t15, t16, t17, t18, t19;
    __m128 t20, t21, t22, t23, t24, t25;

    t13 = _mm_unpackhi_ps(x1, y1);
    x1  = _mm_unpacklo_ps(x1, y1);
    t14 = _mm_unpackhi_ps(z1, x2);
    z1  = _mm_unpacklo_ps(z1, x2);
    t15 = _mm_unpackhi_ps(y2, z2);
    y2  = _mm_unpacklo_ps(y2, z2);
    t16 = _mm_unpackhi_ps(x3, y3);
    x3  = _mm_unpacklo_ps(x3, y3);
    t17 = _mm_shuffle_ps(z3, z3, _MM_SHUFFLE(0, 0, 0, 1));
    t18 = _mm_movehl_ps(z3, z3);
    t19 = _mm_shuffle_ps(t18, t18, _MM_SHUFFLE(0, 0, 0, 1));
    t20 = _mm_movelh_ps(x1, z1);
    t21 = _mm_movehl_ps(z1, x1);
    t22 = _mm_movelh_ps(t13, t14);
    t14 = _mm_movehl_ps(t14, t13);
    t23 = _mm_movelh_ps(y2, x3);
    t24 = _mm_movehl_ps(x3, y2);
    t25 = _mm_movelh_ps(t15, t16);
    t16 = _mm_movehl_ps(t16, t15);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_load_ss(ptrA+8);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_load_ss(ptrB+8);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_load_ss(ptrC+8);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_load_ss(ptrD+8);

    t1  = _mm_sub_ps(t1, t20);
    t2  = _mm_sub_ps(t2, t23);
    t3  = _mm_sub_ss(t3, z3);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_store_ss(ptrA+8, t3);
    t4  = _mm_sub_ps(t4, t21);
    t5  = _mm_sub_ps(t5, t24);
    t6  = _mm_sub_ss(t6, t17);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_store_ss(ptrB+8, t6);
    t7  = _mm_sub_ps(t7, t22);
    t8  = _mm_sub_ps(t8, t25);
    t9  = _mm_sub_ss(t9, t18);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_store_ss(ptrC+8, t9);
    t10 = _mm_sub_ps(t10, t14);
    t11 = _mm_sub_ps(t11, t16);
    t12 = _mm_sub_ss(t12, t19);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_store_ss(ptrD+8, t12);
}
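
/* As above, but for four xyz triplets (twelve floats) per pointer. */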
static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
    __m128 t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22;
    __m128 t23, t24;

    t13 = _mm_unpackhi_ps(x1, y1);
    x1  = _mm_unpacklo_ps(x1, y1);
    t14 = _mm_unpackhi_ps(z1, x2);
    z1  = _mm_unpacklo_ps(z1, x2);
    t15 = _mm_unpackhi_ps(y2, z2);
    y2  = _mm_unpacklo_ps(y2, z2);
    t16 = _mm_unpackhi_ps(x3, y3);
    x3  = _mm_unpacklo_ps(x3, y3);
    t17 = _mm_unpackhi_ps(z3, x4);
    z3  = _mm_unpacklo_ps(z3, x4);
    t18 = _mm_unpackhi_ps(y4, z4);
    y4  = _mm_unpacklo_ps(y4, z4);
    t19 = _mm_movelh_ps(x1, z1);
    z1  = _mm_movehl_ps(z1, x1);
    t20 = _mm_movelh_ps(t13, t14);
    t14 = _mm_movehl_ps(t14, t13);
    t21 = _mm_movelh_ps(y2, x3);
    x3  = _mm_movehl_ps(x3, y2);
    t22 = _mm_movelh_ps(t15, t16);
    t16 = _mm_movehl_ps(t16, t15);
    t23 = _mm_movelh_ps(z3, y4);
    y4  = _mm_movehl_ps(y4, z3);
    t24 = _mm_movelh_ps(t17, t18);
    t18 = _mm_movehl_ps(t18, t17);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_loadu_ps(ptrA+8);
    t1  = _mm_sub_ps(t1, t19);
    t2  = _mm_sub_ps(t2, t21);
    t3  = _mm_sub_ps(t3, t23);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_storeu_ps(ptrA+8, t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_loadu_ps(ptrB+8);
    t4  = _mm_sub_ps(t4, z1);
    t5  = _mm_sub_ps(t5, x3);
    t6  = _mm_sub_ps(t6, y4);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_storeu_ps(ptrB+8, t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_loadu_ps(ptrC+8);
    t7  = _mm_sub_ps(t7, t20);
    t8  = _mm_sub_ps(t8, t22);
    t9  = _mm_sub_ps(t9, t24);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_storeu_ps(ptrC+8, t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_loadu_ps(ptrD+8);
    t10 = _mm_sub_ps(t10, t14);
    t11 = _mm_sub_ps(t11, t16);
    t12 = _mm_sub_ps(t12, t18);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_storeu_ps(ptrD+8, t12);
}
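
/* Reduce the i-particle forces across the four SIMD lanes with horizontal
 * adds and accumulate the summed x/y/z into the force array fptr and the
 * shift-force array fshiftptr.
 */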
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t2, t3;

    fix1 = _mm_hadd_ps(fix1, fix1);
    fiy1 = _mm_hadd_ps(fiy1, fiz1);

    fix1 = _mm_hadd_ps(fix1, fiy1); /* fiz1 fiy1 fix1 fix1 */

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2, (__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3, (__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2, fix1);
    t3 = _mm_add_ps(t3, fix1);

    _mm_store_ss(fptr, t2);
    _mm_storeh_pi((__m64 *)(fptr+1), t2);
    _mm_store_ss(fshiftptr, t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}
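
/* Same reduction for three i-particle sites; the hadd cascade leaves the nine
 * summed components packed consecutively in fix1, fiy2 and fiz3.
 */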
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4;

    fix1 = _mm_hadd_ps(fix1, fiy1);
    fiz1 = _mm_hadd_ps(fiz1, fix2);
    fiy2 = _mm_hadd_ps(fiy2, fiz2);
    fix3 = _mm_hadd_ps(fix3, fiy3);
    fiz3 = _mm_hadd_ps(fiz3, fiz3);

    fix1 = _mm_hadd_ps(fix1, fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2, fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3, fiz3); /* -    -    -    fiz3 */

    _mm_storeu_ps(fptr,   _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) ));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(1, 2, 0, 0));     /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3); /* y x - z */

    _mm_store_ss(fshiftptr+2, t1);
    _mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
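
/* Same reduction for four i-particle sites (twelve summed components). */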
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4, t5;

    fix1 = _mm_hadd_ps(fix1, fiy1);
    fiz1 = _mm_hadd_ps(fiz1, fix2);
    fiy2 = _mm_hadd_ps(fiy2, fiz2);
    fix3 = _mm_hadd_ps(fix3, fiy3);
    fiz3 = _mm_hadd_ps(fiz3, fix4);
    fiy4 = _mm_hadd_ps(fiy4, fiz4);

    fix1 = _mm_hadd_ps(fix1, fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2, fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3, fiy4); /* fiz4 fiy4 fix4 fiz3 */

    _mm_storeu_ps(fptr,   _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1, fix1, _MM_SHUFFLE(1, 0, 2, 2));
    t2 = _mm_shuffle_ps(fiy2, fiy2, _MM_SHUFFLE(3, 2, 1, 1));
    t3 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(2, 1, 0, 0));
    t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3));
    t4 = _mm_shuffle_ps(fiz3, t4, _MM_SHUFFLE(2, 0, 3, 3));

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3);
    t5 = _mm_add_ps(t5, t1);

    _mm_store_ss(fshiftptr+2, t5);
    _mm_storeh_pi((__m64 *)(fshiftptr), t5);
}
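
/* Reduce a potential accumulator across the four SIMD lanes and add the sum
 * to a single float in memory.
 */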
static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
    pot1 = _mm_add_ps(pot1, _mm_movehl_ps(_mm_setzero_ps(), pot1));
    pot1 = _mm_add_ps(pot1, _mm_shuffle_ps(pot1, pot1, _MM_SHUFFLE(0, 0, 0, 1)));
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}
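
/* As above, but reducing two accumulators at once and adding each sum to its
 * own memory location.
 */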
static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB)
{
    __m128 t1, t2;

    t1   = _mm_movehl_ps(pot2, pot1);
    t2   = _mm_movelh_ps(pot1, pot2);
    t1   = _mm_add_ps(t1, t2);
    t2   = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 1, 1));
    pot1 = _mm_add_ps(t1, t2);
    pot2 = _mm_movehl_ps(t2, pot1);
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
    _mm_store_ss(ptrB, _mm_add_ss(pot2, _mm_load_ss(ptrB)));
}

#endif /* _kernelutil_x86_sse4_1_single_h_ */