/*
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifndef _kernelutil_x86_avx_128_fma_single_h_
#define _kernelutil_x86_avx_128_fma_single_h_

#include <math.h>

#include "gmx_x86_avx_128_fma.h"

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0,t1,t2,t3)  _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
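
/* Illustrative sketch, not part of the original header: combining
 * gmx_mm_sum4_ps with the horizontal-add reduction idiom used later in this
 * file to collapse four unrolled accumulators into a single float. The
 * function name and accumulator names are hypothetical.
 */
static gmx_inline float
gmx_mm_example_sum4_reduce_ps(__m128 v0, __m128 v1, __m128 v2, __m128 v3)
{
    __m128 s = gmx_mm_sum4_ps(v0,v1,v2,v3); /* elementwise sum of the four registers */
    s = _mm_hadd_ps(s,s);                   /* partial horizontal reduction */
    s = _mm_hadd_ps(s,s);                   /* total now in every lane */
    return _mm_cvtss_f32(s);                /* extract lane 0 */
}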

static gmx_inline int
gmx_mm_any_lt(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a,b));
}

static gmx_inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_macc_ps(dx,dx,_mm_macc_ps(dy,dy,_mm_mul_ps(dz,dz)));
}
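
/* Usage sketch (an assumption, not part of the original API): the four
 * squared distances between one i atom (broadcast in ix/iy/iz) and four
 * j atoms (gathered in jx/jy/jz). All names here are hypothetical.
 */
static gmx_inline __m128
gmx_mm_example_rsq_ps(__m128 ix, __m128 iy, __m128 iz,
                      __m128 jx, __m128 jy, __m128 jz)
{
    __m128 dx = _mm_sub_ps(ix,jx);       /* four x-components of r_ij */
    __m128 dy = _mm_sub_ps(iy,jy);
    __m128 dz = _mm_sub_ps(iz,jz);
    return gmx_mm_calc_rsq_ps(dx,dy,dz); /* dx*dx+dy*dy+dz*dz via FMA */
}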

/* Load a single value from 1-4 places, merge into xmm register */

static gmx_inline __m128
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    __m128 t1,t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
    return _mm_unpacklo_ps(t1,t2);
}

static gmx_inline void
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 t2,t3,t4;

    t2 = _mm_permute_ps(xmm1,_MM_SHUFFLE(1,1,1,1));
    t3 = _mm_permute_ps(xmm1,_MM_SHUFFLE(2,2,2,2));
    t4 = _mm_permute_ps(xmm1,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(ptrA,xmm1);
    _mm_store_ss(ptrB,t2);
    _mm_store_ss(ptrC,t3);
    _mm_store_ss(ptrD,t4);
}

static gmx_inline void
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 tmp;

    tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
    tmp = _mm_add_ps(tmp,xmm1);
    gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
}

static gmx_inline void
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict c6, __m128 * gmx_restrict c12)
{
    __m128 t1,t2,t3,t4;

    t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);
    t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);
    t3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);
    t4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p4);
    t1 = _mm_unpacklo_ps(t1,t3);
    t2 = _mm_unpacklo_ps(t2,t4);
    *c6  = _mm_unpacklo_ps(t1,t2);
    *c12 = _mm_unpackhi_ps(t1,t2);
}
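
/* Usage sketch under an assumed parameter layout (consecutive {c6,c12}
 * pairs per entry); the vdwparam pointer, index arguments, and this helper
 * are hypothetical illustrations, not GROMACS API.
 */
static gmx_inline void
gmx_mm_example_load_c6c12_ps(const float * gmx_restrict vdwparam,
                             int jA, int jB, int jC, int jD,
                             __m128 * gmx_restrict c6, __m128 * gmx_restrict c12)
{
    /* Two floats per entry, so entry j starts at float offset 2*j. */
    gmx_mm_load_4pair_swizzle_ps(vdwparam+2*jA,vdwparam+2*jB,
                                 vdwparam+2*jC,vdwparam+2*jD,
                                 c6,c12);
}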

static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1,
                                         __m128 * gmx_restrict y1,
                                         __m128 * gmx_restrict z1)
{
    __m128 t1,t2,t3,t4;

    t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
    t3 = _mm_load_ss(xyz_shift+2);
    t4 = _mm_load_ss(xyz+2);
    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ss(t3,t4);

    *x1 = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
    *y1 = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    *z1 = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
}

static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 tA,tB;
    __m128 t1,t2,t3,t4,t5,t6;

    tA = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_load_ss(xyz+8);

    tA = _mm_movelh_ps(tA,tB);
    t4 = _mm_permute_ps(tA,_MM_SHUFFLE(0,2,1,0));
    t5 = _mm_permute_ps(tA,_MM_SHUFFLE(1,0,2,1));
    t6 = _mm_permute_ps(tA,_MM_SHUFFLE(2,1,0,2));

    t1 = _mm_add_ps(t1,t4);
    t2 = _mm_add_ps(t2,t5);
    t3 = _mm_add_ss(t3,t6);

    *x1 = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
    *y1 = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    *z1 = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
    *x2 = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
    *y2 = _mm_permute_ps(t2,_MM_SHUFFLE(0,0,0,0));
    *z2 = _mm_permute_ps(t2,_MM_SHUFFLE(1,1,1,1));
    *x3 = _mm_permute_ps(t2,_MM_SHUFFLE(2,2,2,2));
    *y3 = _mm_permute_ps(t2,_MM_SHUFFLE(3,3,3,3));
    *z3 = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
}

static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 tA,tB;
    __m128 t1,t2,t3,t4,t5,t6;

    tA = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_loadu_ps(xyz+8);

    tA = _mm_movelh_ps(tA,tB);
    t4 = _mm_permute_ps(tA,_MM_SHUFFLE(0,2,1,0));
    t5 = _mm_permute_ps(tA,_MM_SHUFFLE(1,0,2,1));
    t6 = _mm_permute_ps(tA,_MM_SHUFFLE(2,1,0,2));

    t1 = _mm_add_ps(t1,t4);
    t2 = _mm_add_ps(t2,t5);
    t3 = _mm_add_ps(t3,t6);

    *x1 = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
    *y1 = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    *z1 = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
    *x2 = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
    *y2 = _mm_permute_ps(t2,_MM_SHUFFLE(0,0,0,0));
    *z2 = _mm_permute_ps(t2,_MM_SHUFFLE(1,1,1,1));
    *x3 = _mm_permute_ps(t2,_MM_SHUFFLE(2,2,2,2));
    *y3 = _mm_permute_ps(t2,_MM_SHUFFLE(3,3,3,3));
    *z3 = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
    *x4 = _mm_permute_ps(t3,_MM_SHUFFLE(1,1,1,1));
    *y4 = _mm_permute_ps(t3,_MM_SHUFFLE(2,2,2,2));
    *z4 = _mm_permute_ps(t3,_MM_SHUFFLE(3,3,3,3));
}

static gmx_inline void
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1)
{
    __m128  t1,t2,t3,t4;
    __m128i mask = _mm_set_epi32(0,-1,-1,-1);

    t1 = _mm_maskload_ps(ptrA,mask);
    t2 = _mm_maskload_ps(ptrB,mask);
    t3 = _mm_maskload_ps(ptrC,mask);
    t4 = _mm_maskload_ps(ptrD,mask);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
}
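
/* Gather sketch under the usual packed-coordinate layout (three floats per
 * atom); the x array, jnr indices, and this helper name are hypothetical.
 */
static gmx_inline void
gmx_mm_example_gather_jxyz_ps(const float * gmx_restrict x,
                              int jnrA, int jnrB, int jnrC, int jnrD,
                              __m128 * gmx_restrict jx,
                              __m128 * gmx_restrict jy,
                              __m128 * gmx_restrict jz)
{
    /* Each pointer addresses one atom's {x,y,z} triplet; the swizzle-load
     * transposes the four triplets into per-component registers. */
    gmx_mm_load_1rvec_4ptr_swizzle_ps(x+3*jnrA,x+3*jnrB,x+3*jnrC,x+3*jnrD,
                                      jx,jy,jz);
}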

static gmx_inline void
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 t1,t2,t3,t4;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrB);
    t3 = _mm_loadu_ps(ptrC);
    t4 = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1 = _mm_loadu_ps(ptrA+4);
    t2 = _mm_loadu_ps(ptrB+4);
    t3 = _mm_loadu_ps(ptrC+4);
    t4 = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1 = _mm_load_ss(ptrA+8);
    t2 = _mm_load_ss(ptrB+8);
    t3 = _mm_load_ss(ptrC+8);
    t4 = _mm_load_ss(ptrD+8);
    t1 = _mm_unpacklo_ps(t1,t3);
    t3 = _mm_unpacklo_ps(t2,t4);
    *z3 = _mm_unpacklo_ps(t1,t3);
}

static gmx_inline void
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 t1,t2,t3,t4;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrB);
    t3 = _mm_loadu_ps(ptrC);
    t4 = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1 = _mm_loadu_ps(ptrA+4);
    t2 = _mm_loadu_ps(ptrB+4);
    t3 = _mm_loadu_ps(ptrC+4);
    t4 = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1 = _mm_loadu_ps(ptrA+8);
    t2 = _mm_loadu_ps(ptrB+8);
    t3 = _mm_loadu_ps(ptrC+8);
    t4 = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *z3 = t1;
    *x4 = t2;
    *y4 = t3;
    *z4 = t4;
}

static gmx_inline void
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;

    t5  = _mm_unpacklo_ps(y1,z1);
    t6  = _mm_unpackhi_ps(y1,z1);
    t7  = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
    t8  = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
    t9  = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
    t10 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
    t1  = _mm_load_ss(ptrA);
    t1  = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
    t1  = _mm_sub_ps(t1,t7);
    _mm_store_ss(ptrA,t1);
    _mm_storeh_pi((__m64 *)(ptrA+1),t1);
    t2  = _mm_load_ss(ptrB);
    t2  = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
    t2  = _mm_sub_ps(t2,t8);
    _mm_store_ss(ptrB,t2);
    _mm_storeh_pi((__m64 *)(ptrB+1),t2);
    t3  = _mm_load_ss(ptrC);
    t3  = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
    t3  = _mm_sub_ps(t3,t9);
    _mm_store_ss(ptrC,t3);
    _mm_storeh_pi((__m64 *)(ptrC+1),t3);
    t4  = _mm_load_ss(ptrD);
    t4  = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
    t4  = _mm_sub_ps(t4,t10);
    _mm_store_ss(ptrD,t4);
    _mm_storeh_pi((__m64 *)(ptrD+1),t4);
}
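
/* Sketch of a typical j-force update (hypothetical names throughout):
 * scale the distance components by the scalar force for each of the four
 * pairs, then subtract the results from the packed force array f.
 */
static gmx_inline void
gmx_mm_example_decrement_jforce_ps(float * gmx_restrict f,
                                   int jnrA, int jnrB, int jnrC, int jnrD,
                                   __m128 fscal, __m128 dx, __m128 dy, __m128 dz)
{
    __m128 tx = _mm_mul_ps(fscal,dx); /* force x-components for the four j atoms */
    __m128 ty = _mm_mul_ps(fscal,dy);
    __m128 tz = _mm_mul_ps(fscal,dz);

    gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+3*jnrA,f+3*jnrB,f+3*jnrC,f+3*jnrD,
                                           tx,ty,tz);
}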

static gmx_inline void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
    __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
    __m128 t20,t21,t22,t23,t24,t25;

    t13 = _mm_unpackhi_ps(x1,y1);
    x1  = _mm_unpacklo_ps(x1,y1);
    t14 = _mm_unpackhi_ps(z1,x2);
    z1  = _mm_unpacklo_ps(z1,x2);
    t15 = _mm_unpackhi_ps(y2,z2);
    y2  = _mm_unpacklo_ps(y2,z2);
    t16 = _mm_unpackhi_ps(x3,y3);
    x3  = _mm_unpacklo_ps(x3,y3);
    t17 = _mm_permute_ps(z3,_MM_SHUFFLE(0,0,0,1));
    t18 = _mm_movehl_ps(z3,z3);
    t19 = _mm_permute_ps(t18,_MM_SHUFFLE(0,0,0,1));
    t20 = _mm_movelh_ps(x1,z1);
    t21 = _mm_movehl_ps(z1,x1);
    t22 = _mm_movelh_ps(t13,t14);
    t14 = _mm_movehl_ps(t14,t13);
    t23 = _mm_movelh_ps(y2,x3);
    t24 = _mm_movehl_ps(x3,y2);
    t25 = _mm_movelh_ps(t15,t16);
    t16 = _mm_movehl_ps(t16,t15);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_load_ss(ptrA+8);
    t1  = _mm_sub_ps(t1,t20);
    t2  = _mm_sub_ps(t2,t23);
    t3  = _mm_sub_ss(t3,z3);
    _mm_storeu_ps(ptrA,t1);
    _mm_storeu_ps(ptrA+4,t2);
    _mm_store_ss(ptrA+8,t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_load_ss(ptrB+8);
    t4  = _mm_sub_ps(t4,t21);
    t5  = _mm_sub_ps(t5,t24);
    t6  = _mm_sub_ss(t6,t17);
    _mm_storeu_ps(ptrB,t4);
    _mm_storeu_ps(ptrB+4,t5);
    _mm_store_ss(ptrB+8,t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_load_ss(ptrC+8);
    t7  = _mm_sub_ps(t7,t22);
    t8  = _mm_sub_ps(t8,t25);
    t9  = _mm_sub_ss(t9,t18);
    _mm_storeu_ps(ptrC,t7);
    _mm_storeu_ps(ptrC+4,t8);
    _mm_store_ss(ptrC+8,t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_load_ss(ptrD+8);
    t10 = _mm_sub_ps(t10,t14);
    t11 = _mm_sub_ps(t11,t16);
    t12 = _mm_sub_ss(t12,t19);
    _mm_storeu_ps(ptrD,t10);
    _mm_storeu_ps(ptrD+4,t11);
    _mm_store_ss(ptrD+8,t12);
}

static gmx_inline void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
    __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
    __m128 t23,t24;

    t13 = _mm_unpackhi_ps(x1,y1);
    x1  = _mm_unpacklo_ps(x1,y1);
    t14 = _mm_unpackhi_ps(z1,x2);
    z1  = _mm_unpacklo_ps(z1,x2);
    t15 = _mm_unpackhi_ps(y2,z2);
    y2  = _mm_unpacklo_ps(y2,z2);
    t16 = _mm_unpackhi_ps(x3,y3);
    x3  = _mm_unpacklo_ps(x3,y3);
    t17 = _mm_unpackhi_ps(z3,x4);
    z3  = _mm_unpacklo_ps(z3,x4);
    t18 = _mm_unpackhi_ps(y4,z4);
    y4  = _mm_unpacklo_ps(y4,z4);
    t19 = _mm_movelh_ps(x1,z1);
    z1  = _mm_movehl_ps(z1,x1);
    t20 = _mm_movelh_ps(t13,t14);
    t14 = _mm_movehl_ps(t14,t13);
    t21 = _mm_movelh_ps(y2,x3);
    x3  = _mm_movehl_ps(x3,y2);
    t22 = _mm_movelh_ps(t15,t16);
    t16 = _mm_movehl_ps(t16,t15);
    t23 = _mm_movelh_ps(z3,y4);
    y4  = _mm_movehl_ps(y4,z3);
    t24 = _mm_movelh_ps(t17,t18);
    t18 = _mm_movehl_ps(t18,t17);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_loadu_ps(ptrA+8);
    t1  = _mm_sub_ps(t1,t19);
    t2  = _mm_sub_ps(t2,t21);
    t3  = _mm_sub_ps(t3,t23);
    _mm_storeu_ps(ptrA,t1);
    _mm_storeu_ps(ptrA+4,t2);
    _mm_storeu_ps(ptrA+8,t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_loadu_ps(ptrB+8);
    t4  = _mm_sub_ps(t4,z1);
    t5  = _mm_sub_ps(t5,x3);
    t6  = _mm_sub_ps(t6,y4);
    _mm_storeu_ps(ptrB,t4);
    _mm_storeu_ps(ptrB+4,t5);
    _mm_storeu_ps(ptrB+8,t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_loadu_ps(ptrC+8);
    t7  = _mm_sub_ps(t7,t20);
    t8  = _mm_sub_ps(t8,t22);
    t9  = _mm_sub_ps(t9,t24);
    _mm_storeu_ps(ptrC,t7);
    _mm_storeu_ps(ptrC+4,t8);
    _mm_storeu_ps(ptrC+8,t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_loadu_ps(ptrD+8);
    t10 = _mm_sub_ps(t10,t14);
    t11 = _mm_sub_ps(t11,t16);
    t12 = _mm_sub_ps(t12,t18);
    _mm_storeu_ps(ptrD,t10);
    _mm_storeu_ps(ptrD+4,t11);
    _mm_storeu_ps(ptrD+8,t12);
}

static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t2,t3;

    fix1 = _mm_hadd_ps(fix1,fix1);
    fiy1 = _mm_hadd_ps(fiy1,fiz1);

    fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2,fix1);
    t3 = _mm_add_ps(t3,fix1);

    _mm_store_ss(fptr,t2);
    _mm_storeh_pi((__m64 *)(fptr+1),t2);
    _mm_store_ss(fshiftptr,t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}

static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1,t2,t3,t4;

    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fiz3);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiz3); /*  -    -    -   fiz3 */

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_permute_ps(t3  ,_MM_SHUFFLE(1,2,0,0));      /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}

static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1,t2,t3,t4,t5;

    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fix4);
    fiy4 = _mm_hadd_ps(fiy4,fiz4);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));

    t1 = _mm_permute_ps(fix1,_MM_SHUFFLE(1,0,2,2));
    t2 = _mm_permute_ps(fiy2,_MM_SHUFFLE(3,2,1,1));
    t3 = _mm_permute_ps(fiz3,_MM_SHUFFLE(2,1,0,0));
    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
    t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3);
    t5 = _mm_add_ps(t5,t1);

    _mm_store_ss(fshiftptr+2,t5);
    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
}

static gmx_inline void
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot1 = _mm_hadd_ps(pot1,pot1);
    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
}
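
/* End-of-kernel sketch (hypothetical helper and names): merge four unrolled
 * potential accumulators with gmx_mm_sum4_ps, then reduce the lanes and add
 * the total into the output energy slot pointed to by Vptr.
 */
static gmx_inline void
gmx_mm_example_accumulate_pot_ps(__m128 v0, __m128 v1, __m128 v2, __m128 v3,
                                 float * gmx_restrict Vptr)
{
    gmx_mm_update_1pot_ps(gmx_mm_sum4_ps(v0,v1,v2,v3),Vptr);
}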

static gmx_inline void
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_ps(pot1,pot2);
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot2 = _mm_permute_ps(pot1,_MM_SHUFFLE(0,0,0,1));
    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
}

static gmx_inline void
gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB,
                      __m128 pot3, float * gmx_restrict ptrC,
                      __m128 pot4, float * gmx_restrict ptrD)
{
    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
    pot2 = _mm_permute_ps(pot1,_MM_SHUFFLE(1,1,1,1));
    pot3 = _mm_permute_ps(pot1,_MM_SHUFFLE(2,2,2,2));
    pot4 = _mm_permute_ps(pot1,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
    _mm_store_ss(ptrC,_mm_add_ss(pot3,_mm_load_ss(ptrC)));
    _mm_store_ss(ptrD,_mm_add_ss(pot4,_mm_load_ss(ptrD)));
}

#endif /* _kernelutil_x86_avx_128_fma_single_h_ */