/*
 * This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifndef _kernelutil_x86_avx_128_fma_single_h_
#define _kernelutil_x86_avx_128_fma_single_h_

#include "gmx_x86_avx_128_fma.h"

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0,t1,t2,t3)  _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
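/*
 * Illustrative sketch (not part of the original header): the tree-shaped
 * reduction above is typically used to combine four partial accumulators,
 * e.g. hypothetical per-lane force sums fjx0..fjx3:
 *
 *     __m128 fjx = gmx_mm_sum4_ps(fjx0,fjx1,fjx2,fjx3);
 *
 * Two independent adds followed by one final add expose more
 * instruction-level parallelism than a serial chain of three adds.
 */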
static gmx_inline int
gmx_mm_any_lt(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a,b));
}

static gmx_inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_macc_ps(dx,dx,_mm_macc_ps(dy,dy,_mm_mul_ps(dz,dz)));
}
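/*
 * Usage sketch (illustrative only; ix1/jx1 etc. are hypothetical names for
 * broadcast i-coordinates and loaded j-coordinates):
 *
 *     __m128 dx  = _mm_sub_ps(ix1,jx1);
 *     __m128 dy  = _mm_sub_ps(iy1,jy1);
 *     __m128 dz  = _mm_sub_ps(iz1,jz1);
 *     __m128 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
 *
 * The two _mm_macc_ps (FMA4) calls fuse multiply and add, so
 * dx*dx+dy*dy+dz*dz costs one multiply and two fused operations.
 */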
/* Load a single value from 1-4 places, merge into xmm register */
static gmx_inline __m128
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    __m128 t1,t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
    return _mm_unpacklo_ps(t1,t2);
}
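/*
 * Usage sketch (hypothetical names, not part of this header): gather one
 * scalar parameter, e.g. a charge, for four j-atoms with indices jnrA-jnrD:
 *
 *     __m128 jq = gmx_mm_load_4real_swizzle_ps(charge+jnrA,charge+jnrB,
 *                                              charge+jnrC,charge+jnrD);
 *
 * Lane 0 of the result holds *ptrA, lane 1 *ptrB, lane 2 *ptrC, lane 3 *ptrD.
 */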
static gmx_inline void
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 t2,t3,t4;

    t2 = _mm_permute_ps(xmm1,_MM_SHUFFLE(1,1,1,1));
    t3 = _mm_permute_ps(xmm1,_MM_SHUFFLE(2,2,2,2));
    t4 = _mm_permute_ps(xmm1,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(ptrA,xmm1);
    _mm_store_ss(ptrB,t2);
    _mm_store_ss(ptrC,t3);
    _mm_store_ss(ptrD,t4);
}

static gmx_inline void
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 tmp;

    tmp = gmx_mm_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
    tmp = _mm_add_ps(tmp,xmm1);
    gmx_mm_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,tmp);
}

static gmx_inline void
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict c6, __m128 * gmx_restrict c12)
{
    __m128 t1,t2,t3,t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);
    t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);
    t3   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);
    t4   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p4);
    t1   = _mm_unpacklo_ps(t1,t3);
    t2   = _mm_unpacklo_ps(t2,t4);
    *c6  = _mm_unpacklo_ps(t1,t2);
    *c12 = _mm_unpackhi_ps(t1,t2);
}
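/*
 * Usage sketch (illustrative; vdwparam and the vdwjidx offsets are
 * hypothetical names): each pointer addresses a consecutive {c6,c12} pair,
 * and the two outputs receive the transposed columns:
 *
 *     __m128 c6,c12;
 *     gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwjidxA,vdwparam+vdwjidxB,
 *                                  vdwparam+vdwjidxC,vdwparam+vdwjidxD,
 *                                  &c6,&c12);
 */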
static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1,
                                         __m128 * gmx_restrict y1,
                                         __m128 * gmx_restrict z1)
{
    __m128 t1,t2,t3,t4;

    t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
    t3 = _mm_load_ss(xyz_shift+2);
    t4 = _mm_load_ss(xyz+2);
    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ss(t3,t4);

    *x1 = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
    *y1 = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    *z1 = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
}
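/*
 * Call-pattern sketch (names are illustrative): the periodic shift vector
 * is added to the i-atom position once, and each resulting component is
 * broadcast to all four lanes so it can be used against four j-atoms at a
 * time:
 *
 *     __m128 ix1,iy1,iz1;
 *     gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,
 *                                              x+i_coord_offset,
 *                                              &ix1,&iy1,&iz1);
 */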
static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 tA,tB;
    __m128 t1,t2,t3,t4,t5,t6;

    tA = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_load_ss(xyz+8);

    tA = _mm_movelh_ps(tA,tB);
    t4 = _mm_permute_ps(tA,_MM_SHUFFLE(0,2,1,0));
    t5 = _mm_permute_ps(tA,_MM_SHUFFLE(1,0,2,1));
    t6 = _mm_permute_ps(tA,_MM_SHUFFLE(2,1,0,2));

    t1 = _mm_add_ps(t1,t4);
    t2 = _mm_add_ps(t2,t5);
    t3 = _mm_add_ss(t3,t6);

    *x1 = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
    *y1 = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    *z1 = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
    *x2 = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
    *y2 = _mm_permute_ps(t2,_MM_SHUFFLE(0,0,0,0));
    *z2 = _mm_permute_ps(t2,_MM_SHUFFLE(1,1,1,1));
    *x3 = _mm_permute_ps(t2,_MM_SHUFFLE(2,2,2,2));
    *y3 = _mm_permute_ps(t2,_MM_SHUFFLE(3,3,3,3));
    *z3 = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
}

static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 tA,tB;
    __m128 t1,t2,t3,t4,t5,t6;

    tA = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_loadu_ps(xyz+8);

    tA = _mm_movelh_ps(tA,tB);
    t4 = _mm_permute_ps(tA,_MM_SHUFFLE(0,2,1,0));
    t5 = _mm_permute_ps(tA,_MM_SHUFFLE(1,0,2,1));
    t6 = _mm_permute_ps(tA,_MM_SHUFFLE(2,1,0,2));

    t1 = _mm_add_ps(t1,t4);
    t2 = _mm_add_ps(t2,t5);
    t3 = _mm_add_ps(t3,t6);

    *x1 = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
    *y1 = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    *z1 = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
    *x2 = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
    *y2 = _mm_permute_ps(t2,_MM_SHUFFLE(0,0,0,0));
    *z2 = _mm_permute_ps(t2,_MM_SHUFFLE(1,1,1,1));
    *x3 = _mm_permute_ps(t2,_MM_SHUFFLE(2,2,2,2));
    *y3 = _mm_permute_ps(t2,_MM_SHUFFLE(3,3,3,3));
    *z3 = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
    *x4 = _mm_permute_ps(t3,_MM_SHUFFLE(1,1,1,1));
    *y4 = _mm_permute_ps(t3,_MM_SHUFFLE(2,2,2,2));
    *z4 = _mm_permute_ps(t3,_MM_SHUFFLE(3,3,3,3));
}

static gmx_inline void
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1)
{
    __m128  t1,t2,t3,t4;
    __m128i mask = _mm_set_epi32(0,-1,-1,-1);

    t1 = gmx_mm_maskload_ps(ptrA,mask);
    t2 = gmx_mm_maskload_ps(ptrB,mask);
    t3 = gmx_mm_maskload_ps(ptrC,mask);
    t4 = gmx_mm_maskload_ps(ptrD,mask);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
}
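/*
 * Note (explanatory, not original documentation): the {-1,-1,-1,0} mask
 * makes gmx_mm_maskload_ps read only three floats per pointer, so the load
 * cannot fault on the word just past z1. After the 4x4 transpose, t1-t3
 * hold the x, y and z components of all four atoms, one atom per lane.
 */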
static gmx_inline void
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 t1,t2,t3,t4;

    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrB);
    t3  = _mm_loadu_ps(ptrC);
    t4  = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1  = _mm_loadu_ps(ptrA+4);
    t2  = _mm_loadu_ps(ptrB+4);
    t3  = _mm_loadu_ps(ptrC+4);
    t4  = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1  = _mm_load_ss(ptrA+8);
    t2  = _mm_load_ss(ptrB+8);
    t3  = _mm_load_ss(ptrC+8);
    t4  = _mm_load_ss(ptrD+8);
    t1  = _mm_unpacklo_ps(t1,t3);
    t3  = _mm_unpacklo_ps(t2,t4);
    *z3 = _mm_unpacklo_ps(t1,t3);
}

static gmx_inline void
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 t1,t2,t3,t4;

    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrB);
    t3  = _mm_loadu_ps(ptrC);
    t4  = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1  = _mm_loadu_ps(ptrA+4);
    t2  = _mm_loadu_ps(ptrB+4);
    t3  = _mm_loadu_ps(ptrC+4);
    t4  = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1  = _mm_loadu_ps(ptrA+8);
    t2  = _mm_loadu_ps(ptrB+8);
    t3  = _mm_loadu_ps(ptrC+8);
    t4  = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *z3 = t1;
    *x4 = t2;
    *y4 = t3;
    *z4 = t4;
}

static gmx_inline void
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;

    t5  = _mm_unpacklo_ps(y1,z1);
    t6  = _mm_unpackhi_ps(y1,z1);
    t7  = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
    t8  = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
    t9  = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
    t10 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
    t1  = _mm_load_ss(ptrA);
    t1  = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
    t1  = _mm_sub_ps(t1,t7);
    _mm_store_ss(ptrA,t1);
    _mm_storeh_pi((__m64 *)(ptrA+1),t1);
    t2  = _mm_load_ss(ptrB);
    t2  = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
    t2  = _mm_sub_ps(t2,t8);
    _mm_store_ss(ptrB,t2);
    _mm_storeh_pi((__m64 *)(ptrB+1),t2);
    t3  = _mm_load_ss(ptrC);
    t3  = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
    t3  = _mm_sub_ps(t3,t9);
    _mm_store_ss(ptrC,t3);
    _mm_storeh_pi((__m64 *)(ptrC+1),t3);
    t4  = _mm_load_ss(ptrD);
    t4  = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
    t4  = _mm_sub_ps(t4,t10);
    _mm_store_ss(ptrD,t4);
    _mm_storeh_pi((__m64 *)(ptrD+1),t4);
}
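/*
 * Usage sketch (illustrative): subtract the accumulated j-forces for four
 * atoms in one call, where f is a force array with three floats per atom
 * and jnrA-jnrD are hypothetical j-atom indices:
 *
 *     gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+3*jnrA,f+3*jnrB,
 *                                            f+3*jnrC,f+3*jnrD,
 *                                            fjx1,fjy1,fjz1);
 *
 * The load_ss/loadh_pi and store_ss/storeh_pi pairs touch exactly three
 * floats per pointer, so no memory past z is read or written.
 */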
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
                                               _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3) \
{\
    __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10;\
    __m128 _t11,_t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19;\
    __m128 _t20,_t21,_t22,_t23,_t24,_t25;\
    _t13 = _mm_unpackhi_ps(_x1,_y1);\
    _x1  = _mm_unpacklo_ps(_x1,_y1);\
    _t14 = _mm_unpackhi_ps(_z1,_x2);\
    _z1  = _mm_unpacklo_ps(_z1,_x2);\
    _t15 = _mm_unpackhi_ps(_y2,_z2);\
    _y2  = _mm_unpacklo_ps(_y2,_z2);\
    _t16 = _mm_unpackhi_ps(_x3,_y3);\
    _x3  = _mm_unpacklo_ps(_x3,_y3);\
    _t17 = _mm_permute_ps(_z3,_MM_SHUFFLE(0,0,0,1));\
    _t18 = _mm_movehl_ps(_z3,_z3);\
    _t19 = _mm_permute_ps(_t18,_MM_SHUFFLE(0,0,0,1));\
    _t20 = _mm_movelh_ps(_x1,_z1);\
    _t21 = _mm_movehl_ps(_z1,_x1);\
    _t22 = _mm_movelh_ps(_t13,_t14);\
    _t14 = _mm_movehl_ps(_t14,_t13);\
    _t23 = _mm_movelh_ps(_y2,_x3);\
    _t24 = _mm_movehl_ps(_x3,_y2);\
    _t25 = _mm_movelh_ps(_t15,_t16);\
    _t16 = _mm_movehl_ps(_t16,_t15);\
    _t1  = _mm_loadu_ps(ptrA);\
    _t2  = _mm_loadu_ps(ptrA+4);\
    _t3  = _mm_load_ss(ptrA+8);\
    _t1  = _mm_sub_ps(_t1,_t20);\
    _t2  = _mm_sub_ps(_t2,_t23);\
    _t3  = _mm_sub_ss(_t3,_z3);\
    _mm_storeu_ps(ptrA,_t1);\
    _mm_storeu_ps(ptrA+4,_t2);\
    _mm_store_ss(ptrA+8,_t3);\
    _t4  = _mm_loadu_ps(ptrB);\
    _t5  = _mm_loadu_ps(ptrB+4);\
    _t6  = _mm_load_ss(ptrB+8);\
    _t4  = _mm_sub_ps(_t4,_t21);\
    _t5  = _mm_sub_ps(_t5,_t24);\
    _t6  = _mm_sub_ss(_t6,_t17);\
    _mm_storeu_ps(ptrB,_t4);\
    _mm_storeu_ps(ptrB+4,_t5);\
    _mm_store_ss(ptrB+8,_t6);\
    _t7  = _mm_loadu_ps(ptrC);\
    _t8  = _mm_loadu_ps(ptrC+4);\
    _t9  = _mm_load_ss(ptrC+8);\
    _t7  = _mm_sub_ps(_t7,_t22);\
    _t8  = _mm_sub_ps(_t8,_t25);\
    _t9  = _mm_sub_ss(_t9,_t18);\
    _mm_storeu_ps(ptrC,_t7);\
    _mm_storeu_ps(ptrC+4,_t8);\
    _mm_store_ss(ptrC+8,_t9);\
    _t10 = _mm_loadu_ps(ptrD);\
    _t11 = _mm_loadu_ps(ptrD+4);\
    _t12 = _mm_load_ss(ptrD+8);\
    _t10 = _mm_sub_ps(_t10,_t14);\
    _t11 = _mm_sub_ps(_t11,_t16);\
    _t12 = _mm_sub_ss(_t12,_t19);\
    _mm_storeu_ps(ptrD,_t10);\
    _mm_storeu_ps(ptrD+4,_t11);\
    _mm_store_ss(ptrD+8,_t12);\
}
#else

/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
    __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
    __m128 t20,t21,t22,t23,t24,t25;

    t13 = _mm_unpackhi_ps(x1,y1);
    x1  = _mm_unpacklo_ps(x1,y1);
    t14 = _mm_unpackhi_ps(z1,x2);
    z1  = _mm_unpacklo_ps(z1,x2);
    t15 = _mm_unpackhi_ps(y2,z2);
    y2  = _mm_unpacklo_ps(y2,z2);
    t16 = _mm_unpackhi_ps(x3,y3);
    x3  = _mm_unpacklo_ps(x3,y3);
    t17 = _mm_permute_ps(z3,_MM_SHUFFLE(0,0,0,1));
    t18 = _mm_movehl_ps(z3,z3);
    t19 = _mm_permute_ps(t18,_MM_SHUFFLE(0,0,0,1));
    t20 = _mm_movelh_ps(x1,z1);
    t21 = _mm_movehl_ps(z1,x1);
    t22 = _mm_movelh_ps(t13,t14);
    t14 = _mm_movehl_ps(t14,t13);
    t23 = _mm_movelh_ps(y2,x3);
    t24 = _mm_movehl_ps(x3,y2);
    t25 = _mm_movelh_ps(t15,t16);
    t16 = _mm_movehl_ps(t16,t15);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_load_ss(ptrA+8);
    t1  = _mm_sub_ps(t1,t20);
    t2  = _mm_sub_ps(t2,t23);
    t3  = _mm_sub_ss(t3,z3);
    _mm_storeu_ps(ptrA,t1);
    _mm_storeu_ps(ptrA+4,t2);
    _mm_store_ss(ptrA+8,t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_load_ss(ptrB+8);
    t4  = _mm_sub_ps(t4,t21);
    t5  = _mm_sub_ps(t5,t24);
    t6  = _mm_sub_ss(t6,t17);
    _mm_storeu_ps(ptrB,t4);
    _mm_storeu_ps(ptrB+4,t5);
    _mm_store_ss(ptrB+8,t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_load_ss(ptrC+8);
    t7  = _mm_sub_ps(t7,t22);
    t8  = _mm_sub_ps(t8,t25);
    t9  = _mm_sub_ss(t9,t18);
    _mm_storeu_ps(ptrC,t7);
    _mm_storeu_ps(ptrC+4,t8);
    _mm_store_ss(ptrC+8,t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_load_ss(ptrD+8);
    t10 = _mm_sub_ps(t10,t14);
    t11 = _mm_sub_ps(t11,t16);
    t12 = _mm_sub_ss(t12,t19);
    _mm_storeu_ps(ptrD,t10);
    _mm_storeu_ps(ptrD+4,t11);
    _mm_store_ss(ptrD+8,t12);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA,ptrB,ptrC,ptrD, \
                                               _x1,_y1,_z1,_x2,_y2,_z2,_x3,_y3,_z3,_x4,_y4,_z4) \
{\
    __m128 _t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11;\
    __m128 _t12,_t13,_t14,_t15,_t16,_t17,_t18,_t19,_t20,_t21,_t22;\
    __m128 _t23,_t24;\
\
    _t13 = _mm_unpackhi_ps(_x1,_y1);\
    _x1  = _mm_unpacklo_ps(_x1,_y1);\
    _t14 = _mm_unpackhi_ps(_z1,_x2);\
    _z1  = _mm_unpacklo_ps(_z1,_x2);\
    _t15 = _mm_unpackhi_ps(_y2,_z2);\
    _y2  = _mm_unpacklo_ps(_y2,_z2);\
    _t16 = _mm_unpackhi_ps(_x3,_y3);\
    _x3  = _mm_unpacklo_ps(_x3,_y3);\
    _t17 = _mm_unpackhi_ps(_z3,_x4);\
    _z3  = _mm_unpacklo_ps(_z3,_x4);\
    _t18 = _mm_unpackhi_ps(_y4,_z4);\
    _y4  = _mm_unpacklo_ps(_y4,_z4);\
    _t19 = _mm_movelh_ps(_x1,_z1);\
    _z1  = _mm_movehl_ps(_z1,_x1);\
    _t20 = _mm_movelh_ps(_t13,_t14);\
    _t14 = _mm_movehl_ps(_t14,_t13);\
    _t21 = _mm_movelh_ps(_y2,_x3);\
    _x3  = _mm_movehl_ps(_x3,_y2);\
    _t22 = _mm_movelh_ps(_t15,_t16);\
    _t16 = _mm_movehl_ps(_t16,_t15);\
    _t23 = _mm_movelh_ps(_z3,_y4);\
    _y4  = _mm_movehl_ps(_y4,_z3);\
    _t24 = _mm_movelh_ps(_t17,_t18);\
    _t18 = _mm_movehl_ps(_t18,_t17);\
    _t1  = _mm_loadu_ps(ptrA);\
    _t2  = _mm_loadu_ps(ptrA+4);\
    _t3  = _mm_loadu_ps(ptrA+8);\
    _t1  = _mm_sub_ps(_t1,_t19);\
    _t2  = _mm_sub_ps(_t2,_t21);\
    _t3  = _mm_sub_ps(_t3,_t23);\
    _mm_storeu_ps(ptrA,_t1);\
    _mm_storeu_ps(ptrA+4,_t2);\
    _mm_storeu_ps(ptrA+8,_t3);\
    _t4  = _mm_loadu_ps(ptrB);\
    _t5  = _mm_loadu_ps(ptrB+4);\
    _t6  = _mm_loadu_ps(ptrB+8);\
    _t4  = _mm_sub_ps(_t4,_z1);\
    _t5  = _mm_sub_ps(_t5,_x3);\
    _t6  = _mm_sub_ps(_t6,_y4);\
    _mm_storeu_ps(ptrB,_t4);\
    _mm_storeu_ps(ptrB+4,_t5);\
    _mm_storeu_ps(ptrB+8,_t6);\
    _t7  = _mm_loadu_ps(ptrC);\
    _t8  = _mm_loadu_ps(ptrC+4);\
    _t9  = _mm_loadu_ps(ptrC+8);\
    _t7  = _mm_sub_ps(_t7,_t20);\
    _t8  = _mm_sub_ps(_t8,_t22);\
    _t9  = _mm_sub_ps(_t9,_t24);\
    _mm_storeu_ps(ptrC,_t7);\
    _mm_storeu_ps(ptrC+4,_t8);\
    _mm_storeu_ps(ptrC+8,_t9);\
    _t10 = _mm_loadu_ps(ptrD);\
    _t11 = _mm_loadu_ps(ptrD+4);\
    _t12 = _mm_loadu_ps(ptrD+8);\
    _t10 = _mm_sub_ps(_t10,_t14);\
    _t11 = _mm_sub_ps(_t11,_t16);\
    _t12 = _mm_sub_ps(_t12,_t18);\
    _mm_storeu_ps(ptrD,_t10);\
    _mm_storeu_ps(ptrD+4,_t11);\
    _mm_storeu_ps(ptrD+8,_t12);\
}
#else

/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
    __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
    __m128 t23,t24;

    t13 = _mm_unpackhi_ps(x1,y1);
    x1  = _mm_unpacklo_ps(x1,y1);
    t14 = _mm_unpackhi_ps(z1,x2);
    z1  = _mm_unpacklo_ps(z1,x2);
    t15 = _mm_unpackhi_ps(y2,z2);
    y2  = _mm_unpacklo_ps(y2,z2);
    t16 = _mm_unpackhi_ps(x3,y3);
    x3  = _mm_unpacklo_ps(x3,y3);
    t17 = _mm_unpackhi_ps(z3,x4);
    z3  = _mm_unpacklo_ps(z3,x4);
    t18 = _mm_unpackhi_ps(y4,z4);
    y4  = _mm_unpacklo_ps(y4,z4);
    t19 = _mm_movelh_ps(x1,z1);
    z1  = _mm_movehl_ps(z1,x1);
    t20 = _mm_movelh_ps(t13,t14);
    t14 = _mm_movehl_ps(t14,t13);
    t21 = _mm_movelh_ps(y2,x3);
    x3  = _mm_movehl_ps(x3,y2);
    t22 = _mm_movelh_ps(t15,t16);
    t16 = _mm_movehl_ps(t16,t15);
    t23 = _mm_movelh_ps(z3,y4);
    y4  = _mm_movehl_ps(y4,z3);
    t24 = _mm_movelh_ps(t17,t18);
    t18 = _mm_movehl_ps(t18,t17);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_loadu_ps(ptrA+8);
    t1  = _mm_sub_ps(t1,t19);
    t2  = _mm_sub_ps(t2,t21);
    t3  = _mm_sub_ps(t3,t23);
    _mm_storeu_ps(ptrA,t1);
    _mm_storeu_ps(ptrA+4,t2);
    _mm_storeu_ps(ptrA+8,t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_loadu_ps(ptrB+8);
    t4  = _mm_sub_ps(t4,z1);
    t5  = _mm_sub_ps(t5,x3);
    t6  = _mm_sub_ps(t6,y4);
    _mm_storeu_ps(ptrB,t4);
    _mm_storeu_ps(ptrB+4,t5);
    _mm_storeu_ps(ptrB+8,t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_loadu_ps(ptrC+8);
    t7  = _mm_sub_ps(t7,t20);
    t8  = _mm_sub_ps(t8,t22);
    t9  = _mm_sub_ps(t9,t24);
    _mm_storeu_ps(ptrC,t7);
    _mm_storeu_ps(ptrC+4,t8);
    _mm_storeu_ps(ptrC+8,t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_loadu_ps(ptrD+8);
    t10 = _mm_sub_ps(t10,t14);
    t11 = _mm_sub_ps(t11,t16);
    t12 = _mm_sub_ps(t12,t18);
    _mm_storeu_ps(ptrD,t10);
    _mm_storeu_ps(ptrD+4,t11);
    _mm_storeu_ps(ptrD+8,t12);
}
#endif

static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t2,t3;

    fix1 = _mm_hadd_ps(fix1,fix1);
    fiy1 = _mm_hadd_ps(fiy1,fiz1);

    fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2,fix1);
    t3 = _mm_add_ps(t3,fix1);

    _mm_store_ss(fptr,t2);
    _mm_storeh_pi((__m64 *)(fptr+1),t2);
    _mm_store_ss(fshiftptr,t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
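/*
 * Reduction note (explanatory, not original documentation): the two rounds
 * of _mm_hadd_ps fold each accumulator across its lanes so that fix1 ends
 * up holding {sum(fix1), sum(fix1), sum(fiy1), sum(fiz1)}; the
 * load_ss/loadh_pi pairs place the three destination floats in the matching
 * lanes, so a single _mm_add_ps updates both the force and shift-force
 * outputs while touching only three floats at each pointer.
 */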
#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3, \
                                              fptr,fshiftptr) \
{\
    __m128 _t1,_t2,_t3,_t4;\
\
    fix1 = _mm_hadd_ps(fix1,fiy1);\
    fiz1 = _mm_hadd_ps(fiz1,fix2);\
    fiy2 = _mm_hadd_ps(fiy2,fiz2);\
    fix3 = _mm_hadd_ps(fix3,fiy3);\
    fiz3 = _mm_hadd_ps(fiz3,fiz3);\
    fix1 = _mm_hadd_ps(fix1,fiz1);\
    fiy2 = _mm_hadd_ps(fiy2,fix3);\
    fiz3 = _mm_hadd_ps(fiz3,fiz3);\
    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));\
    _t4 = _mm_load_ss(fshiftptr+2);\
    _t4 = _mm_loadh_pi(_t4,(__m64 *)(fshiftptr));\
    _t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));\
    _t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));\
    _t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));\
    _t3 = _mm_permute_ps(_t3 ,_MM_SHUFFLE(1,2,0,0));\
    _t1 = _mm_add_ps(_t1,_t2);\
    _t3 = _mm_add_ps(_t3,_t4);\
    _t1 = _mm_add_ps(_t1,_t3);\
    _mm_store_ss(fshiftptr+2,_t1);\
    _mm_storeh_pi((__m64 *)(fshiftptr),_t1);\
}
#else

/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1,t2,t3,t4;

    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fiz3);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiz3); /* -    -    -    fiz3 */

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_permute_ps(t3 ,_MM_SHUFFLE(1,2,0,0));       /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,fix4,fiy4,fiz4, \
                                              fptr,fshiftptr) \
{\
    __m128 _t1,_t2,_t3,_t4,_t5;\
\
    fix1 = _mm_hadd_ps(fix1,fiy1);\
    fiz1 = _mm_hadd_ps(fiz1,fix2);\
    fiy2 = _mm_hadd_ps(fiy2,fiz2);\
    fix3 = _mm_hadd_ps(fix3,fiy3);\
    fiz3 = _mm_hadd_ps(fiz3,fix4);\
    fiy4 = _mm_hadd_ps(fiy4,fiz4);\
    fix1 = _mm_hadd_ps(fix1,fiz1);\
    fiy2 = _mm_hadd_ps(fiy2,fix3);\
    fiz3 = _mm_hadd_ps(fiz3,fiy4);\
    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));\
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));\
    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));\
    _t5 = _mm_load_ss(fshiftptr+2);\
    _t5 = _mm_loadh_pi(_t5,(__m64 *)(fshiftptr));\
    _t1 = _mm_permute_ps(fix1,_MM_SHUFFLE(1,0,2,2));\
    _t2 = _mm_permute_ps(fiy2,_MM_SHUFFLE(3,2,1,1));\
    _t3 = _mm_permute_ps(fiz3,_MM_SHUFFLE(2,1,0,0));\
    _t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));\
    _t4 = _mm_shuffle_ps(fiz3,_t4 ,_MM_SHUFFLE(2,0,3,3));\
    _t1 = _mm_add_ps(_t1,_t2);\
    _t3 = _mm_add_ps(_t3,_t4);\
    _t1 = _mm_add_ps(_t1,_t3);\
    _t5 = _mm_add_ps(_t5,_t1);\
    _mm_store_ss(fshiftptr+2,_t5);\
    _mm_storeh_pi((__m64 *)(fshiftptr),_t5);\
}
#else

/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1,t2,t3,t4,t5;

    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fix4);
    fiy4 = _mm_hadd_ps(fiy4,fiz4);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));

    t1 = _mm_permute_ps(fix1,_MM_SHUFFLE(1,0,2,2));
    t2 = _mm_permute_ps(fiy2,_MM_SHUFFLE(3,2,1,1));
    t3 = _mm_permute_ps(fiz3,_MM_SHUFFLE(2,1,0,0));
    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));
    t4 = _mm_shuffle_ps(fiz3,t4 ,_MM_SHUFFLE(2,0,3,3));

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3);
    t5 = _mm_add_ps(t5,t1);

    _mm_store_ss(fshiftptr+2,t5);
    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
#endif

static gmx_inline void
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot1 = _mm_hadd_ps(pot1,pot1);
    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
}
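/*
 * Usage sketch (illustrative; velecsum and the destination expression are
 * hypothetical names): at the end of an inner loop, fold a four-lane
 * energy accumulator into its scalar energy-group entry:
 *
 *     gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 */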
static gmx_inline void
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_ps(pot1,pot2);
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot2 = _mm_permute_ps(pot1,_MM_SHUFFLE(0,0,0,1));
    _mm_store_ss(ptrA,_mm_add_ss(pot1,_mm_load_ss(ptrA)));
    _mm_store_ss(ptrB,_mm_add_ss(pot2,_mm_load_ss(ptrB)));
}
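/*
 * Usage sketch (illustrative, hypothetical names): the two-accumulator
 * variant reduces the Coulomb and van der Waals energies in one call;
 * after the first hadd the two sums share one register, so this is
 * cheaper than two separate gmx_mm_update_1pot_ps calls:
 *
 *     gmx_mm_update_2pot_ps(velecsum,kernel_data->energygrp_elec+ggid,
 *                           vvdwsum, kernel_data->energygrp_vdw+ggid);
 */
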
#endif /* _kernelutil_x86_avx_128_fma_single_h_ */