/*
 * This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifndef _kernelutil_x86_avx_128_fma_single_h_
#define _kernelutil_x86_avx_128_fma_single_h_

#include "gmx_x86_avx_128_fma.h"

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3)  _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))

static gmx_inline int
gmx_mm_any_lt(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a, b));
}

static gmx_inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_macc_ps(dx, dx, _mm_macc_ps(dy, dy, _mm_mul_ps(dz, dz)));
}
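
/* Illustrative usage sketch (not part of the library API): with dx/dy/dz holding
 * the coordinate differences between an i-particle and four j-particles, the
 * squared distances for all four pairs are obtained in one call, using FMA
 * (_mm_macc_ps computes a*b+c):
 *
 *     __m128 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
 *
 * The names dx, dy, dz and rsq above are hypothetical.
 */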

/* Load a single value from 1-4 places, merge into xmm register */

static gmx_inline __m128
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    __m128 t1, t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA), _mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB), _mm_load_ss(ptrD));
    return _mm_unpacklo_ps(t1, t2);
}
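
/* Illustrative usage sketch (hypothetical names): gather the charges of four
 * j-particles into one register, e.g.
 *
 *     __m128 jq = gmx_mm_load_4real_swizzle_ps(charge+jnrA, charge+jnrB,
 *                                              charge+jnrC, charge+jnrD);
 *
 * where charge[] and the indices jnrA..jnrD come from the calling kernel.
 */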

static gmx_inline void
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 t2, t3, t4;

    t2 = _mm_permute_ps(xmm1, _MM_SHUFFLE(1, 1, 1, 1));
    t3 = _mm_permute_ps(xmm1, _MM_SHUFFLE(2, 2, 2, 2));
    t4 = _mm_permute_ps(xmm1, _MM_SHUFFLE(3, 3, 3, 3));
    _mm_store_ss(ptrA, xmm1);
    _mm_store_ss(ptrB, t2);
    _mm_store_ss(ptrC, t3);
    _mm_store_ss(ptrD, t4);
}

static gmx_inline void
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 tmp;

    tmp = gmx_mm_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
    tmp = _mm_add_ps(tmp, xmm1);
    gmx_mm_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, tmp);
}

static gmx_inline void
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict c6, __m128 * gmx_restrict c12)
{
    __m128 t1, t2, t3, t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p1);
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p2);
    t3   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p3);
    t4   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p4);
    t1   = _mm_unpacklo_ps(t1, t3);
    t2   = _mm_unpacklo_ps(t2, t4);
    *c6  = _mm_unpacklo_ps(t1, t2);
    *c12 = _mm_unpackhi_ps(t1, t2);
}
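
/* Illustrative usage sketch (hypothetical names): each pN points at a {c6, c12}
 * Lennard-Jones parameter pair; after the call *c6 holds the four c6 values and
 * *c12 the four c12 values, one lane per j-particle, e.g.
 *
 *     gmx_mm_load_4pair_swizzle_ps(vdwparam+2*tjA, vdwparam+2*tjB,
 *                                  vdwparam+2*tjC, vdwparam+2*tjD, &c6, &c12);
 */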

static gmx_inline void
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1,
                                         __m128 * gmx_restrict y1,
                                         __m128 * gmx_restrict z1)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    t2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz);
    t3 = _mm_load_ss(xyz_shift+2);
    t4 = _mm_load_ss(xyz+2);
    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ss(t3, t4);

    *x1 = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
}
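
/* Illustrative usage sketch (hypothetical names): broadcast the periodic-shifted
 * position of a single i-particle to all four lanes before the j-loop, e.g.
 *
 *     gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,
 *                                              x+i_coord_offset, &ix1, &iy1, &iz1);
 *
 * The same x/y/z value then sits in every lane of ix1/iy1/iz1.
 */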

static gmx_inline void
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_load_ss(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ss(t3, t6);

    *x1 = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2 = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2 = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2 = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3 = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3 = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3 = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
}

static gmx_inline void
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_loadu_ps(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ps(t3, t6);

    *x1 = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2 = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2 = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2 = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3 = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3 = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3 = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
    *x4 = _mm_permute_ps(t3, _MM_SHUFFLE(1, 1, 1, 1));
    *y4 = _mm_permute_ps(t3, _MM_SHUFFLE(2, 2, 2, 2));
    *z4 = _mm_permute_ps(t3, _MM_SHUFFLE(3, 3, 3, 3));
}

static gmx_inline void
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1)
{
    __m128  t1, t2, t3, t4;
    __m128i mask = _mm_set_epi32(0, -1, -1, -1);

    t1 = gmx_mm_maskload_ps(ptrA, mask);
    t2 = gmx_mm_maskload_ps(ptrB, mask);
    t3 = gmx_mm_maskload_ps(ptrC, mask);
    t4 = gmx_mm_maskload_ps(ptrD, mask);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
}
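
/* Illustrative usage sketch (hypothetical names): load the coordinates of four
 * j-particles (three floats each) and transpose them so that jx1/jy1/jz1 each
 * hold one component for all four particles, e.g.
 *
 *     gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA, x+j_coord_offsetB,
 *                                       x+j_coord_offsetC, x+j_coord_offsetD,
 *                                       &jx1, &jy1, &jz1);
 */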

static gmx_inline void
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrB);
    t3 = _mm_loadu_ps(ptrC);
    t4 = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1 = _mm_loadu_ps(ptrA+4);
    t2 = _mm_loadu_ps(ptrB+4);
    t3 = _mm_loadu_ps(ptrC+4);
    t4 = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1 = _mm_load_ss(ptrA+8);
    t2 = _mm_load_ss(ptrB+8);
    t3 = _mm_load_ss(ptrC+8);
    t4 = _mm_load_ss(ptrD+8);
    t1 = _mm_unpacklo_ps(t1, t3);
    t3 = _mm_unpacklo_ps(t2, t4);
    *z3 = _mm_unpacklo_ps(t1, t3);
}

static gmx_inline void
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadu_ps(ptrA);
    t2 = _mm_loadu_ps(ptrB);
    t3 = _mm_loadu_ps(ptrC);
    t4 = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1 = _mm_loadu_ps(ptrA+4);
    t2 = _mm_loadu_ps(ptrB+4);
    t3 = _mm_loadu_ps(ptrC+4);
    t4 = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1 = _mm_loadu_ps(ptrA+8);
    t2 = _mm_loadu_ps(ptrB+8);
    t3 = _mm_loadu_ps(ptrC+8);
    t4 = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *z3 = t1;
    *x4 = t2;
    *y4 = t3;
    *z4 = t4;
}

static gmx_inline void
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;

    t5  = _mm_unpacklo_ps(y1, z1);
    t6  = _mm_unpackhi_ps(y1, z1);
    t7  = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(1, 0, 0, 0));
    t8  = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(3, 2, 0, 1));
    t9  = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(1, 0, 0, 2));
    t10 = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(3, 2, 0, 3));
    t1  = _mm_load_ss(ptrA);
    t1  = _mm_loadh_pi(t1, (__m64 *)(ptrA+1));
    t1  = _mm_sub_ps(t1, t7);
    _mm_store_ss(ptrA, t1);
    _mm_storeh_pi((__m64 *)(ptrA+1), t1);
    t2  = _mm_load_ss(ptrB);
    t2  = _mm_loadh_pi(t2, (__m64 *)(ptrB+1));
    t2  = _mm_sub_ps(t2, t8);
    _mm_store_ss(ptrB, t2);
    _mm_storeh_pi((__m64 *)(ptrB+1), t2);
    t3  = _mm_load_ss(ptrC);
    t3  = _mm_loadh_pi(t3, (__m64 *)(ptrC+1));
    t3  = _mm_sub_ps(t3, t9);
    _mm_store_ss(ptrC, t3);
    _mm_storeh_pi((__m64 *)(ptrC+1), t3);
    t4  = _mm_load_ss(ptrD);
    t4  = _mm_loadh_pi(t4, (__m64 *)(ptrD+1));
    t4  = _mm_sub_ps(t4, t10);
    _mm_store_ss(ptrD, t4);
    _mm_storeh_pi((__m64 *)(ptrD+1), t4);
}
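
/* Illustrative usage sketch (hypothetical names): subtract the j-forces computed
 * for this set of four interactions from the force array, e.g.
 *
 *     gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA, f+j_coord_offsetB,
 *                                            f+j_coord_offsetC, f+j_coord_offsetD,
 *                                            fjx1, fjy1, fjz1);
 *
 * where fjx1/fjy1/fjz1 hold one force component per lane.
 */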

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
        __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
        __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
        _t13 = _mm_unpackhi_ps(_x1, _y1); \
        _x1  = _mm_unpacklo_ps(_x1, _y1); \
        _t14 = _mm_unpackhi_ps(_z1, _x2); \
        _z1  = _mm_unpacklo_ps(_z1, _x2); \
        _t15 = _mm_unpackhi_ps(_y2, _z2); \
        _y2  = _mm_unpacklo_ps(_y2, _z2); \
        _t16 = _mm_unpackhi_ps(_x3, _y3); \
        _x3  = _mm_unpacklo_ps(_x3, _y3); \
        _t17 = _mm_permute_ps(_z3, _MM_SHUFFLE(0, 0, 0, 1)); \
        _t18 = _mm_movehl_ps(_z3, _z3); \
        _t19 = _mm_permute_ps(_t18, _MM_SHUFFLE(0, 0, 0, 1)); \
        _t20 = _mm_movelh_ps(_x1, _z1); \
        _t21 = _mm_movehl_ps(_z1, _x1); \
        _t22 = _mm_movelh_ps(_t13, _t14); \
        _t14 = _mm_movehl_ps(_t14, _t13); \
        _t23 = _mm_movelh_ps(_y2, _x3); \
        _t24 = _mm_movehl_ps(_x3, _y2); \
        _t25 = _mm_movelh_ps(_t15, _t16); \
        _t16 = _mm_movehl_ps(_t16, _t15); \
        _t1  = _mm_loadu_ps(ptrA); \
        _t2  = _mm_loadu_ps(ptrA+4); \
        _t3  = _mm_load_ss(ptrA+8); \
        _t1  = _mm_sub_ps(_t1, _t20); \
        _t2  = _mm_sub_ps(_t2, _t23); \
        _t3  = _mm_sub_ss(_t3, _z3); \
        _mm_storeu_ps(ptrA, _t1); \
        _mm_storeu_ps(ptrA+4, _t2); \
        _mm_store_ss(ptrA+8, _t3); \
        _t4  = _mm_loadu_ps(ptrB); \
        _t5  = _mm_loadu_ps(ptrB+4); \
        _t6  = _mm_load_ss(ptrB+8); \
        _t4  = _mm_sub_ps(_t4, _t21); \
        _t5  = _mm_sub_ps(_t5, _t24); \
        _t6  = _mm_sub_ss(_t6, _t17); \
        _mm_storeu_ps(ptrB, _t4); \
        _mm_storeu_ps(ptrB+4, _t5); \
        _mm_store_ss(ptrB+8, _t6); \
        _t7  = _mm_loadu_ps(ptrC); \
        _t8  = _mm_loadu_ps(ptrC+4); \
        _t9  = _mm_load_ss(ptrC+8); \
        _t7  = _mm_sub_ps(_t7, _t22); \
        _t8  = _mm_sub_ps(_t8, _t25); \
        _t9  = _mm_sub_ss(_t9, _t18); \
        _mm_storeu_ps(ptrC, _t7); \
        _mm_storeu_ps(ptrC+4, _t8); \
        _mm_store_ss(ptrC+8, _t9); \
        _t10 = _mm_loadu_ps(ptrD); \
        _t11 = _mm_loadu_ps(ptrD+4); \
        _t12 = _mm_load_ss(ptrD+8); \
        _t10 = _mm_sub_ps(_t10, _t14); \
        _t11 = _mm_sub_ps(_t11, _t16); \
        _t12 = _mm_sub_ss(_t12, _t19); \
        _mm_storeu_ps(ptrD, _t10); \
        _mm_storeu_ps(ptrD+4, _t11); \
        _mm_store_ss(ptrD+8, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128 t11, t12, t13, t14, t15, t16, t17, t18, t19;
    __m128 t20, t21, t22, t23, t24, t25;

    t13 = _mm_unpackhi_ps(x1, y1);
    x1  = _mm_unpacklo_ps(x1, y1);
    t14 = _mm_unpackhi_ps(z1, x2);
    z1  = _mm_unpacklo_ps(z1, x2);
    t15 = _mm_unpackhi_ps(y2, z2);
    y2  = _mm_unpacklo_ps(y2, z2);
    t16 = _mm_unpackhi_ps(x3, y3);
    x3  = _mm_unpacklo_ps(x3, y3);
    t17 = _mm_permute_ps(z3, _MM_SHUFFLE(0, 0, 0, 1));
    t18 = _mm_movehl_ps(z3, z3);
    t19 = _mm_permute_ps(t18, _MM_SHUFFLE(0, 0, 0, 1));
    t20 = _mm_movelh_ps(x1, z1);
    t21 = _mm_movehl_ps(z1, x1);
    t22 = _mm_movelh_ps(t13, t14);
    t14 = _mm_movehl_ps(t14, t13);
    t23 = _mm_movelh_ps(y2, x3);
    t24 = _mm_movehl_ps(x3, y2);
    t25 = _mm_movelh_ps(t15, t16);
    t16 = _mm_movehl_ps(t16, t15);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_load_ss(ptrA+8);
    t1  = _mm_sub_ps(t1, t20);
    t2  = _mm_sub_ps(t2, t23);
    t3  = _mm_sub_ss(t3, z3);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_store_ss(ptrA+8, t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_load_ss(ptrB+8);
    t4  = _mm_sub_ps(t4, t21);
    t5  = _mm_sub_ps(t5, t24);
    t6  = _mm_sub_ss(t6, t17);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_store_ss(ptrB+8, t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_load_ss(ptrC+8);
    t7  = _mm_sub_ps(t7, t22);
    t8  = _mm_sub_ps(t8, t25);
    t9  = _mm_sub_ss(t9, t18);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_store_ss(ptrC+8, t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_load_ss(ptrD+8);
    t10 = _mm_sub_ps(t10, t14);
    t11 = _mm_sub_ps(t11, t16);
    t12 = _mm_sub_ss(t12, t19);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_store_ss(ptrD+8, t12);
}
#endif

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
                                               _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
        __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
        __m128 _t23, _t24; \
        _t13 = _mm_unpackhi_ps(_x1, _y1); \
        _x1  = _mm_unpacklo_ps(_x1, _y1); \
        _t14 = _mm_unpackhi_ps(_z1, _x2); \
        _z1  = _mm_unpacklo_ps(_z1, _x2); \
        _t15 = _mm_unpackhi_ps(_y2, _z2); \
        _y2  = _mm_unpacklo_ps(_y2, _z2); \
        _t16 = _mm_unpackhi_ps(_x3, _y3); \
        _x3  = _mm_unpacklo_ps(_x3, _y3); \
        _t17 = _mm_unpackhi_ps(_z3, _x4); \
        _z3  = _mm_unpacklo_ps(_z3, _x4); \
        _t18 = _mm_unpackhi_ps(_y4, _z4); \
        _y4  = _mm_unpacklo_ps(_y4, _z4); \
        _t19 = _mm_movelh_ps(_x1, _z1); \
        _z1  = _mm_movehl_ps(_z1, _x1); \
        _t20 = _mm_movelh_ps(_t13, _t14); \
        _t14 = _mm_movehl_ps(_t14, _t13); \
        _t21 = _mm_movelh_ps(_y2, _x3); \
        _x3  = _mm_movehl_ps(_x3, _y2); \
        _t22 = _mm_movelh_ps(_t15, _t16); \
        _t16 = _mm_movehl_ps(_t16, _t15); \
        _t23 = _mm_movelh_ps(_z3, _y4); \
        _y4  = _mm_movehl_ps(_y4, _z3); \
        _t24 = _mm_movelh_ps(_t17, _t18); \
        _t18 = _mm_movehl_ps(_t18, _t17); \
        _t1  = _mm_loadu_ps(ptrA); \
        _t2  = _mm_loadu_ps(ptrA+4); \
        _t3  = _mm_loadu_ps(ptrA+8); \
        _t1  = _mm_sub_ps(_t1, _t19); \
        _t2  = _mm_sub_ps(_t2, _t21); \
        _t3  = _mm_sub_ps(_t3, _t23); \
        _mm_storeu_ps(ptrA, _t1); \
        _mm_storeu_ps(ptrA+4, _t2); \
        _mm_storeu_ps(ptrA+8, _t3); \
        _t4  = _mm_loadu_ps(ptrB); \
        _t5  = _mm_loadu_ps(ptrB+4); \
        _t6  = _mm_loadu_ps(ptrB+8); \
        _t4  = _mm_sub_ps(_t4, _z1); \
        _t5  = _mm_sub_ps(_t5, _x3); \
        _t6  = _mm_sub_ps(_t6, _y4); \
        _mm_storeu_ps(ptrB, _t4); \
        _mm_storeu_ps(ptrB+4, _t5); \
        _mm_storeu_ps(ptrB+8, _t6); \
        _t7  = _mm_loadu_ps(ptrC); \
        _t8  = _mm_loadu_ps(ptrC+4); \
        _t9  = _mm_loadu_ps(ptrC+8); \
        _t7  = _mm_sub_ps(_t7, _t20); \
        _t8  = _mm_sub_ps(_t8, _t22); \
        _t9  = _mm_sub_ps(_t9, _t24); \
        _mm_storeu_ps(ptrC, _t7); \
        _mm_storeu_ps(ptrC+4, _t8); \
        _mm_storeu_ps(ptrC+8, _t9); \
        _t10 = _mm_loadu_ps(ptrD); \
        _t11 = _mm_loadu_ps(ptrD+4); \
        _t12 = _mm_loadu_ps(ptrD+8); \
        _t10 = _mm_sub_ps(_t10, _t14); \
        _t11 = _mm_sub_ps(_t11, _t16); \
        _t12 = _mm_sub_ps(_t12, _t18); \
        _mm_storeu_ps(ptrD, _t10); \
        _mm_storeu_ps(ptrD+4, _t11); \
        _mm_storeu_ps(ptrD+8, _t12); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
    __m128 t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22;
    __m128 t23, t24;

    t13 = _mm_unpackhi_ps(x1, y1);
    x1  = _mm_unpacklo_ps(x1, y1);
    t14 = _mm_unpackhi_ps(z1, x2);
    z1  = _mm_unpacklo_ps(z1, x2);
    t15 = _mm_unpackhi_ps(y2, z2);
    y2  = _mm_unpacklo_ps(y2, z2);
    t16 = _mm_unpackhi_ps(x3, y3);
    x3  = _mm_unpacklo_ps(x3, y3);
    t17 = _mm_unpackhi_ps(z3, x4);
    z3  = _mm_unpacklo_ps(z3, x4);
    t18 = _mm_unpackhi_ps(y4, z4);
    y4  = _mm_unpacklo_ps(y4, z4);
    t19 = _mm_movelh_ps(x1, z1);
    z1  = _mm_movehl_ps(z1, x1);
    t20 = _mm_movelh_ps(t13, t14);
    t14 = _mm_movehl_ps(t14, t13);
    t21 = _mm_movelh_ps(y2, x3);
    x3  = _mm_movehl_ps(x3, y2);
    t22 = _mm_movelh_ps(t15, t16);
    t16 = _mm_movehl_ps(t16, t15);
    t23 = _mm_movelh_ps(z3, y4);
    y4  = _mm_movehl_ps(y4, z3);
    t24 = _mm_movelh_ps(t17, t18);
    t18 = _mm_movehl_ps(t18, t17);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_loadu_ps(ptrA+8);
    t1  = _mm_sub_ps(t1, t19);
    t2  = _mm_sub_ps(t2, t21);
    t3  = _mm_sub_ps(t3, t23);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_storeu_ps(ptrA+8, t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_loadu_ps(ptrB+8);
    t4  = _mm_sub_ps(t4, z1);
    t5  = _mm_sub_ps(t5, x3);
    t6  = _mm_sub_ps(t6, y4);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_storeu_ps(ptrB+8, t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_loadu_ps(ptrC+8);
    t7  = _mm_sub_ps(t7, t20);
    t8  = _mm_sub_ps(t8, t22);
    t9  = _mm_sub_ps(t9, t24);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_storeu_ps(ptrC+8, t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_loadu_ps(ptrD+8);
    t10 = _mm_sub_ps(t10, t14);
    t11 = _mm_sub_ps(t11, t16);
    t12 = _mm_sub_ps(t12, t18);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_storeu_ps(ptrD+8, t12);
}
#endif

static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t2, t3;

    fix1 = _mm_hadd_ps(fix1, fix1);
    fiy1 = _mm_hadd_ps(fiy1, fiz1);

    fix1 = _mm_hadd_ps(fix1, fiy1); /* fiz1 fiy1 fix1 fix1 */

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2, (__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3, (__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2, fix1);
    t3 = _mm_add_ps(t3, fix1);

    _mm_store_ss(fptr, t2);
    _mm_storeh_pi((__m64 *)(fptr+1), t2);
    _mm_store_ss(fshiftptr, t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
                                              fptr, fshiftptr) \
    { \
        __m128 _t1, _t2, _t3, _t4; \
        fix1 = _mm_hadd_ps(fix1, fiy1); \
        fiz1 = _mm_hadd_ps(fiz1, fix2); \
        fiy2 = _mm_hadd_ps(fiy2, fiz2); \
        fix3 = _mm_hadd_ps(fix3, fiy3); \
        fiz3 = _mm_hadd_ps(fiz3, fiz3); \
        fix1 = _mm_hadd_ps(fix1, fiz1); \
        fiy2 = _mm_hadd_ps(fiy2, fix3); \
        fiz3 = _mm_hadd_ps(fiz3, fiz3); \
        _mm_storeu_ps(fptr,   _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
        _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
        _t4 = _mm_load_ss(fshiftptr+2); \
        _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
        _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
        _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
        _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
        _t3 = _mm_permute_ps(_t3, _MM_SHUFFLE(1, 2, 0, 0)); \
        _t1 = _mm_add_ps(_t1, _t2); \
        _t3 = _mm_add_ps(_t3, _t4); \
        _t1 = _mm_add_ps(_t1, _t3); \
        _mm_store_ss(fshiftptr+2, _t1); \
        _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4;

    fix1 = _mm_hadd_ps(fix1, fiy1);
    fiz1 = _mm_hadd_ps(fiz1, fix2);
    fiy2 = _mm_hadd_ps(fiy2, fiz2);
    fix3 = _mm_hadd_ps(fix3, fiy3);
    fiz3 = _mm_hadd_ps(fiz3, fiz3);

    fix1 = _mm_hadd_ps(fix1, fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2, fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3, fiz3); /*  -    -    -   fiz3 */

    _mm_storeu_ps(fptr,   _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) ));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_permute_ps(t3, _MM_SHUFFLE(1, 2, 0, 0));         /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3); /* y x - z */

    _mm_store_ss(fshiftptr+2, t1);
    _mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
#endif
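
/* Illustrative usage sketch (hypothetical names): at the end of an i-loop
 * iteration of a three-site (e.g. water) kernel, reduce the per-lane force
 * accumulators for the three i-atoms and add them to the force and shift-force
 * arrays:
 *
 *     gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2,
 *                                           fix3, fiy3, fiz3,
 *                                           f+i_coord_offset, fshift+i_shift_offset);
 */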

#if defined (_MSC_VER) && defined(_M_IX86)
/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
                                              fptr, fshiftptr) \
    { \
        __m128 _t1, _t2, _t3, _t4, _t5; \
        fix1 = _mm_hadd_ps(fix1, fiy1); \
        fiz1 = _mm_hadd_ps(fiz1, fix2); \
        fiy2 = _mm_hadd_ps(fiy2, fiz2); \
        fix3 = _mm_hadd_ps(fix3, fiy3); \
        fiz3 = _mm_hadd_ps(fiz3, fix4); \
        fiy4 = _mm_hadd_ps(fiy4, fiz4); \
        fix1 = _mm_hadd_ps(fix1, fiz1); \
        fiy2 = _mm_hadd_ps(fiy2, fix3); \
        fiz3 = _mm_hadd_ps(fiz3, fiy4); \
        _mm_storeu_ps(fptr,   _mm_add_ps(fix1, _mm_loadu_ps(fptr)  )); \
        _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
        _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
        _t5 = _mm_load_ss(fshiftptr+2); \
        _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
        _t1 = _mm_permute_ps(fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
        _t2 = _mm_permute_ps(fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
        _t3 = _mm_permute_ps(fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
        _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
        _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
        _t1 = _mm_add_ps(_t1, _t2); \
        _t3 = _mm_add_ps(_t3, _t4); \
        _t1 = _mm_add_ps(_t1, _t3); \
        _t5 = _mm_add_ps(_t5, _t1); \
        _mm_store_ss(fshiftptr+2, _t5); \
        _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
    }
#else
/* Real function for sane compilers */
static gmx_inline void
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4, t5;

    fix1 = _mm_hadd_ps(fix1, fiy1);
    fiz1 = _mm_hadd_ps(fiz1, fix2);
    fiy2 = _mm_hadd_ps(fiy2, fiz2);
    fix3 = _mm_hadd_ps(fix3, fiy3);
    fiz3 = _mm_hadd_ps(fiz3, fix4);
    fiy4 = _mm_hadd_ps(fiy4, fiz4);

    fix1 = _mm_hadd_ps(fix1, fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2, fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3, fiy4); /* fiz4 fiy4 fix4 fiz3 */

    _mm_storeu_ps(fptr,   _mm_add_ps(fix1, _mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5, (__m64 *)(fshiftptr));

    t1 = _mm_permute_ps(fix1, _MM_SHUFFLE(1, 0, 2, 2));
    t2 = _mm_permute_ps(fiy2, _MM_SHUFFLE(3, 2, 1, 1));
    t3 = _mm_permute_ps(fiz3, _MM_SHUFFLE(2, 1, 0, 0));
    t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3));
    t4 = _mm_shuffle_ps(fiz3, t4, _MM_SHUFFLE(2, 0, 3, 3));

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3);
    t5 = _mm_add_ps(t5, t1);

    _mm_store_ss(fshiftptr+2, t5);
    _mm_storeh_pi((__m64 *)(fshiftptr), t5);
}
#endif

static gmx_inline void
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_ps(pot1, pot1);
    pot1 = _mm_hadd_ps(pot1, pot1);
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}

static gmx_inline void
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_ps(pot1, pot2);
    pot1 = _mm_hadd_ps(pot1, pot1);
    pot2 = _mm_permute_ps(pot1, _MM_SHUFFLE(0, 0, 0, 1));
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
    _mm_store_ss(ptrB, _mm_add_ss(pot2, _mm_load_ss(ptrB)));
}
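
/* Illustrative usage sketch (hypothetical names): after the j-loop, the four
 * lanes of the Coulomb and VdW energy accumulators are summed and added to the
 * energy group buffers in one call, e.g.
 *
 *     gmx_mm_update_2pot_ps(velecsum, kernel_data->energygrp_elec+ggid,
 *                           vvdwsum,  kernel_data->energygrp_vdw+ggid);
 *
 * velecsum/vvdwsum and the kernel_data fields above are assumptions made for
 * the example, not definitions from this header.
 */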

#endif /* _kernelutil_x86_avx_128_fma_single_h_ */