/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_128_fma_single_h_
#define _kernelutil_x86_avx_128_fma_single_h_

#include <math.h>

#include <immintrin.h>
#ifdef _MSC_VER
#    include <intrin.h>
#else
#    include <x86intrin.h>
#endif

#define gmx_mm_castsi128_ps   _mm_castsi128_ps
#define gmx_mm_extract_epi32  _mm_extract_epi32

/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
#    define gmx_mm_maskload_ps(mem, mask)        _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
#    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
#    define gmx_mm256_maskload_ps(mem, mask)     _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask))
#    define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), _mm256_castsi256_ps(mask), (x))
#else
#    define gmx_mm_maskload_ps(mem, mask)        _mm_maskload_ps((mem), (mask))
#    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), (mask), (x))
#    define gmx_mm256_maskload_ps(mem, mask)     _mm256_maskload_ps((mem), (mask))
#    define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), (mask), (x))
#endif
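
/* Usage sketch (illustrative only, not part of the kernel API): with a mask
 * of {-1, -1, -1, 0} the maskload wrapper fetches only the x, y and z floats
 * of an rvec, so the fourth float past the pointer is never dereferenced.
 * This mirrors what the rvec loaders further down do. */
static gmx_inline __m128 gmx_simdcall
gmx_mm_example_maskload_rvec_ps(const float * gmx_restrict p)
{
    __m128i mask = _mm_set_epi32(0, -1, -1, -1); /* lanes 0-2 active */
    return gmx_mm_maskload_ps(p, mask);
}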

/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3)  _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))

static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a, b));
}

static gmx_inline __m128 gmx_simdcall
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_macc_ps(dx, dx, _mm_macc_ps(dy, dy, _mm_mul_ps(dz, dz)));
}
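
/* Usage sketch (illustrative only; the broadcast squared cutoff rcut2 is an
 * assumption about the caller): the typical cutoff test combines
 * gmx_mm_calc_rsq_ps() with gmx_mm_any_lt(). */
static gmx_inline int gmx_simdcall
gmx_mm_example_within_cutoff(__m128 dx, __m128 dy, __m128 dz, __m128 rcut2)
{
    __m128 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
    return gmx_mm_any_lt(rsq, rcut2); /* nonzero if any of the four pairs is inside */
}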

/* Load a single value from 1-4 places, merge into xmm register */

static gmx_inline __m128 gmx_simdcall
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    __m128 t1, t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA), _mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB), _mm_load_ss(ptrD));
    return _mm_unpacklo_ps(t1, t2);
}
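
/* Usage sketch (illustrative only; 'charge' and the j-indices are
 * hypothetical): gather one scattered value per j-atom into a single
 * register laid out as {qA, qB, qC, qD}. */
static gmx_inline __m128 gmx_simdcall
gmx_mm_example_gather_charges(const float * gmx_restrict charge,
                              int jnrA, int jnrB, int jnrC, int jnrD)
{
    return gmx_mm_load_4real_swizzle_ps(charge+jnrA, charge+jnrB,
                                        charge+jnrC, charge+jnrD);
}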

static gmx_inline void gmx_simdcall
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 t2, t3, t4;

    t2 = _mm_permute_ps(xmm1, _MM_SHUFFLE(1, 1, 1, 1));
    t3 = _mm_permute_ps(xmm1, _MM_SHUFFLE(2, 2, 2, 2));
    t4 = _mm_permute_ps(xmm1, _MM_SHUFFLE(3, 3, 3, 3));
    _mm_store_ss(ptrA, xmm1);
    _mm_store_ss(ptrB, t2);
    _mm_store_ss(ptrC, t3);
    _mm_store_ss(ptrD, t4);
}

static gmx_inline void gmx_simdcall
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    __m128 tmp;

    tmp = gmx_mm_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
    tmp = _mm_add_ps(tmp, xmm1);
    gmx_mm_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, tmp);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict c6, __m128 * gmx_restrict c12)
{
    __m128 t1, t2, t3, t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p1);
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p2);
    t3   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p3);
    t4   = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)p4);
    t1   = _mm_unpacklo_ps(t1, t3);
    t2   = _mm_unpacklo_ps(t2, t4);
    *c6  = _mm_unpacklo_ps(t1, t2);
    *c12 = _mm_unpackhi_ps(t1, t2);
}
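
/* Usage sketch (illustrative only; the {c6, c12}-pair layout of 'vdwparam'
 * and the type offsets are assumptions): fetch the Lennard-Jones parameters
 * of four interactions in one call. */
static gmx_inline void gmx_simdcall
gmx_mm_example_gather_lj_ps(const float * gmx_restrict vdwparam,
                            int tjA, int tjB, int tjC, int tjD,
                            __m128 * gmx_restrict c6, __m128 * gmx_restrict c12)
{
    gmx_mm_load_4pair_swizzle_ps(vdwparam+tjA, vdwparam+tjB,
                                 vdwparam+tjC, vdwparam+tjD, c6, c12);
}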

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1,
                                         __m128 * gmx_restrict y1,
                                         __m128 * gmx_restrict z1)
{
    __m128 t1, t2, t3, t4;

    t1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    t2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz);
    t3 = _mm_load_ss(xyz_shift+2);
    t4 = _mm_load_ss(xyz+2);
    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ss(t3, t4);

    *x1 = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
}

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_load_ss(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ss(t3, t6);

    *x1 = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2 = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2 = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2 = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3 = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3 = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3 = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
}

static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    tA = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)xyz_shift);
    tB = _mm_load_ss(xyz_shift+2);

    t1 = _mm_loadu_ps(xyz);
    t2 = _mm_loadu_ps(xyz+4);
    t3 = _mm_loadu_ps(xyz+8);

    tA = _mm_movelh_ps(tA, tB);
    t4 = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0));
    t5 = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1));
    t6 = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2));

    t1 = _mm_add_ps(t1, t4);
    t2 = _mm_add_ps(t2, t5);
    t3 = _mm_add_ps(t3, t6);

    *x1 = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1 = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1 = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2 = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2 = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2 = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3 = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3 = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3 = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
    *x4 = _mm_permute_ps(t3, _MM_SHUFFLE(1, 1, 1, 1));
    *y4 = _mm_permute_ps(t3, _MM_SHUFFLE(2, 2, 2, 2));
    *z4 = _mm_permute_ps(t3, _MM_SHUFFLE(3, 3, 3, 3));
}

static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1)
{
    __m128  t1, t2, t3, t4;
    __m128i mask = _mm_set_epi32(0, -1, -1, -1);
    t1 = gmx_mm_maskload_ps(ptrA, mask);
    t2 = gmx_mm_maskload_ps(ptrB, mask);
    t3 = gmx_mm_maskload_ps(ptrC, mask);
    t4 = gmx_mm_maskload_ps(ptrD, mask);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
}

static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 t1, t2, t3, t4;
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrB);
    t3  = _mm_loadu_ps(ptrC);
    t4  = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1  = _mm_loadu_ps(ptrA+4);
    t2  = _mm_loadu_ps(ptrB+4);
    t3  = _mm_loadu_ps(ptrC+4);
    t4  = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1  = _mm_load_ss(ptrA+8);
    t2  = _mm_load_ss(ptrB+8);
    t3  = _mm_load_ss(ptrC+8);
    t4  = _mm_load_ss(ptrD+8);
    t1  = _mm_unpacklo_ps(t1, t3);
    t3  = _mm_unpacklo_ps(t2, t4);
    *z3 = _mm_unpacklo_ps(t1, t3);
}

static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 t1, t2, t3, t4;
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrB);
    t3  = _mm_loadu_ps(ptrC);
    t4  = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *x1 = t1;
    *y1 = t2;
    *z1 = t3;
    *x2 = t4;
    t1  = _mm_loadu_ps(ptrA+4);
    t2  = _mm_loadu_ps(ptrB+4);
    t3  = _mm_loadu_ps(ptrC+4);
    t4  = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *y2 = t1;
    *z2 = t2;
    *x3 = t3;
    *y3 = t4;
    t1  = _mm_loadu_ps(ptrA+8);
    t2  = _mm_loadu_ps(ptrB+8);
    t3  = _mm_loadu_ps(ptrC+8);
    t4  = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1, t2, t3, t4);
    *z3 = t1;
    *x4 = t2;
    *y4 = t3;
    *z4 = t4;
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    t5  = _mm_unpacklo_ps(y1, z1);
    t6  = _mm_unpackhi_ps(y1, z1);
    t7  = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(1, 0, 0, 0));
    t8  = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(3, 2, 0, 1));
    t9  = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(1, 0, 0, 2));
    t10 = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(3, 2, 0, 3));
    t1  = _mm_load_ss(ptrA);
    t1  = _mm_loadh_pi(t1, (__m64 *)(ptrA+1));
    t1  = _mm_sub_ps(t1, t7);
    _mm_store_ss(ptrA, t1);
    _mm_storeh_pi((__m64 *)(ptrA+1), t1);
    t2  = _mm_load_ss(ptrB);
    t2  = _mm_loadh_pi(t2, (__m64 *)(ptrB+1));
    t2  = _mm_sub_ps(t2, t8);
    _mm_store_ss(ptrB, t2);
    _mm_storeh_pi((__m64 *)(ptrB+1), t2);
    t3  = _mm_load_ss(ptrC);
    t3  = _mm_loadh_pi(t3, (__m64 *)(ptrC+1));
    t3  = _mm_sub_ps(t3, t9);
    _mm_store_ss(ptrC, t3);
    _mm_storeh_pi((__m64 *)(ptrC+1), t3);
    t4  = _mm_load_ss(ptrD);
    t4  = _mm_loadh_pi(t4, (__m64 *)(ptrD+1));
    t4  = _mm_sub_ps(t4, t10);
    _mm_store_ss(ptrD, t4);
    _mm_storeh_pi((__m64 *)(ptrD+1), t4);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128 t11, t12, t13, t14, t15, t16, t17, t18, t19;
    __m128 t20, t21, t22, t23, t24, t25;
    t13 = _mm_unpackhi_ps(x1, y1);
    x1  = _mm_unpacklo_ps(x1, y1);
    t14 = _mm_unpackhi_ps(z1, x2);
    z1  = _mm_unpacklo_ps(z1, x2);
    t15 = _mm_unpackhi_ps(y2, z2);
    y2  = _mm_unpacklo_ps(y2, z2);
    t16 = _mm_unpackhi_ps(x3, y3);
    x3  = _mm_unpacklo_ps(x3, y3);
    t17 = _mm_permute_ps(z3, _MM_SHUFFLE(0, 0, 0, 1));
    t18 = _mm_movehl_ps(z3, z3);
    t19 = _mm_permute_ps(t18, _MM_SHUFFLE(0, 0, 0, 1));
    t20 = _mm_movelh_ps(x1, z1);
    t21 = _mm_movehl_ps(z1, x1);
    t22 = _mm_movelh_ps(t13, t14);
    t14 = _mm_movehl_ps(t14, t13);
    t23 = _mm_movelh_ps(y2, x3);
    t24 = _mm_movehl_ps(x3, y2);
    t25 = _mm_movelh_ps(t15, t16);
    t16 = _mm_movehl_ps(t16, t15);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_load_ss(ptrA+8);
    t1  = _mm_sub_ps(t1, t20);
    t2  = _mm_sub_ps(t2, t23);
    t3  = _mm_sub_ss(t3, z3);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_store_ss(ptrA+8, t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_load_ss(ptrB+8);
    t4  = _mm_sub_ps(t4, t21);
    t5  = _mm_sub_ps(t5, t24);
    t6  = _mm_sub_ss(t6, t17);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_store_ss(ptrB+8, t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_load_ss(ptrC+8);
    t7  = _mm_sub_ps(t7, t22);
    t8  = _mm_sub_ps(t8, t25);
    t9  = _mm_sub_ss(t9, t18);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_store_ss(ptrC+8, t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_load_ss(ptrD+8);
    t10 = _mm_sub_ps(t10, t14);
    t11 = _mm_sub_ps(t11, t16);
    t12 = _mm_sub_ss(t12, t19);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_store_ss(ptrD+8, t12);
}

static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
    __m128 t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22;
    __m128 t23, t24;

    t13 = _mm_unpackhi_ps(x1, y1);
    x1  = _mm_unpacklo_ps(x1, y1);
    t14 = _mm_unpackhi_ps(z1, x2);
    z1  = _mm_unpacklo_ps(z1, x2);
    t15 = _mm_unpackhi_ps(y2, z2);
    y2  = _mm_unpacklo_ps(y2, z2);
    t16 = _mm_unpackhi_ps(x3, y3);
    x3  = _mm_unpacklo_ps(x3, y3);
    t17 = _mm_unpackhi_ps(z3, x4);
    z3  = _mm_unpacklo_ps(z3, x4);
    t18 = _mm_unpackhi_ps(y4, z4);
    y4  = _mm_unpacklo_ps(y4, z4);
    t19 = _mm_movelh_ps(x1, z1);
    z1  = _mm_movehl_ps(z1, x1);
    t20 = _mm_movelh_ps(t13, t14);
    t14 = _mm_movehl_ps(t14, t13);
    t21 = _mm_movelh_ps(y2, x3);
    x3  = _mm_movehl_ps(x3, y2);
    t22 = _mm_movelh_ps(t15, t16);
    t16 = _mm_movehl_ps(t16, t15);
    t23 = _mm_movelh_ps(z3, y4);
    y4  = _mm_movehl_ps(y4, z3);
    t24 = _mm_movelh_ps(t17, t18);
    t18 = _mm_movehl_ps(t18, t17);
    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrA+4);
    t3  = _mm_loadu_ps(ptrA+8);
    t1  = _mm_sub_ps(t1, t19);
    t2  = _mm_sub_ps(t2, t21);
    t3  = _mm_sub_ps(t3, t23);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_storeu_ps(ptrA+8, t3);
    t4  = _mm_loadu_ps(ptrB);
    t5  = _mm_loadu_ps(ptrB+4);
    t6  = _mm_loadu_ps(ptrB+8);
    t4  = _mm_sub_ps(t4, z1);
    t5  = _mm_sub_ps(t5, x3);
    t6  = _mm_sub_ps(t6, y4);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_storeu_ps(ptrB+8, t6);
    t7  = _mm_loadu_ps(ptrC);
    t8  = _mm_loadu_ps(ptrC+4);
    t9  = _mm_loadu_ps(ptrC+8);
    t7  = _mm_sub_ps(t7, t20);
    t8  = _mm_sub_ps(t8, t22);
    t9  = _mm_sub_ps(t9, t24);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_storeu_ps(ptrC+8, t9);
    t10 = _mm_loadu_ps(ptrD);
    t11 = _mm_loadu_ps(ptrD+4);
    t12 = _mm_loadu_ps(ptrD+8);
    t10 = _mm_sub_ps(t10, t14);
    t11 = _mm_sub_ps(t11, t16);
    t12 = _mm_sub_ps(t12, t18);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_storeu_ps(ptrD+8, t12);
}

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t2, t3;

    fix1 = _mm_hadd_ps(fix1, fix1);
    fiy1 = _mm_hadd_ps(fiy1, fiz1);

    fix1 = _mm_hadd_ps(fix1, fiy1); /* fiz1 fiy1 fix1 fix1 */

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2, (__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3, (__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2, fix1);
    t3 = _mm_add_ps(t3, fix1);

    _mm_store_ss(fptr, t2);
    _mm_storeh_pi((__m64 *)(fptr+1), t2);
    _mm_store_ss(fshiftptr, t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4;

    fix1 = _mm_hadd_ps(fix1, fiy1);
    fiz1 = _mm_hadd_ps(fiz1, fix2);
    fiy2 = _mm_hadd_ps(fiy2, fiz2);
    fix3 = _mm_hadd_ps(fix3, fiy3);
    fiz3 = _mm_hadd_ps(fiz3, fiz3);

    fix1 = _mm_hadd_ps(fix1, fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2, fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3, fiz3); /*  -    -    -   fiz3 */

    _mm_storeu_ps(fptr,   _mm_add_ps(fix1, _mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8)));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_permute_ps(t3, _MM_SHUFFLE(1, 2, 0, 0));         /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3); /* y x - z */

    _mm_store_ss(fshiftptr+2, t1);
    _mm_storeh_pi((__m64 *)(fshiftptr), t1);
}

static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 t1, t2, t3, t4, t5;

    fix1 = _mm_hadd_ps(fix1, fiy1);
    fiz1 = _mm_hadd_ps(fiz1, fix2);
    fiy2 = _mm_hadd_ps(fiy2, fiz2);
    fix3 = _mm_hadd_ps(fix3, fiy3);
    fiz3 = _mm_hadd_ps(fiz3, fix4);
    fiy4 = _mm_hadd_ps(fiy4, fiz4);

    fix1 = _mm_hadd_ps(fix1, fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2, fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3, fiy4); /* fiz4 fiy4 fix4 fiz3 */

    _mm_storeu_ps(fptr,   _mm_add_ps(fix1, _mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5, (__m64 *)(fshiftptr));

    t1 = _mm_permute_ps(fix1, _MM_SHUFFLE(1, 0, 2, 2));
    t2 = _mm_permute_ps(fiy2, _MM_SHUFFLE(3, 2, 1, 1));
    t3 = _mm_permute_ps(fiz3, _MM_SHUFFLE(2, 1, 0, 0));
    t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3));
    t4 = _mm_shuffle_ps(fiz3, t4, _MM_SHUFFLE(2, 0, 3, 3));

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3);
    t5 = _mm_add_ps(t5, t1);

    _mm_store_ss(fshiftptr+2, t5);
    _mm_storeh_pi((__m64 *)(fshiftptr), t5);
}

static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
    pot1 = _mm_hadd_ps(pot1, pot1);
    pot1 = _mm_hadd_ps(pot1, pot1);
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}

static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB)
{
    pot1 = _mm_hadd_ps(pot1, pot2);
    pot1 = _mm_hadd_ps(pot1, pot1);
    pot2 = _mm_permute_ps(pot1, _MM_SHUFFLE(0, 0, 0, 1));
    _mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
    _mm_store_ss(ptrB, _mm_add_ss(pot2, _mm_load_ss(ptrB)));
}
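
/* Usage sketch (illustrative only; the accumulator and buffer names are
 * hypothetical): at the end of a kernel pass, fold the per-lane Coulomb and
 * VdW energy accumulators into their output buffers in one call. */
static gmx_inline void gmx_simdcall
gmx_mm_example_store_energies_ps(__m128 velecsum, float * gmx_restrict velecgrp,
                                 __m128 vvdwsum, float * gmx_restrict vvdwgrp)
{
    gmx_mm_update_2pot_ps(velecsum, velecgrp, vvdwsum, vvdwgrp);
}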

#endif /* _kernelutil_x86_avx_128_fma_single_h_ */