/*
 * This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifndef _kernelutil_x86_avx_256_single_h_
#define _kernelutil_x86_avx_256_single_h_

#include "gmx_x86_avx_256.h"
/* Transpose lower/upper half of 256-bit registers separately */
#define GMX_MM256_HALFTRANSPOSE4_PS(ymm0,ymm1,ymm2,ymm3) {            \
    __m256 __tmp0,__tmp1,__tmp2,__tmp3;                               \
                                                                      \
    __tmp0 = _mm256_unpacklo_ps((ymm0),(ymm1));                       \
    __tmp1 = _mm256_unpacklo_ps((ymm2),(ymm3));                       \
    __tmp2 = _mm256_unpackhi_ps((ymm0),(ymm1));                       \
    __tmp3 = _mm256_unpackhi_ps((ymm2),(ymm3));                       \
    ymm0   = _mm256_shuffle_ps(__tmp0,__tmp1,_MM_SHUFFLE(1,0,1,0));   \
    ymm1   = _mm256_shuffle_ps(__tmp0,__tmp1,_MM_SHUFFLE(3,2,3,2));   \
    ymm2   = _mm256_shuffle_ps(__tmp2,__tmp3,_MM_SHUFFLE(1,0,1,0));   \
    ymm3   = _mm256_shuffle_ps(__tmp2,__tmp3,_MM_SHUFFLE(3,2,3,2));   \
}
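/*
 * Descriptive note (added for clarity, not from the original source): within
 * each 128-bit lane the macro performs a standard 4x4 transpose. If the low
 * lanes of ymm0..ymm3 initially hold the rows
 *
 *     ymm0: a0 a1 a2 a3    ymm1: b0 b1 b2 b3
 *     ymm2: c0 c1 c2 c3    ymm3: d0 d1 d2 d3
 *
 * they afterwards hold the columns
 *
 *     ymm0: a0 b0 c0 d0    ymm1: a1 b1 c1 d1
 *     ymm2: a2 b2 c2 d2    ymm3: a3 b3 c3 d3
 *
 * and the upper lanes are transposed independently in the same way.
 */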
static gmx_inline __m256
gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz)
{
    return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx,dx), _mm256_mul_ps(dy,dy) ), _mm256_mul_ps(dz,dz) );
}
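/*
 * Usage sketch (illustrative only; the variable names are hypothetical):
 * with i- and j-particle coordinates already loaded into ymm registers, the
 * squared distance for all eight interactions is obtained as
 *
 *     __m256 dx  = _mm256_sub_ps(ix1, jx1);
 *     __m256 dy  = _mm256_sub_ps(iy1, jy1);
 *     __m256 dz  = _mm256_sub_ps(iz1, jz1);
 *     __m256 rsq = gmx_mm256_calc_rsq_ps(dx, dy, dz);
 */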
/* Normal sum of four ymm registers */
#define gmx_mm256_sum4_ps(t0,t1,t2,t3)  _mm256_add_ps(_mm256_add_ps(t0,t1),_mm256_add_ps(t2,t3))
static gmx_inline int
gmx_mm256_any_lt(__m256 a, __m256 b)
{
    return _mm256_movemask_ps(_mm256_cmp_ps(a,b,_CMP_LT_OQ));
}
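/*
 * Usage sketch (illustrative only): the return value is the movemask of the
 * comparison, so it is nonzero when at least one element of a is less than
 * the corresponding element of b, e.g.
 *
 *     if (gmx_mm256_any_lt(rsq, rcutoff2))   /- rcutoff2 is a hypothetical name -/
 *     {
 *         /- at least one of the eight pairs is inside the cutoff -/
 *     }
 */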
static gmx_inline __m256
gmx_mm256_load_4real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                const float * gmx_restrict ptrC, const float * gmx_restrict ptrD)
{
    __m128 t1,t2;

    t1 = _mm_unpacklo_ps(_mm_load_ss(ptrA),_mm_load_ss(ptrC));
    t2 = _mm_unpacklo_ps(_mm_load_ss(ptrB),_mm_load_ss(ptrD));
    return _mm256_castps128_ps256(_mm_unpacklo_ps(t1,t2));
}
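/* Note (added for clarity): the four scalars end up in the lower 128-bit lane
 * in the order { *ptrA, *ptrB, *ptrC, *ptrD }; the contents of the upper lane
 * are not defined by the cast and should not be relied upon by the caller.
 */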
static gmx_inline __m256
gmx_mm256_load_8real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                const float * gmx_restrict ptrG, const float * gmx_restrict ptrH)
{
    __m256 t1,t2;

    t1 = gmx_mm256_load_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD);
    t2 = gmx_mm256_load_4real_swizzle_ps(ptrE,ptrF,ptrG,ptrH);

    return _mm256_permute2f128_ps(t1,t2,0x20);
}
static gmx_inline void
gmx_mm256_store_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                 float * gmx_restrict ptrC, float * gmx_restrict ptrD, __m256 xmm1)
{
    __m256 t2,t3,t4;

    t2 = _mm256_permute_ps(xmm1,_MM_SHUFFLE(1,1,1,1));
    t3 = _mm256_permute_ps(xmm1,_MM_SHUFFLE(2,2,2,2));
    t4 = _mm256_permute_ps(xmm1,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(ptrA,_mm256_castps256_ps128(xmm1));
    _mm_store_ss(ptrB,_mm256_castps256_ps128(t2));
    _mm_store_ss(ptrC,_mm256_castps256_ps128(t3));
    _mm_store_ss(ptrD,_mm256_castps256_ps128(t4));
}
static gmx_inline void
gmx_mm256_store_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                 float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                 float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                 float * gmx_restrict ptrG, float * gmx_restrict ptrH, __m256 xmm1)
{
    __m256 t1;

    t1 = _mm256_permute2f128_ps(xmm1,xmm1,0x11);

    gmx_mm256_store_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,xmm1);
    gmx_mm256_store_4real_swizzle_ps(ptrE,ptrF,ptrG,ptrH,t1);
}
static gmx_inline void
gmx_mm256_increment_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                     float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                     __m256 xmm1)
{
    __m128 t1,t2,t3,t4;

    t1 = _mm256_castps256_ps128(xmm1);
    t2 = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    t3 = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
    t4 = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));

    t1 = _mm_add_ss(t1,_mm_load_ss(ptrA));
    t2 = _mm_add_ss(t2,_mm_load_ss(ptrB));
    t3 = _mm_add_ss(t3,_mm_load_ss(ptrC));
    t4 = _mm_add_ss(t4,_mm_load_ss(ptrD));

    _mm_store_ss(ptrA,t1);
    _mm_store_ss(ptrB,t2);
    _mm_store_ss(ptrC,t3);
    _mm_store_ss(ptrD,t4);
}
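/* Note (added for clarity): only the lower 128-bit lane of xmm1 is used here;
 * element 0 is added to *ptrA, element 1 to *ptrB, element 2 to *ptrC and
 * element 3 to *ptrD. The 8-wide variant below forwards the upper lane to
 * this routine for ptrE..ptrH.
 */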
static gmx_inline void
gmx_mm256_increment_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                     float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                     float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                     float * gmx_restrict ptrG, float * gmx_restrict ptrH,
                                     __m256 xmm1)
{
    __m256 t1;

    t1 = _mm256_permute2f128_ps(xmm1,xmm1,0x11);

    gmx_mm256_increment_4real_swizzle_ps(ptrA,ptrB,ptrC,ptrD,xmm1);
    gmx_mm256_increment_4real_swizzle_ps(ptrE,ptrF,ptrG,ptrH,t1);
}
static gmx_inline void
gmx_mm256_load_4pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
                                const float * gmx_restrict p3, const float * gmx_restrict p4,
                                __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
{
    __m128 t1,t2,t3,t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p1);   /* - - c12a c6a */
    t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p2);   /* - - c12b c6b */
    t3   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p3);   /* - - c12c c6c */
    t4   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)p4);   /* - - c12d c6d */

    t1   = _mm_unpacklo_ps(t1,t2);                       /* c12b c12a c6b c6a */
    t3   = _mm_unpacklo_ps(t3,t4);                       /* c12d c12c c6d c6c */

    *c6  = _mm256_castps128_ps256(_mm_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0)));
    *c12 = _mm256_castps128_ps256(_mm_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2)));
}
static gmx_inline void
gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
                                const float * gmx_restrict p3, const float * gmx_restrict p4,
                                const float * gmx_restrict p5, const float * gmx_restrict p6,
                                const float * gmx_restrict p7, const float * gmx_restrict p8,
                                __m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
{
    __m256 c6l,c6h,c12l,c12h;

    gmx_mm256_load_4pair_swizzle_ps(p1,p2,p3,p4,&c6l,&c12l);
    gmx_mm256_load_4pair_swizzle_ps(p5,p6,p7,p8,&c6h,&c12h);

    *c6  = _mm256_permute2f128_ps(c6l,c6h,0x20);
    *c12 = _mm256_permute2f128_ps(c12l,c12h,0x20);
}
static gmx_inline void
gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                            const float * gmx_restrict xyz,
                                            __m256 * gmx_restrict x1,
                                            __m256 * gmx_restrict y1,
                                            __m256 * gmx_restrict z1)
{
    __m128 t1,t2,t3,t4;

    t1   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    t2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz);
    t3   = _mm_load_ss(xyz_shift+2);
    t4   = _mm_load_ss(xyz+2);
    t1   = _mm_add_ps(t1,t2);
    t3   = _mm_add_ss(t3,t4);

    t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    t1   = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));
    t3   = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));

    *x1  = gmx_mm256_set_m128(t1,t1);
    *y1  = gmx_mm256_set_m128(t2,t2);
    *z1  = gmx_mm256_set_m128(t3,t3);
}
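/* Note (added for clarity): on return each output holds one shifted component
 * broadcast to all eight elements, i.e. *x1 = { x+shx, x+shx, ..., x+shx } and
 * correspondingly for *y1 and *z1 (shx being shorthand for xyz_shift[0]), so a
 * single i-particle can be processed against eight j-particles at a time.
 */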
static gmx_inline void
gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                            const float * gmx_restrict xyz,
                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
{
    __m128 tA,tB;
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9;

    tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    tB   = _mm_load_ss(xyz_shift+2);

    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_load_ss(xyz+8);

    tA   = _mm_movelh_ps(tA,tB);
    t4   = _mm_permute_ps(tA,_MM_SHUFFLE(0,2,1,0));
    t5   = _mm_permute_ps(tA,_MM_SHUFFLE(1,0,2,1));
    t6   = _mm_permute_ps(tA,_MM_SHUFFLE(2,1,0,2));

    t1   = _mm_add_ps(t1,t4);
    t2   = _mm_add_ps(t2,t5);
    t3   = _mm_add_ss(t3,t6);

    t9   = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
    t8   = _mm_permute_ps(t2,_MM_SHUFFLE(3,3,3,3));
    t7   = _mm_permute_ps(t2,_MM_SHUFFLE(2,2,2,2));
    t6   = _mm_permute_ps(t2,_MM_SHUFFLE(1,1,1,1));
    t5   = _mm_permute_ps(t2,_MM_SHUFFLE(0,0,0,0));
    t4   = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
    t3   = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
    t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    t1   = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));

    *x1  = gmx_mm256_set_m128(t1,t1);
    *y1  = gmx_mm256_set_m128(t2,t2);
    *z1  = gmx_mm256_set_m128(t3,t3);
    *x2  = gmx_mm256_set_m128(t4,t4);
    *y2  = gmx_mm256_set_m128(t5,t5);
    *z2  = gmx_mm256_set_m128(t6,t6);
    *x3  = gmx_mm256_set_m128(t7,t7);
    *y3  = gmx_mm256_set_m128(t8,t8);
    *z3  = gmx_mm256_set_m128(t9,t9);
}
static gmx_inline void
gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                            const float * gmx_restrict xyz,
                                            __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                            __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                            __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
                                            __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
{
    __m128 tA,tB;
    __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;

    tA   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)xyz_shift);
    tB   = _mm_load_ss(xyz_shift+2);

    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_loadu_ps(xyz+8);

    tA   = _mm_movelh_ps(tA,tB);
    t4   = _mm_permute_ps(tA,_MM_SHUFFLE(0,2,1,0));
    t5   = _mm_permute_ps(tA,_MM_SHUFFLE(1,0,2,1));
    t6   = _mm_permute_ps(tA,_MM_SHUFFLE(2,1,0,2));

    t1   = _mm_add_ps(t1,t4);
    t2   = _mm_add_ps(t2,t5);
    t3   = _mm_add_ps(t3,t6);

    t12  = _mm_permute_ps(t3,_MM_SHUFFLE(3,3,3,3));
    t11  = _mm_permute_ps(t3,_MM_SHUFFLE(2,2,2,2));
    t10  = _mm_permute_ps(t3,_MM_SHUFFLE(1,1,1,1));
    t9   = _mm_permute_ps(t3,_MM_SHUFFLE(0,0,0,0));
    t8   = _mm_permute_ps(t2,_MM_SHUFFLE(3,3,3,3));
    t7   = _mm_permute_ps(t2,_MM_SHUFFLE(2,2,2,2));
    t6   = _mm_permute_ps(t2,_MM_SHUFFLE(1,1,1,1));
    t5   = _mm_permute_ps(t2,_MM_SHUFFLE(0,0,0,0));
    t4   = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
    t3   = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
    t2   = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    t1   = _mm_permute_ps(t1,_MM_SHUFFLE(0,0,0,0));

    *x1  = gmx_mm256_set_m128(t1,t1);
    *y1  = gmx_mm256_set_m128(t2,t2);
    *z1  = gmx_mm256_set_m128(t3,t3);
    *x2  = gmx_mm256_set_m128(t4,t4);
    *y2  = gmx_mm256_set_m128(t5,t5);
    *z2  = gmx_mm256_set_m128(t6,t6);
    *x3  = gmx_mm256_set_m128(t7,t7);
    *y3  = gmx_mm256_set_m128(t8,t8);
    *z3  = gmx_mm256_set_m128(t9,t9);
    *x4  = gmx_mm256_set_m128(t10,t10);
    *y4  = gmx_mm256_set_m128(t11,t11);
    *z4  = gmx_mm256_set_m128(t12,t12);
}
static gmx_inline void
gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
{
    __m128  t1,t2,t3,t4;
    __m128i mask = _mm_set_epi32(0,-1,-1,-1);

    t1  = gmx_mm_maskload_ps(ptrA,mask);
    t2  = gmx_mm_maskload_ps(ptrB,mask);
    t3  = gmx_mm_maskload_ps(ptrC,mask);
    t4  = gmx_mm_maskload_ps(ptrD,mask);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = _mm256_castps128_ps256(t1);
    *y1 = _mm256_castps128_ps256(t2);
    *z1 = _mm256_castps128_ps256(t3);
}
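/* Note (added for clarity): after the 4x4 transpose the lower lanes contain
 * { xA xB xC xD }, { yA yB yC yD } and { zA zB zC zD } in *x1, *y1 and *z1;
 * the upper 128-bit lanes are not defined by the cast.
 */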
static gmx_inline void
gmx_mm256_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
{
    __m128 t1,t2,t3,t4;

    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrB);
    t3  = _mm_loadu_ps(ptrC);
    t4  = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = _mm256_castps128_ps256(t1);
    *y1 = _mm256_castps128_ps256(t2);
    *z1 = _mm256_castps128_ps256(t3);
    *x2 = _mm256_castps128_ps256(t4);
    t1  = _mm_loadu_ps(ptrA+4);
    t2  = _mm_loadu_ps(ptrB+4);
    t3  = _mm_loadu_ps(ptrC+4);
    t4  = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *y2 = _mm256_castps128_ps256(t1);
    *z2 = _mm256_castps128_ps256(t2);
    *x3 = _mm256_castps128_ps256(t3);
    *y3 = _mm256_castps128_ps256(t4);
    t1  = _mm_load_ss(ptrA+8);
    t2  = _mm_load_ss(ptrB+8);
    t3  = _mm_load_ss(ptrC+8);
    t4  = _mm_load_ss(ptrD+8);
    t1  = _mm_unpacklo_ps(t1,t3);
    t3  = _mm_unpacklo_ps(t2,t4);
    *z3 = _mm256_castps128_ps256(_mm_unpacklo_ps(t1,t3));
}
static gmx_inline void
gmx_mm256_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
                                     __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
{
    __m128 t1,t2,t3,t4;

    t1  = _mm_loadu_ps(ptrA);
    t2  = _mm_loadu_ps(ptrB);
    t3  = _mm_loadu_ps(ptrC);
    t4  = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *x1 = _mm256_castps128_ps256(t1);
    *y1 = _mm256_castps128_ps256(t2);
    *z1 = _mm256_castps128_ps256(t3);
    *x2 = _mm256_castps128_ps256(t4);
    t1  = _mm_loadu_ps(ptrA+4);
    t2  = _mm_loadu_ps(ptrB+4);
    t3  = _mm_loadu_ps(ptrC+4);
    t4  = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *y2 = _mm256_castps128_ps256(t1);
    *z2 = _mm256_castps128_ps256(t2);
    *x3 = _mm256_castps128_ps256(t3);
    *y3 = _mm256_castps128_ps256(t4);
    t1  = _mm_loadu_ps(ptrA+8);
    t2  = _mm_loadu_ps(ptrB+8);
    t3  = _mm_loadu_ps(ptrC+8);
    t4  = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
    *z3 = _mm256_castps128_ps256(t1);
    *x4 = _mm256_castps128_ps256(t2);
    *y4 = _mm256_castps128_ps256(t3);
    *z4 = _mm256_castps128_ps256(t4);
}
static gmx_inline void
gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                     const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
{
    __m256  t1,t2,t3,t4,t5,t6,t7,t8;
    __m128i mask = _mm_set_epi32(0,-1,-1,-1);

    t1  = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask));  /*  - zE yE xE |  - zA yA xA */
    t2  = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask));  /*  - zF yF xF |  - zB yB xB */
    t3  = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask));  /*  - zG yG xG |  - zC yC xC */
    t4  = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask));  /*  - zH yH xH |  - zD yD xD */

    t5  = _mm256_unpacklo_ps(t1,t2);   /* yF yE xF xE | yB yA xB xA */
    t6  = _mm256_unpacklo_ps(t3,t4);   /* yH yG xH xG | yD yC xD xC */
    t7  = _mm256_unpackhi_ps(t1,t2);   /*  -  - zF zE |  -  - zB zA */
    t8  = _mm256_unpackhi_ps(t3,t4);   /*  -  - zH zG |  -  - zD zC */

    *x1 = _mm256_shuffle_ps(t5,t6,_MM_SHUFFLE(1,0,1,0));
    *y1 = _mm256_shuffle_ps(t5,t6,_MM_SHUFFLE(3,2,3,2));
    *z1 = _mm256_shuffle_ps(t7,t8,_MM_SHUFFLE(1,0,1,0));
}
static gmx_inline void
gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                     const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3)
{
    __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;

    t1   = _mm256_loadu_ps(ptrA);                            /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t2   = _mm256_loadu_ps(ptrB);                            /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t3   = _mm256_loadu_ps(ptrC);                            /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t4   = _mm256_loadu_ps(ptrD);                            /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t5   = _mm256_loadu_ps(ptrE);                            /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t6   = _mm256_loadu_ps(ptrF);                            /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t7   = _mm256_loadu_ps(ptrG);                            /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t8   = _mm256_loadu_ps(ptrH);                            /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    t9   = _mm256_unpacklo_ps(t1,t2);                        /* z2b z2a y2b y2a | y1b y1a x1b x1a */
    t10  = _mm256_unpackhi_ps(t1,t2);                        /* y3b y3a x3b x3a | x2b x2a z1b z1a */
    t11  = _mm256_unpacklo_ps(t3,t4);                        /* z2d z2c y2d y2c | y1d y1c x1d x1c */
    t12  = _mm256_unpackhi_ps(t3,t4);                        /* y3d y3c x3d x3c | x2d x2c z1d z1c */
    t1   = _mm256_unpacklo_ps(t5,t6);                        /* z2f z2e y2f y2e | y1f y1e x1f x1e */
    t2   = _mm256_unpackhi_ps(t5,t6);                        /* y3f y3e x3f x3e | x2f x2e z1f z1e */
    t3   = _mm256_unpacklo_ps(t7,t8);                        /* z2h z2g y2h y2g | y1h y1g x1h x1g */
    t4   = _mm256_unpackhi_ps(t7,t8);                        /* y3h y3g x3h x3g | x2h x2g z1h z1g */

    t5   = _mm256_shuffle_ps(t9,t11,_MM_SHUFFLE(1,0,1,0));   /* y2d y2c y2b y2a | x1d x1c x1b x1a */
    t6   = _mm256_shuffle_ps(t9,t11,_MM_SHUFFLE(3,2,3,2));   /* z2d z2c z2b z2a | y1d y1c y1b y1a */
    t7   = _mm256_shuffle_ps(t10,t12,_MM_SHUFFLE(1,0,1,0));  /* x3d x3c x3b x3a | z1d z1c z1b z1a */
    t8   = _mm256_shuffle_ps(t10,t12,_MM_SHUFFLE(3,2,3,2));  /* y3d y3c y3b y3a | x2d x2c x2b x2a */

    t9   = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));    /* y2h y2g y2f y2e | x1h x1g x1f x1e */
    t10  = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));    /* z2h z2g z2f z2e | y1h y1g y1f y1e */
    t11  = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(1,0,1,0));    /* x3h x3g x3f x3e | z1h z1g z1f z1e */
    t12  = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(3,2,3,2));    /* y3h y3g y3f y3e | x2h x2g x2f x2e */

    *x1  = _mm256_permute2f128_ps(t5, t9,  0x20);
    *y1  = _mm256_permute2f128_ps(t6, t10, 0x20);
    *z1  = _mm256_permute2f128_ps(t7, t11, 0x20);
    *x2  = _mm256_permute2f128_ps(t8, t12, 0x20);

    *y2  = _mm256_permute2f128_ps(t5, t9,  0x31);
    *z2  = _mm256_permute2f128_ps(t6, t10, 0x31);
    *x3  = _mm256_permute2f128_ps(t7, t11, 0x31);
    *y3  = _mm256_permute2f128_ps(t8, t12, 0x31);

    t1   = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));
    t2   = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));
    t3   = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));
    t4   = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));

    t1   = _mm256_unpacklo_ps(t1,t3);  /*  -  - z3g z3e |  -  - z3c z3a */
    t2   = _mm256_unpacklo_ps(t2,t4);  /*  -  - z3h z3f |  -  - z3d z3b */

    *z3  = _mm256_unpacklo_ps(t1,t2);
}
static gmx_inline void
gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                     const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                     const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
                                     const float * gmx_restrict ptrG, const float * gmx_restrict ptrH,
                                     __m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
                                     __m256 * gmx_restrict x2, __m256 * gmx_restrict y2, __m256 * gmx_restrict z2,
                                     __m256 * gmx_restrict x3, __m256 * gmx_restrict y3, __m256 * gmx_restrict z3,
                                     __m256 * gmx_restrict x4, __m256 * gmx_restrict y4, __m256 * gmx_restrict z4)
{
    __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;

    t1   = _mm256_loadu_ps(ptrA);                            /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t2   = _mm256_loadu_ps(ptrB);                            /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t3   = _mm256_loadu_ps(ptrC);                            /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t4   = _mm256_loadu_ps(ptrD);                            /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t5   = _mm256_loadu_ps(ptrE);                            /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t6   = _mm256_loadu_ps(ptrF);                            /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t7   = _mm256_loadu_ps(ptrG);                            /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t8   = _mm256_loadu_ps(ptrH);                            /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    t9   = _mm256_unpacklo_ps(t1,t2);                        /* z2b z2a y2b y2a | y1b y1a x1b x1a */
    t10  = _mm256_unpackhi_ps(t1,t2);                        /* y3b y3a x3b x3a | x2b x2a z1b z1a */
    t11  = _mm256_unpacklo_ps(t3,t4);                        /* z2d z2c y2d y2c | y1d y1c x1d x1c */
    t12  = _mm256_unpackhi_ps(t3,t4);                        /* y3d y3c x3d x3c | x2d x2c z1d z1c */
    t1   = _mm256_unpacklo_ps(t5,t6);                        /* z2f z2e y2f y2e | y1f y1e x1f x1e */
    t2   = _mm256_unpackhi_ps(t5,t6);                        /* y3f y3e x3f x3e | x2f x2e z1f z1e */
    t3   = _mm256_unpacklo_ps(t7,t8);                        /* z2h z2g y2h y2g | y1h y1g x1h x1g */
    t4   = _mm256_unpackhi_ps(t7,t8);                        /* y3h y3g x3h x3g | x2h x2g z1h z1g */

    t5   = _mm256_shuffle_ps(t9,t11,_MM_SHUFFLE(1,0,1,0));   /* y2d y2c y2b y2a | x1d x1c x1b x1a */
    t6   = _mm256_shuffle_ps(t9,t11,_MM_SHUFFLE(3,2,3,2));   /* z2d z2c z2b z2a | y1d y1c y1b y1a */
    t7   = _mm256_shuffle_ps(t10,t12,_MM_SHUFFLE(1,0,1,0));  /* x3d x3c x3b x3a | z1d z1c z1b z1a */
    t8   = _mm256_shuffle_ps(t10,t12,_MM_SHUFFLE(3,2,3,2));  /* y3d y3c y3b y3a | x2d x2c x2b x2a */
    t9   = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));    /* y2h y2g y2f y2e | x1h x1g x1f x1e */
    t10  = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));    /* z2h z2g z2f z2e | y1h y1g y1f y1e */
    t11  = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(1,0,1,0));    /* x3h x3g x3f x3e | z1h z1g z1f z1e */
    t12  = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(3,2,3,2));    /* y3h y3g y3f y3e | x2h x2g x2f x2e */

    *x1  = _mm256_permute2f128_ps(t5, t9,  0x20);
    *y1  = _mm256_permute2f128_ps(t6, t10, 0x20);
    *z1  = _mm256_permute2f128_ps(t7, t11, 0x20);
    *x2  = _mm256_permute2f128_ps(t8, t12, 0x20);

    *y2  = _mm256_permute2f128_ps(t5, t9,  0x31);
    *z2  = _mm256_permute2f128_ps(t6, t10, 0x31);
    *x3  = _mm256_permute2f128_ps(t7, t11, 0x31);
    *y3  = _mm256_permute2f128_ps(t8, t12, 0x31);

    t1   = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8),_mm_loadu_ps(ptrA+8));  /* z4e y4e x4e z3e | z4a y4a x4a z3a */
    t2   = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));  /* z4f y4f x4f z3f | z4b y4b x4b z3b */
    t3   = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));  /* z4g y4g x4g z3g | z4c y4c x4c z3c */
    t4   = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));  /* z4h y4h x4h z3h | z4d y4d x4d z3d */

    t5   = _mm256_unpacklo_ps(t1,t2);                        /* x4f x4e z3f z3e | x4b x4a z3b z3a */
    t6   = _mm256_unpackhi_ps(t1,t2);                        /* z4f z4e y4f y4e | z4b z4a y4b y4a */
    t7   = _mm256_unpacklo_ps(t3,t4);                        /* x4h x4g z3h z3g | x4d x4c z3d z3c */
    t8   = _mm256_unpackhi_ps(t3,t4);                        /* z4h z4g y4h y4g | z4d z4c y4d y4c */

    *z3  = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0));    /* z3h z3g z3f z3e | z3d z3c z3b z3a */
    *x4  = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2));    /* x4h x4g x4f x4e | x4d x4c x4b x4a */
    *y4  = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(1,0,1,0));    /* y4h y4g y4f y4e | y4d y4c y4b y4a */
    *z4  = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(3,2,3,2));    /* z4h z4g z4f z4e | z4d z4c z4b z4a */
}
static gmx_inline void
gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          __m256 x1, __m256 y1, __m256 z1)
{
    __m128  t1,t2,t3,t4,t5,t6,t7,t8;
    __m128i mask;

    /* Construct a mask without executing any data loads */
    mask = _mm_blend_epi16(_mm_setzero_si128(),_mm_cmpeq_epi16(_mm_setzero_si128(),_mm_setzero_si128()),0x3F);
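    /* Note (added for clarity): _mm_cmpeq_epi16 of zero with itself yields
     * all-ones, and blending its low six 16-bit words (0x3F) over zero gives
     * the dword mask { -1, -1, -1, 0 }, so the masked loads/stores below only
     * touch the x/y/z components and never the fourth float at each pointer.
     */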
    t3 = _mm_unpacklo_ps(_mm256_castps256_ps128(x1),_mm256_castps256_ps128(y1));   /* y1b x1b y1a x1a */
    t4 = _mm_unpackhi_ps(_mm256_castps256_ps128(x1),_mm256_castps256_ps128(y1));   /* y1d x1d y1c x1c */

    t1 = _mm_shuffle_ps(t3,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,0,1,0));       /*  - z1a y1a x1a */
    t2 = _mm_shuffle_ps(t3,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,1,3,2));       /*  - z1b y1b x1b */
    t3 = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,2,1,0));       /*  - z1c y1c x1c */
    t4 = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,3,3,2));       /*  - z1d y1d x1d */

    t5 = gmx_mm_maskload_ps(ptrA,mask);
    t6 = gmx_mm_maskload_ps(ptrB,mask);
    t7 = gmx_mm_maskload_ps(ptrC,mask);
    t8 = gmx_mm_maskload_ps(ptrD,mask);

    t5 = _mm_sub_ps(t5,t1);
    t6 = _mm_sub_ps(t6,t2);
    t7 = _mm_sub_ps(t7,t3);
    t8 = _mm_sub_ps(t8,t4);

    gmx_mm_maskstore_ps(ptrA,mask,t5);
    gmx_mm_maskstore_ps(ptrB,mask,t6);
    gmx_mm_maskstore_ps(ptrC,mask,t7);
    gmx_mm_maskstore_ps(ptrD,mask,t8);
}
static gmx_inline void
gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          __m256 x1, __m256 y1, __m256 z1,
                                          __m256 x2, __m256 y2, __m256 z2,
                                          __m256 x3, __m256 y3, __m256 z3)
{
    __m256 t1,t2,t3,t4,t5,t6;
    __m128 tA,tB,tC,tD;

    t1 = _mm256_loadu_ps(ptrA);
    t2 = _mm256_loadu_ps(ptrB);
    t3 = _mm256_loadu_ps(ptrC);
    t4 = _mm256_loadu_ps(ptrD);
    tA = _mm_load_ss(ptrA+8);
    tB = _mm_load_ss(ptrB+8);
    tC = _mm_load_ss(ptrC+8);
    tD = _mm_load_ss(ptrD+8);

    t5 = _mm256_unpacklo_ps(x1,y1);                                  /*  -  -  -  - | y1b x1b y1a x1a */
    x1 = _mm256_unpackhi_ps(x1,y1);                                  /*  -  -  -  - | y1d x1d y1c x1c */
    y1 = _mm256_unpacklo_ps(z1,x2);                                  /*  -  -  -  - | x2b z1b x2a z1a */
    z1 = _mm256_unpackhi_ps(z1,x2);                                  /*  -  -  -  - | x2d z1d x2c z1c */

    x2 = _mm256_unpacklo_ps(y2,z2);                                  /*  -  -  -  - | z2b y2b z2a y2a */
    y2 = _mm256_unpackhi_ps(y2,z2);                                  /*  -  -  -  - | z2d y2d z2c y2c */
    t6 = _mm256_unpacklo_ps(x3,y3);                                  /*  -  -  -  - | y3b x3b y3a x3a */
    x3 = _mm256_unpackhi_ps(x3,y3);                                  /*  -  -  -  - | y3d x3d y3c x3c */

    t5 = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1);  /* z2b y2b z2a y2a | y1b x1b y1a x1a */
    x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);  /* z2d y2d z2c y2c | y1d x1d y1c x1c */

    y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(t6), 0x1);  /* y3b x3b y3a x3a | x2b z1b x2a z1a */
    z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);  /* y3d x3d y3c x3c | x2d z1d x2c z1c */

    z2 = _mm256_shuffle_ps(t5,y1,_MM_SHUFFLE(1,0,1,0));              /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t5 = _mm256_shuffle_ps(t5,y1,_MM_SHUFFLE(3,2,3,2));              /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    y1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0));              /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    x1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2));              /* y3d x3d z2d y2d | x2d z1d y1d x1d */

    t1 = _mm256_sub_ps(t1,z2);
    t2 = _mm256_sub_ps(t2,t5);
    t3 = _mm256_sub_ps(t3,y1);
    t4 = _mm256_sub_ps(t4,x1);

    tA = _mm_sub_ss(tA, _mm256_castps256_ps128(z3));
    tB = _mm_sub_ss(tB, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(1,1,1,1)));
    tC = _mm_sub_ss(tC, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(2,2,2,2)));
    tD = _mm_sub_ss(tD, _mm_permute_ps(_mm256_castps256_ps128(z3),_MM_SHUFFLE(3,3,3,3)));

    /* Here we store a full 256-bit value and a separate 32-bit one; no overlap can happen */
    _mm256_storeu_ps(ptrA,t1);
    _mm256_storeu_ps(ptrB,t2);
    _mm256_storeu_ps(ptrC,t3);
    _mm256_storeu_ps(ptrD,t4);
    _mm_store_ss(ptrA+8,tA);
    _mm_store_ss(ptrB+8,tB);
    _mm_store_ss(ptrC+8,tC);
    _mm_store_ss(ptrD+8,tD);
}
static gmx_inline void
gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          __m256 x1, __m256 y1, __m256 z1,
                                          __m256 x2, __m256 y2, __m256 z2,
                                          __m256 x3, __m256 y3, __m256 z3,
                                          __m256 x4, __m256 y4, __m256 z4)
{
    __m256 t1,t2,t3,t4,t5;
    __m128 tA,tB,tC,tD,tE,tF,tG,tH;

    t1 = _mm256_loadu_ps(ptrA);
    t2 = _mm256_loadu_ps(ptrB);
    t3 = _mm256_loadu_ps(ptrC);
    t4 = _mm256_loadu_ps(ptrD);
    tA = _mm_loadu_ps(ptrA+8);
    tB = _mm_loadu_ps(ptrB+8);
    tC = _mm_loadu_ps(ptrC+8);
    tD = _mm_loadu_ps(ptrD+8);

    t5 = _mm256_unpacklo_ps(x1,y1);                                  /*  -  -  -  - | y1b x1b y1a x1a */
    x1 = _mm256_unpackhi_ps(x1,y1);                                  /*  -  -  -  - | y1d x1d y1c x1c */
    y1 = _mm256_unpacklo_ps(z1,x2);                                  /*  -  -  -  - | x2b z1b x2a z1a */
    z1 = _mm256_unpackhi_ps(z1,x2);                                  /*  -  -  -  - | x2d z1d x2c z1c */

    x2 = _mm256_unpacklo_ps(y2,z2);                                  /*  -  -  -  - | z2b y2b z2a y2a */
    y2 = _mm256_unpackhi_ps(y2,z2);                                  /*  -  -  -  - | z2d y2d z2c y2c */
    z2 = _mm256_unpacklo_ps(x3,y3);                                  /*  -  -  -  - | y3b x3b y3a x3a */
    x3 = _mm256_unpackhi_ps(x3,y3);                                  /*  -  -  -  - | y3d x3d y3c x3c */

    y3 = _mm256_unpacklo_ps(z3,x4);                                  /*  -  -  -  - | x4b z3b x4a z3a */
    z3 = _mm256_unpackhi_ps(z3,x4);                                  /*  -  -  -  - | x4d z3d x4c z3c */
    x4 = _mm256_unpacklo_ps(y4,z4);                                  /*  -  -  -  - | z4b y4b z4a y4a */
    y4 = _mm256_unpackhi_ps(y4,z4);                                  /*  -  -  -  - | z4d y4d z4c y4c */

    x2 = _mm256_insertf128_ps(t5, _mm256_castps256_ps128(x2), 0x1);  /* z2b y2b z2a y2a | y1b x1b y1a x1a */
    x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1);  /* z2d y2d z2c y2c | y1d x1d y1c x1c */
    y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1);  /* y3b x3b y3a x3a | x2b z1b x2a z1a */
    z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1);  /* y3d x3d y3c x3c | x2d z1d x2c z1c */

    z2 = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(1,0,1,0));              /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t5 = _mm256_shuffle_ps(x2,y1,_MM_SHUFFLE(3,2,3,2));              /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    y1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(1,0,1,0));              /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    x1 = _mm256_shuffle_ps(x1,z1,_MM_SHUFFLE(3,2,3,2));              /* y3d x3d z2d y2d | x2d z1d y1d x1d */

    tE = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(1,0,1,0));  /* z4a y4a x4a z3a */
    tF = _mm_shuffle_ps(_mm256_castps256_ps128(y3),_mm256_castps256_ps128(x4),_MM_SHUFFLE(3,2,3,2));  /* z4b y4b x4b z3b */

    tG = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(1,0,1,0));  /* z4c y4c x4c z3c */
    tH = _mm_shuffle_ps(_mm256_castps256_ps128(z3),_mm256_castps256_ps128(y4),_MM_SHUFFLE(3,2,3,2));  /* z4d y4d x4d z3d */

    t1 = _mm256_sub_ps(t1,z2);
    t2 = _mm256_sub_ps(t2,t5);
    t3 = _mm256_sub_ps(t3,y1);
    t4 = _mm256_sub_ps(t4,x1);

    tA = _mm_sub_ps(tA,tE);
    tB = _mm_sub_ps(tB,tF);
    tC = _mm_sub_ps(tC,tG);
    tD = _mm_sub_ps(tD,tH);

    /* Here we store a full 256-bit value and a separate 128-bit one; no overlap can happen */
    _mm256_storeu_ps(ptrA,t1);
    _mm256_storeu_ps(ptrB,t2);
    _mm256_storeu_ps(ptrC,t3);
    _mm256_storeu_ps(ptrD,t4);
    _mm_storeu_ps(ptrA+8,tA);
    _mm_storeu_ps(ptrB+8,tB);
    _mm_storeu_ps(ptrC+8,tC);
    _mm_storeu_ps(ptrD+8,tD);
}
static gmx_inline void
gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
                                          __m256 x1, __m256 y1, __m256 z1)
{
    __m256  t1,t2,t3,t4,t5,t6;
    __m256  tA,tB,tC,tD;
    __m128i mask;

    /* Construct a mask without executing any data loads */
    mask = _mm_blend_epi16(_mm_setzero_si128(),_mm_cmpeq_epi16(_mm_setzero_si128(),_mm_setzero_si128()),0x3F);

    tA = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask));
    tB = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask));
    tC = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask));
    tD = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask));
    t1 = _mm256_unpacklo_ps(x1,y1);                       /* y1f x1f y1e x1e | y1b x1b y1a x1a */
    t2 = _mm256_unpackhi_ps(x1,y1);                       /* y1h x1h y1g x1g | y1d x1d y1c x1c */

    t3 = _mm256_shuffle_ps(t1,z1,_MM_SHUFFLE(0,0,1,0));   /*  - z1e y1e x1e |  - z1a y1a x1a */
    t4 = _mm256_shuffle_ps(t1,z1,_MM_SHUFFLE(0,1,3,2));   /*  - z1f y1f x1f |  - z1b y1b x1b */
    t5 = _mm256_shuffle_ps(t2,z1,_MM_SHUFFLE(0,2,1,0));   /*  - z1g y1g x1g |  - z1c y1c x1c */
    t6 = _mm256_shuffle_ps(t2,z1,_MM_SHUFFLE(0,3,3,2));   /*  - z1h y1h x1h |  - z1d y1d x1d */

    tA = _mm256_sub_ps(tA,t3);
    tB = _mm256_sub_ps(tB,t4);
    tC = _mm256_sub_ps(tC,t5);
    tD = _mm256_sub_ps(tD,t6);

    gmx_mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
    gmx_mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
    gmx_mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
    gmx_mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
    gmx_mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
    gmx_mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
    gmx_mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
    gmx_mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
}
static gmx_inline void
gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
                                          __m256 x1, __m256 y1, __m256 z1,
                                          __m256 x2, __m256 y2, __m256 z2,
                                          __m256 x3, __m256 y3, __m256 z3)
{
    __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
    __m256 tA,tB,tC,tD,tE,tF,tG,tH;
    __m256 tI,tJ,tK,tL;

    tA = _mm256_loadu_ps(ptrA);
    tB = _mm256_loadu_ps(ptrB);
    tC = _mm256_loadu_ps(ptrC);
    tD = _mm256_loadu_ps(ptrD);
    tE = _mm256_loadu_ps(ptrE);
    tF = _mm256_loadu_ps(ptrF);
    tG = _mm256_loadu_ps(ptrG);
    tH = _mm256_loadu_ps(ptrH);

    t1 = _mm256_unpacklo_ps(x1,y1);                       /* y1f x1f y1e x1e | y1b x1b y1a x1a */
    t2 = _mm256_unpackhi_ps(x1,y1);                       /* y1h x1h y1g x1g | y1d x1d y1c x1c */
    t3 = _mm256_unpacklo_ps(z1,x2);                       /* x2f z1f x2e z1e | x2b z1b x2a z1a */
    t4 = _mm256_unpackhi_ps(z1,x2);                       /* x2h z1h x2g z1g | x2d z1d x2c z1c */

    t5 = _mm256_unpacklo_ps(y2,z2);                       /* z2f y2f z2e y2e | z2b y2b z2a y2a */
    t6 = _mm256_unpackhi_ps(y2,z2);                       /* z2h y2h z2g y2g | z2d y2d z2c y2c */
    t7 = _mm256_unpacklo_ps(x3,y3);                       /* y3f x3f y3e x3e | y3b x3b y3a x3a */
    t8 = _mm256_unpackhi_ps(x3,y3);                       /* y3h x3h y3g x3g | y3d x3d y3c x3c */

    t9   = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
    t10  = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
    t11  = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(1,0,1,0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
    t12  = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(3,2,3,2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */

    t1   = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
    t2   = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
    t3   = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(1,0,1,0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
    t4   = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(3,2,3,2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */

    t5   = gmx_mm256_unpack128lo_ps(t9,t1);               /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t6   = gmx_mm256_unpack128hi_ps(t9,t1);               /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t7   = gmx_mm256_unpack128lo_ps(t10,t2);              /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t8   = gmx_mm256_unpack128hi_ps(t10,t2);              /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t1   = gmx_mm256_unpack128lo_ps(t11,t3);              /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t2   = gmx_mm256_unpack128hi_ps(t11,t3);              /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t9   = gmx_mm256_unpack128lo_ps(t12,t4);              /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t10  = gmx_mm256_unpack128hi_ps(t12,t4);              /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    tA = _mm256_sub_ps(tA,t5);
    tB = _mm256_sub_ps(tB,t7);
    tC = _mm256_sub_ps(tC,t1);
    tD = _mm256_sub_ps(tD,t9);
    tE = _mm256_sub_ps(tE,t6);
    tF = _mm256_sub_ps(tF,t8);
    tG = _mm256_sub_ps(tG,t2);
    tH = _mm256_sub_ps(tH,t10);

    _mm256_storeu_ps(ptrA,tA);
    _mm256_storeu_ps(ptrB,tB);
    _mm256_storeu_ps(ptrC,tC);
    _mm256_storeu_ps(ptrD,tD);
    _mm256_storeu_ps(ptrE,tE);
    _mm256_storeu_ps(ptrF,tF);
    _mm256_storeu_ps(ptrG,tG);
    _mm256_storeu_ps(ptrH,tH);

    tI = gmx_mm256_set_m128(_mm_load_ss(ptrE+8),_mm_load_ss(ptrA+8));
    tJ = gmx_mm256_set_m128(_mm_load_ss(ptrF+8),_mm_load_ss(ptrB+8));
    tK = gmx_mm256_set_m128(_mm_load_ss(ptrG+8),_mm_load_ss(ptrC+8));
    tL = gmx_mm256_set_m128(_mm_load_ss(ptrH+8),_mm_load_ss(ptrD+8));

    tI = _mm256_unpacklo_ps(tI,tK);  /*  -  - zG zE |  -  - zC zA */
    tJ = _mm256_unpacklo_ps(tJ,tL);  /*  -  - zH zF |  -  - zD zB */
    tI = _mm256_unpacklo_ps(tI,tJ);  /* zH zG zF zE | zD zC zB zA */

    tI = _mm256_sub_ps(tI,z3);
    tJ = _mm256_permute_ps(tI,_MM_SHUFFLE(1,1,1,1));
    tK = _mm256_permute_ps(tI,_MM_SHUFFLE(2,2,2,2));
    tL = _mm256_permute_ps(tI,_MM_SHUFFLE(3,3,3,3));

    _mm_store_ss(ptrA+8,_mm256_castps256_ps128(tI));
    _mm_store_ss(ptrB+8,_mm256_castps256_ps128(tJ));
    _mm_store_ss(ptrC+8,_mm256_castps256_ps128(tK));
    _mm_store_ss(ptrD+8,_mm256_castps256_ps128(tL));
    _mm_store_ss(ptrE+8,_mm256_extractf128_ps(tI,0x1));
    _mm_store_ss(ptrF+8,_mm256_extractf128_ps(tJ,0x1));
    _mm_store_ss(ptrG+8,_mm256_extractf128_ps(tK,0x1));
    _mm_store_ss(ptrH+8,_mm256_extractf128_ps(tL,0x1));
}
static gmx_inline void
gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                          float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                          float * gmx_restrict ptrE, float * gmx_restrict ptrF,
                                          float * gmx_restrict ptrG, float * gmx_restrict ptrH,
                                          __m256 x1, __m256 y1, __m256 z1,
                                          __m256 x2, __m256 y2, __m256 z2,
                                          __m256 x3, __m256 y3, __m256 z3,
                                          __m256 x4, __m256 y4, __m256 z4)
{
    __m256 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
    __m256 tA,tB,tC,tD,tE,tF,tG,tH;
    __m256 tI,tJ,tK,tL;

    tA = _mm256_loadu_ps(ptrA);
    tB = _mm256_loadu_ps(ptrB);
    tC = _mm256_loadu_ps(ptrC);
    tD = _mm256_loadu_ps(ptrD);
    tE = _mm256_loadu_ps(ptrE);
    tF = _mm256_loadu_ps(ptrF);
    tG = _mm256_loadu_ps(ptrG);
    tH = _mm256_loadu_ps(ptrH);

    t1 = _mm256_unpacklo_ps(x1,y1);                       /* y1f x1f y1e x1e | y1b x1b y1a x1a */
    t2 = _mm256_unpackhi_ps(x1,y1);                       /* y1h x1h y1g x1g | y1d x1d y1c x1c */
    t3 = _mm256_unpacklo_ps(z1,x2);                       /* x2f z1f x2e z1e | x2b z1b x2a z1a */
    t4 = _mm256_unpackhi_ps(z1,x2);                       /* x2h z1h x2g z1g | x2d z1d x2c z1c */

    t5 = _mm256_unpacklo_ps(y2,z2);                       /* z2f y2f z2e y2e | z2b y2b z2a y2a */
    t6 = _mm256_unpackhi_ps(y2,z2);                       /* z2h y2h z2g y2g | z2d y2d z2c y2c */
    t7 = _mm256_unpacklo_ps(x3,y3);                       /* y3f x3f y3e x3e | y3b x3b y3a x3a */
    t8 = _mm256_unpackhi_ps(x3,y3);                       /* y3h x3h y3g x3g | y3d x3d y3c x3c */

    t9   = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0)); /* x2e z1e y1e x1e | x2a z1a y1a x1a */
    t10  = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2)); /* x2f z1f y1f x1f | x2b z1b y1b x1b */
    t11  = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(1,0,1,0)); /* x2g z1g y1g x1g | x2c z1c y1c x1c */
    t12  = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(3,2,3,2)); /* x2h z1h y1h x1h | x2d z1d y1d x1d */

    t1   = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); /* y3e x3e z2e y2e | y3a x3a z2a y2a */
    t2   = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); /* y3f x3f z2f y2f | y3b x3b z2b y2b */
    t3   = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(1,0,1,0)); /* y3g x3g z2g y2g | y3c x3c z2c y2c */
    t4   = _mm256_shuffle_ps(t6,t8,_MM_SHUFFLE(3,2,3,2)); /* y3h x3h z2h y2h | y3d x3d z2d y2d */

    t5   = gmx_mm256_unpack128lo_ps(t9,t1);               /* y3a x3a z2a y2a | x2a z1a y1a x1a */
    t6   = gmx_mm256_unpack128hi_ps(t9,t1);               /* y3e x3e z2e y2e | x2e z1e y1e x1e */
    t7   = gmx_mm256_unpack128lo_ps(t10,t2);              /* y3b x3b z2b y2b | x2b z1b y1b x1b */
    t8   = gmx_mm256_unpack128hi_ps(t10,t2);              /* y3f x3f z2f y2f | x2f z1f y1f x1f */
    t1   = gmx_mm256_unpack128lo_ps(t11,t3);              /* y3c x3c z2c y2c | x2c z1c y1c x1c */
    t2   = gmx_mm256_unpack128hi_ps(t11,t3);              /* y3g x3g z2g y2g | x2g z1g y1g x1g */
    t9   = gmx_mm256_unpack128lo_ps(t12,t4);              /* y3d x3d z2d y2d | x2d z1d y1d x1d */
    t10  = gmx_mm256_unpack128hi_ps(t12,t4);              /* y3h x3h z2h y2h | x2h z1h y1h x1h */

    tA = _mm256_sub_ps(tA,t5);
    tB = _mm256_sub_ps(tB,t7);
    tC = _mm256_sub_ps(tC,t1);
    tD = _mm256_sub_ps(tD,t9);
    tE = _mm256_sub_ps(tE,t6);
    tF = _mm256_sub_ps(tF,t8);
    tG = _mm256_sub_ps(tG,t2);
    tH = _mm256_sub_ps(tH,t10);

    _mm256_storeu_ps(ptrA,tA);
    _mm256_storeu_ps(ptrB,tB);
    _mm256_storeu_ps(ptrC,tC);
    _mm256_storeu_ps(ptrD,tD);
    _mm256_storeu_ps(ptrE,tE);
    _mm256_storeu_ps(ptrF,tF);
    _mm256_storeu_ps(ptrG,tG);
    _mm256_storeu_ps(ptrH,tH);

    tI = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8),_mm_loadu_ps(ptrA+8));
    tJ = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8),_mm_loadu_ps(ptrB+8));
    tK = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8),_mm_loadu_ps(ptrC+8));
    tL = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8),_mm_loadu_ps(ptrD+8));

    t1 = _mm256_unpacklo_ps(z3,x4);                       /* x4f z3f x4e z3e | x4b z3b x4a z3a */
    t2 = _mm256_unpackhi_ps(z3,x4);                       /* x4h z3h x4g z3g | x4d z3d x4c z3c */
    t3 = _mm256_unpacklo_ps(y4,z4);                       /* z4f y4f z4e y4e | z4b y4b z4a y4a */
    t4 = _mm256_unpackhi_ps(y4,z4);                       /* z4h y4h z4g y4g | z4d y4d z4c y4c */

    t5 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));   /* z4e y4e x4e z3e | z4a y4a x4a z3a */
    t6 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));   /* z4f y4f x4f z3f | z4b y4b x4b z3b */
    t7 = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(1,0,1,0));   /* z4g y4g x4g z3g | z4c y4c x4c z3c */
    t8 = _mm256_shuffle_ps(t2,t4,_MM_SHUFFLE(3,2,3,2));   /* z4h y4h x4h z3h | z4d y4d x4d z3d */

    tI = _mm256_sub_ps(tI,t5);
    tJ = _mm256_sub_ps(tJ,t6);
    tK = _mm256_sub_ps(tK,t7);
    tL = _mm256_sub_ps(tL,t8);

    _mm_storeu_ps(ptrA+8,_mm256_castps256_ps128(tI));
    _mm_storeu_ps(ptrB+8,_mm256_castps256_ps128(tJ));
    _mm_storeu_ps(ptrC+8,_mm256_castps256_ps128(tK));
    _mm_storeu_ps(ptrD+8,_mm256_castps256_ps128(tL));
    _mm_storeu_ps(ptrE+8,_mm256_extractf128_ps(tI,0x1));
    _mm_storeu_ps(ptrF+8,_mm256_extractf128_ps(tJ,0x1));
    _mm_storeu_ps(ptrG+8,_mm256_extractf128_ps(tK,0x1));
    _mm_storeu_ps(ptrH+8,_mm256_extractf128_ps(tL,0x1));
}
static gmx_inline void
gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                         float * gmx_restrict fptr,
                                         float * gmx_restrict fshiftptr)
{
    __m128 t1,t2,t3;

    fix1 = _mm256_hadd_ps(fix1,fix1);
    fiy1 = _mm256_hadd_ps(fiy1,fiz1);
    fix1 = _mm256_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 (in both lanes) */

    /* Add across the two lanes */
    t1 = _mm_add_ps(_mm256_castps256_ps128(fix1),_mm256_extractf128_ps(fix1,0x1));

    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2,t1);
    t3 = _mm_add_ps(t3,t1);

    _mm_store_ss(fptr,t2);
    _mm_storeh_pi((__m64 *)(fptr+1),t2);
    _mm_store_ss(fshiftptr,t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
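/*
 * Note (added for clarity, not from the original source): the i-force update
 * routines below each reduce eight partial force contributions per component.
 * Successive _mm256_hadd_ps calls interleave the components of the different
 * i-atoms so that, after one final addition of the two 128-bit lanes, a single
 * register holds the reduced forces in the same order they are laid out in
 * memory (x1 y1 z1 x2 y2 z2 ...), which allows the force array to be updated
 * with a couple of wide loads and stores instead of many scalar ones.
 */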
static gmx_inline void
gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
                                         float * gmx_restrict fptr,
                                         float * gmx_restrict fshiftptr)
{
    __m256 t1,t2,t3;
    __m128 tA,tB,tC;

    fix1 = _mm256_hadd_ps(fix1,fiy1);                     /* Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
    fiz1 = _mm256_hadd_ps(fiz1,fix2);                     /* X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
    fiy2 = _mm256_hadd_ps(fiy2,fiz2);                     /* Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
    fix3 = _mm256_hadd_ps(fix3,fiy3);                     /* Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
    fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());      /*       0       0 Z3g+Z3h Z3e+Z3f |       0       0 Z3c+Z3d Z3a+Z3b */

    fix1 = _mm256_hadd_ps(fix1,fiz1);                     /* X2e-h Z1e-h Y1e-h X1e-h | X2a-d Z1a-d Y1a-d X1a-d */
    fiy2 = _mm256_hadd_ps(fiy2,fix3);                     /* Y3e-h X3e-h Z2e-h Y2e-h | Y3a-d X3a-d Z2a-d Y2a-d */
    fiz3 = _mm256_hadd_ps(fiz3,_mm256_setzero_ps());      /*     0     0     0 Z3e-h |     0     0     0 Z3a-d */

    /* Add across the two lanes by swapping and adding back */
    t1   = gmx_mm256_unpack128lo_ps(fix1,fiy2);           /* Y3a-d X3a-d Z2a-d Y2a-d | X2a-d Z1a-d Y1a-d X1a-d */
    t2   = gmx_mm256_unpack128hi_ps(fix1,fiy2);           /* Y3e-h X3e-h Z2e-h Y2e-h | X2e-h Z1e-h Y1e-h X1e-h */
    t1   = _mm256_add_ps(t1,t2);                          /* y3 x3 z2 y2 | x2 z1 y1 x1 */

    tA   = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));  /* 0 0 0 z3 */

    t3   = _mm256_loadu_ps(fptr);
    t3   = _mm256_add_ps(t3,t1);
    _mm256_storeu_ps(fptr,t3);
    tB   = _mm_load_ss(fptr+8);
    tB   = _mm_add_ss(tB,tA);
    _mm_store_ss(fptr+8,tB);

    /* Add up shift force */
    tB   = _mm256_extractf128_ps(t1,0x1);                                  /* y3 x3 z2 y2 */
    tC   = _mm_shuffle_ps(_mm256_castps256_ps128(t1),tB,_MM_SHUFFLE(1,0,3,3));  /* z2 y2 x2 x2 */
    tB   = _mm_shuffle_ps(tB,tA,_MM_SHUFFLE(1,0,3,2));                     /* 0 z3 y3 x3 */
    tC   = _mm_permute_ps(tC,_MM_SHUFFLE(3,3,2,0));                        /*  - z2 y2 x2 */

    tB   = _mm_add_ps(tB,_mm256_castps256_ps128(t1));
    tA   = _mm_add_ps(tB,tC);                                              /*  - z y x */

    tA   = _mm_blend_ps(_mm_setzero_ps(),tA,0x7);                          /* 0 z y x */

    tC   = _mm_loadu_ps(fshiftptr);
    tC   = _mm_add_ps(tC,tA);
    _mm_storeu_ps(fshiftptr,tC);
}
static gmx_inline void
gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
                                         __m256 fix2, __m256 fiy2, __m256 fiz2,
                                         __m256 fix3, __m256 fiy3, __m256 fiz3,
                                         __m256 fix4, __m256 fiy4, __m256 fiz4,
                                         float * gmx_restrict fptr,
                                         float * gmx_restrict fshiftptr)
{
    __m256 t1,t2,t3;
    __m128 tA,tB,tC;

    fix1 = _mm256_hadd_ps(fix1,fiy1);                     /* Y1g+Y1h Y1e+Y1f X1g+X1h X1e+X1f | Y1c+Y1d Y1a+Y1b X1c+X1d X1a+X1b */
    fiz1 = _mm256_hadd_ps(fiz1,fix2);                     /* X2g+X2h X2e+X2f Z1g+Z1h Z1e+Z1f | X2c+X2d X2a+X2b Z1c+Z1d Z1a+Z1b */
    fiy2 = _mm256_hadd_ps(fiy2,fiz2);                     /* Z2g+Z2h Z2e+Z2f Y2g+Y2h Y2e+Y2f | Z2c+Z2d Z2a+Z2b Y2c+Y2d Y2a+Y2b */
    fix3 = _mm256_hadd_ps(fix3,fiy3);                     /* Y3g+Y3h Y3e+Y3f X3g+X3h X3e+X3f | Y3c+Y3d Y3a+Y3b X3c+X3d X3a+X3b */
    fiz3 = _mm256_hadd_ps(fiz3,fix4);                     /* X4g+X4h X4e+X4f Z3g+Z3h Z3e+Z3f | X4c+X4d X4a+X4b Z3c+Z3d Z3a+Z3b */
    fiy4 = _mm256_hadd_ps(fiy4,fiz4);                     /* Z4g+Z4h Z4e+Z4f Y4g+Y4h Y4e+Y4f | Z4c+Z4d Z4a+Z4b Y4c+Y4d Y4a+Y4b */

    fix1 = _mm256_hadd_ps(fix1,fiz1);                     /* X2e-h Z1e-h Y1e-h X1e-h | X2a-d Z1a-d Y1a-d X1a-d */
    fiy2 = _mm256_hadd_ps(fiy2,fix3);                     /* Y3e-h X3e-h Z2e-h Y2e-h | Y3a-d X3a-d Z2a-d Y2a-d */
    fiz3 = _mm256_hadd_ps(fiz3,fiy4);                     /* Z4e-h Y4e-h X4e-h Z3e-h | Z4a-d Y4a-d X4a-d Z3a-d */

    /* Add across the two lanes by swapping and adding back */
    t1   = gmx_mm256_unpack128lo_ps(fix1,fiy2);           /* Y3a-d X3a-d Z2a-d Y2a-d | X2a-d Z1a-d Y1a-d X1a-d */
    t2   = gmx_mm256_unpack128hi_ps(fix1,fiy2);           /* Y3e-h X3e-h Z2e-h Y2e-h | X2e-h Z1e-h Y1e-h X1e-h */
    t1   = _mm256_add_ps(t1,t2);                          /* y3 x3 z2 y2 | x2 z1 y1 x1 */

    tA   = _mm_add_ps(_mm256_castps256_ps128(fiz3),_mm256_extractf128_ps(fiz3,0x1));  /* z4 y4 x4 z3 */

    t3   = _mm256_loadu_ps(fptr);
    t3   = _mm256_add_ps(t3,t1);
    _mm256_storeu_ps(fptr,t3);

    tB   = _mm_loadu_ps(fptr+8);
    tB   = _mm_add_ps(tB,tA);
    _mm_storeu_ps(fptr+8,tB);

    /* Add up shift force */
    tB   = _mm256_extractf128_ps(t1,0x1);                                  /* y3 x3 z2 y2 */
    tC   = _mm_shuffle_ps(_mm256_castps256_ps128(t1),tB,_MM_SHUFFLE(1,0,3,3));  /* z2 y2 x2 x2 */
    tB   = _mm_shuffle_ps(tB,tA,_MM_SHUFFLE(1,0,3,2));                     /* 0 z3 y3 x3 */
    tC   = _mm_permute_ps(tC,_MM_SHUFFLE(3,3,2,0));                        /*  - z2 y2 x2 */
    tA   = _mm_permute_ps(tA,_MM_SHUFFLE(0,3,2,1));                        /*  - z4 y4 x4 */

    tB   = _mm_add_ps(tB,_mm256_castps256_ps128(t1));
    tA   = _mm_add_ps(tA,tC);
    tA   = _mm_add_ps(tA,tB);

    tA   = _mm_blend_ps(_mm_setzero_ps(),tA,0x7);                          /* 0 z y x */

    tC   = _mm_loadu_ps(fshiftptr);
    tC   = _mm_add_ps(tC,tA);
    _mm_storeu_ps(fshiftptr,tC);
}
static gmx_inline void
gmx_mm256_update_1pot_ps(__m256 pot1, float * gmx_restrict ptrA)
{
    __m128 t1;

    pot1 = _mm256_hadd_ps(pot1,pot1);
    pot1 = _mm256_hadd_ps(pot1,pot1);

    t1 = _mm_add_ps(_mm256_castps256_ps128(pot1),_mm256_extractf128_ps(pot1,0x1));

    _mm_store_ss(ptrA,_mm_add_ss(_mm_load_ss(ptrA),t1));
}
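/*
 * Usage sketch (illustrative only; the variable names are hypothetical): after
 * the inner loop has accumulated eight partial energies in a ymm register, the
 * total is folded into a scalar accumulator with a single call:
 *
 *     float  vvdwtot = 0.0f;
 *     __m256 vvdwsum = _mm256_setzero_ps();
 *     ...    accumulate per-pair energies into vvdwsum ...
 *     gmx_mm256_update_1pot_ps(vvdwsum, &vvdwtot);
 */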
static gmx_inline void
gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
                         __m256 pot2, float * gmx_restrict ptrB)
{
    __m128 t1,t2;

    pot1 = _mm256_hadd_ps(pot1,pot2);
    pot1 = _mm256_hadd_ps(pot1,pot1);

    t1 = _mm_add_ps(_mm256_castps256_ps128(pot1),_mm256_extractf128_ps(pot1,0x1));

    t2 = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    _mm_store_ss(ptrA,_mm_add_ss(_mm_load_ss(ptrA),t1));
    _mm_store_ss(ptrB,_mm_add_ss(_mm_load_ss(ptrB),t2));
}
static gmx_inline void
gmx_mm256_update_4pot_ps(__m256 pot1, float * gmx_restrict ptrA,
                         __m256 pot2, float * gmx_restrict ptrB,
                         __m256 pot3, float * gmx_restrict ptrC,
                         __m256 pot4, float * gmx_restrict ptrD)
{
    __m128 t1,t2,t3,t4;

    pot1 = _mm256_hadd_ps(pot1,pot2);
    pot3 = _mm256_hadd_ps(pot3,pot4);
    pot1 = _mm256_hadd_ps(pot1,pot3);
    t1 = _mm_add_ps(_mm256_castps256_ps128(pot1),_mm256_extractf128_ps(pot1,0x1));
    t2 = _mm_permute_ps(t1,_MM_SHUFFLE(1,1,1,1));
    t3 = _mm_permute_ps(t1,_MM_SHUFFLE(2,2,2,2));
    t4 = _mm_permute_ps(t1,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(ptrA,_mm_add_ss(_mm_load_ss(ptrA),t1));
    _mm_store_ss(ptrB,_mm_add_ss(_mm_load_ss(ptrB),t2));
    _mm_store_ss(ptrC,_mm_add_ss(_mm_load_ss(ptrC),t3));
    _mm_store_ss(ptrD,_mm_add_ss(_mm_load_ss(ptrD),t4));
}
#endif /* _kernelutil_x86_avx_256_single_h_ */