/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_AVX_256_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_X86_AVX_256_UTIL_FLOAT_H

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <immintrin.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_x86_avx_256_simd_float.h"

namespace gmx
{
/* This is an internal helper function used by decr3Hsimd(...).
 */
static inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
{
    assert(std::size_t(m) % 16 == 0);

    __m128 asum = _mm_add_ps(_mm256_castps256_ps128(a.simdInternal_),
                             _mm256_extractf128_ps(a.simdInternal_, 0x1));

    _mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum));
}
/* This is an internal helper function used by the three functions storing,
 * incrementing, or decrementing data. Do NOT use it outside this file.
 *
 * Input v0: [x0 x1 x2 x3 x4 x5 x6 x7]
 * Input v1: [y0 y1 y2 y3 y4 y5 y6 y7]
 * Input v2: [z0 z1 z2 z3 z4 z5 z6 z7]
 * Input v3: Unused
 *
 * Output v0: [x0 y0 z0 -  x4 y4 z4 - ]
 * Output v1: [x1 y1 z1 -  x5 y5 z5 - ]
 * Output v2: [x2 y2 z2 -  x6 y6 z6 - ]
 * Output v3: [x3 y3 z3 -  x7 y7 z7 - ]
 *
 * Here, - means undefined. Note that such values will not be zero!
 */
static inline void gmx_simdcall avx256Transpose3By4InLanes(__m256* v0, __m256* v1, __m256* v2, __m256* v3)
{
    __m256 t1 = _mm256_unpacklo_ps(*v0, *v1);
    __m256 t2 = _mm256_unpackhi_ps(*v0, *v1);
    *v0       = _mm256_shuffle_ps(t1, *v2, _MM_SHUFFLE(0, 0, 1, 0));
    *v1       = _mm256_shuffle_ps(t1, *v2, _MM_SHUFFLE(0, 1, 3, 2));
    *v3       = _mm256_shuffle_ps(t2, *v2, _MM_SHUFFLE(0, 3, 3, 2));
    *v2       = _mm256_shuffle_ps(t2, *v2, _MM_SHUFFLE(0, 2, 1, 0));
}
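/* Worked illustration, derived here from the shuffle masks above: with the
 * inputs from the description,
 *   t1 = [x0 y0 x1 y1 | x4 y4 x5 y5]
 *   t2 = [x2 y2 x3 y3 | x6 y6 x7 y7]
 * and the four shuffles produce
 *   v0 = [x0 y0 z0 z0 | x4 y4 z4 z4]
 *   v1 = [x1 y1 z1 z0 | x5 y5 z5 z4]
 *   v3 = [x3 y3 z3 z0 | x7 y7 z7 z4]
 *   v2 = [x2 y2 z2 z0 | x6 y6 z6 z4]
 * so the "-" slots end up holding copies of z0/z4, which is why they are
 * documented as undefined and non-zero.
 */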
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
                                                    const std::int32_t offset[],
                                                    SimdFloat*         v0,
                                                    SimdFloat*         v1,
                                                    SimdFloat*         v2,
                                                    SimdFloat*         v3)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8;
    __m256 tA, tB, tC, tD;

    assert(std::size_t(offset) % 32 == 0);
    assert(std::size_t(base) % 16 == 0);
    assert(align % 4 == 0);

    t1 = _mm_load_ps(base + align * offset[0]);
    t2 = _mm_load_ps(base + align * offset[1]);
    t3 = _mm_load_ps(base + align * offset[2]);
    t4 = _mm_load_ps(base + align * offset[3]);
    t5 = _mm_load_ps(base + align * offset[4]);
    t6 = _mm_load_ps(base + align * offset[5]);
    t7 = _mm_load_ps(base + align * offset[6]);
    t8 = _mm_load_ps(base + align * offset[7]);

    v0->simdInternal_ = _mm256_insertf128_ps(_mm256_castps128_ps256(t1), t5, 0x1);
    v1->simdInternal_ = _mm256_insertf128_ps(_mm256_castps128_ps256(t2), t6, 0x1);
    v2->simdInternal_ = _mm256_insertf128_ps(_mm256_castps128_ps256(t3), t7, 0x1);
    v3->simdInternal_ = _mm256_insertf128_ps(_mm256_castps128_ps256(t4), t8, 0x1);

    tA = _mm256_unpacklo_ps(v0->simdInternal_, v1->simdInternal_);
    tB = _mm256_unpacklo_ps(v2->simdInternal_, v3->simdInternal_);
    tC = _mm256_unpackhi_ps(v0->simdInternal_, v1->simdInternal_);
    tD = _mm256_unpackhi_ps(v2->simdInternal_, v3->simdInternal_);

    v0->simdInternal_ = _mm256_shuffle_ps(tA, tB, _MM_SHUFFLE(1, 0, 1, 0));
    v1->simdInternal_ = _mm256_shuffle_ps(tA, tB, _MM_SHUFFLE(3, 2, 3, 2));
    v2->simdInternal_ = _mm256_shuffle_ps(tC, tD, _MM_SHUFFLE(1, 0, 1, 0));
    v3->simdInternal_ = _mm256_shuffle_ps(tC, tD, _MM_SHUFFLE(3, 2, 3, 2));
}
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8;
    __m256 tA, tB, tC, tD;

    assert(std::size_t(offset) % 32 == 0);
    assert(std::size_t(base) % 8 == 0);
    assert(align % 2 == 0);

    t1 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[0]));
    t2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[1]));
    t3 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[2]));
    t4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[3]));
    t5 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[4]));
    t6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[5]));
    t7 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[6]));
    t8 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[7]));

    tA = _mm256_insertf128_ps(_mm256_castps128_ps256(t1), t5, 0x1);
    tB = _mm256_insertf128_ps(_mm256_castps128_ps256(t2), t6, 0x1);
    tC = _mm256_insertf128_ps(_mm256_castps128_ps256(t3), t7, 0x1);
    tD = _mm256_insertf128_ps(_mm256_castps128_ps256(t4), t8, 0x1);

    tA                = _mm256_unpacklo_ps(tA, tC);
    tB                = _mm256_unpacklo_ps(tB, tD);
    v0->simdInternal_ = _mm256_unpacklo_ps(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_ps(tA, tB);
}
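/* Recommended alignment (in floats) for packed pair data accessed with the
 * two-output gatherLoadTranspose() above. This note is an interpretation of
 * the constant's role based on that routine's requirement that align be a
 * multiple of 2.
 */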
static const int c_simdBestPairAlignmentFloat = 2;
// With the implementation below, thread-sanitizer can detect false positives.
// For loading a triplet, we load 4 floats and ignore the last. Another thread
// might write to this element, but that will not affect the result.
// On AVX2 we can use a gather intrinsic instead.
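// Illustration (assuming align == 3): the 4-float load for triplet i reads
// { x_i, y_i, z_i } plus the first float of the following triplet; that extra
// float only pads the register and is discarded by the transpose, so a
// concurrent write to it cannot change the values actually returned.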
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
                                                      const std::int32_t offset[],
                                                      SimdFloat*         v0,
                                                      SimdFloat*         v1,
                                                      SimdFloat*         v2)
{
    __m256 t1, t2, t3, t4, t5, t6, t7, t8;

    assert(std::size_t(offset) % 32 == 0);

    if (align % 4 == 0)
    {
        // we can use aligned loads since base should also be aligned in this case
        assert(std::size_t(base) % 16 == 0);
        t1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(base + align * offset[0])),
                                  _mm_load_ps(base + align * offset[4]),
                                  0x1);
        t2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(base + align * offset[1])),
                                  _mm_load_ps(base + align * offset[5]),
                                  0x1);
        t3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(base + align * offset[2])),
                                  _mm_load_ps(base + align * offset[6]),
                                  0x1);
        t4 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(base + align * offset[3])),
                                  _mm_load_ps(base + align * offset[7]),
                                  0x1);
    }
    else
    {
        // Use unaligned loads
        t1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_loadu_ps(base + align * offset[0])),
                                  _mm_loadu_ps(base + align * offset[4]),
                                  0x1);
        t2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_loadu_ps(base + align * offset[1])),
                                  _mm_loadu_ps(base + align * offset[5]),
                                  0x1);
        t3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_loadu_ps(base + align * offset[2])),
                                  _mm_loadu_ps(base + align * offset[6]),
                                  0x1);
        t4 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_loadu_ps(base + align * offset[3])),
                                  _mm_loadu_ps(base + align * offset[7]),
                                  0x1);
    }

    t5                = _mm256_unpacklo_ps(t1, t2);
    t6                = _mm256_unpacklo_ps(t3, t4);
    t7                = _mm256_unpackhi_ps(t1, t2);
    t8                = _mm256_unpackhi_ps(t3, t4);
    v0->simdInternal_ = _mm256_shuffle_ps(t5, t6, _MM_SHUFFLE(1, 0, 1, 0));
    v1->simdInternal_ = _mm256_shuffle_ps(t5, t6, _MM_SHUFFLE(3, 2, 3, 2));
    v2->simdInternal_ = _mm256_shuffle_ps(t7, t8, _MM_SHUFFLE(1, 0, 1, 0));
}
template<int align>
static inline void gmx_simdcall
transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    __m256  tv3;
    __m128i mask = _mm_set_epi32(0, -1, -1, -1);

    assert(std::size_t(offset) % 32 == 0);

    avx256Transpose3By4InLanes(&v0.simdInternal_, &v1.simdInternal_, &v2.simdInternal_, &tv3);
    _mm_maskstore_ps(base + align * offset[0], mask, _mm256_castps256_ps128(v0.simdInternal_));
    _mm_maskstore_ps(base + align * offset[1], mask, _mm256_castps256_ps128(v1.simdInternal_));
    _mm_maskstore_ps(base + align * offset[2], mask, _mm256_castps256_ps128(v2.simdInternal_));
    _mm_maskstore_ps(base + align * offset[3], mask, _mm256_castps256_ps128(tv3));
    _mm_maskstore_ps(base + align * offset[4], mask, _mm256_extractf128_ps(v0.simdInternal_, 0x1));
    _mm_maskstore_ps(base + align * offset[5], mask, _mm256_extractf128_ps(v1.simdInternal_, 0x1));
    _mm_maskstore_ps(base + align * offset[6], mask, _mm256_extractf128_ps(v2.simdInternal_, 0x1));
    _mm_maskstore_ps(base + align * offset[7], mask, _mm256_extractf128_ps(tv3, 0x1));
}
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128 tA, tB, tC, tD, tE, tF, tG, tH, tX;

    if (align < 4)
    {
        t5  = _mm256_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
        t6  = _mm256_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
        t7  = _mm256_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
        t8  = _mm256_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
        t9  = _mm256_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
        t10 = _mm256_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));

        tA = _mm256_castps256_ps128(t7);
        tB = _mm256_castps256_ps128(t8);
        tC = _mm256_castps256_ps128(t9);
        tD = _mm256_castps256_ps128(t10);
        tE = _mm256_extractf128_ps(t7, 0x1);
        tF = _mm256_extractf128_ps(t8, 0x1);
        tG = _mm256_extractf128_ps(t9, 0x1);
        tH = _mm256_extractf128_ps(t10, 0x1);

        tX = _mm_load_ss(base + align * offset[0]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[0] + 1));
        tX = _mm_add_ps(tX, tA);
        _mm_store_ss(base + align * offset[0], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[0] + 1), tX);

        tX = _mm_load_ss(base + align * offset[1]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[1] + 1));
        tX = _mm_add_ps(tX, tB);
        _mm_store_ss(base + align * offset[1], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[1] + 1), tX);

        tX = _mm_load_ss(base + align * offset[2]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[2] + 1));
        tX = _mm_add_ps(tX, tC);
        _mm_store_ss(base + align * offset[2], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[2] + 1), tX);

        tX = _mm_load_ss(base + align * offset[3]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[3] + 1));
        tX = _mm_add_ps(tX, tD);
        _mm_store_ss(base + align * offset[3], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[3] + 1), tX);

        tX = _mm_load_ss(base + align * offset[4]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[4] + 1));
        tX = _mm_add_ps(tX, tE);
        _mm_store_ss(base + align * offset[4], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[4] + 1), tX);

        tX = _mm_load_ss(base + align * offset[5]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[5] + 1));
        tX = _mm_add_ps(tX, tF);
        _mm_store_ss(base + align * offset[5], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[5] + 1), tX);

        tX = _mm_load_ss(base + align * offset[6]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[6] + 1));
        tX = _mm_add_ps(tX, tG);
        _mm_store_ss(base + align * offset[6], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[6] + 1), tX);

        tX = _mm_load_ss(base + align * offset[7]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[7] + 1));
        tX = _mm_add_ps(tX, tH);
        _mm_store_ss(base + align * offset[7], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[7] + 1), tX);
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        t1 = _mm256_unpacklo_ps(v0.simdInternal_, v2.simdInternal_);
        t2 = _mm256_unpackhi_ps(v0.simdInternal_, v2.simdInternal_);
        t3 = _mm256_unpacklo_ps(v1.simdInternal_, _mm256_setzero_ps());
        t4 = _mm256_unpackhi_ps(v1.simdInternal_, _mm256_setzero_ps());
        t5 = _mm256_unpacklo_ps(t1, t3); // x0 y0 z0 0 | x4 y4 z4 0
        t6 = _mm256_unpackhi_ps(t1, t3); // x1 y1 z1 0 | x5 y5 z5 0
        t7 = _mm256_unpacklo_ps(t2, t4); // x2 y2 z2 0 | x6 y6 z6 0
        t8 = _mm256_unpackhi_ps(t2, t4); // x3 y3 z3 0 | x7 y7 z7 0
        if (align % 4 == 0)
        {
            // We can use aligned load & store
            _mm_store_ps(base + align * offset[0],
                         _mm_add_ps(_mm_load_ps(base + align * offset[0]), _mm256_castps256_ps128(t5)));
            _mm_store_ps(base + align * offset[1],
                         _mm_add_ps(_mm_load_ps(base + align * offset[1]), _mm256_castps256_ps128(t6)));
            _mm_store_ps(base + align * offset[2],
                         _mm_add_ps(_mm_load_ps(base + align * offset[2]), _mm256_castps256_ps128(t7)));
            _mm_store_ps(base + align * offset[3],
                         _mm_add_ps(_mm_load_ps(base + align * offset[3]), _mm256_castps256_ps128(t8)));
            _mm_store_ps(base + align * offset[4],
                         _mm_add_ps(_mm_load_ps(base + align * offset[4]), _mm256_extractf128_ps(t5, 0x1)));
            _mm_store_ps(base + align * offset[5],
                         _mm_add_ps(_mm_load_ps(base + align * offset[5]), _mm256_extractf128_ps(t6, 0x1)));
            _mm_store_ps(base + align * offset[6],
                         _mm_add_ps(_mm_load_ps(base + align * offset[6]), _mm256_extractf128_ps(t7, 0x1)));
            _mm_store_ps(base + align * offset[7],
                         _mm_add_ps(_mm_load_ps(base + align * offset[7]), _mm256_extractf128_ps(t8, 0x1)));
        }
        else
        {
            // alignment >=5, but not a multiple of 4
            _mm_storeu_ps(base + align * offset[0],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[0]), _mm256_castps256_ps128(t5)));
            _mm_storeu_ps(base + align * offset[1],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[1]), _mm256_castps256_ps128(t6)));
            _mm_storeu_ps(base + align * offset[2],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[2]), _mm256_castps256_ps128(t7)));
            _mm_storeu_ps(base + align * offset[3],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[3]), _mm256_castps256_ps128(t8)));
            _mm_storeu_ps(base + align * offset[4],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[4]), _mm256_extractf128_ps(t5, 0x1)));
            _mm_storeu_ps(base + align * offset[5],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[5]), _mm256_extractf128_ps(t6, 0x1)));
            _mm_storeu_ps(base + align * offset[6],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[6]), _mm256_extractf128_ps(t7, 0x1)));
            _mm_storeu_ps(base + align * offset[7],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[7]), _mm256_extractf128_ps(t8, 0x1)));
        }
    }
}
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    __m256 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128 tA, tB, tC, tD, tE, tF, tG, tH, tX;

    if (align < 4)
    {
        t5  = _mm256_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
        t6  = _mm256_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
        t7  = _mm256_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
        t8  = _mm256_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
        t9  = _mm256_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
        t10 = _mm256_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));

        tA = _mm256_castps256_ps128(t7);
        tB = _mm256_castps256_ps128(t8);
        tC = _mm256_castps256_ps128(t9);
        tD = _mm256_castps256_ps128(t10);
        tE = _mm256_extractf128_ps(t7, 0x1);
        tF = _mm256_extractf128_ps(t8, 0x1);
        tG = _mm256_extractf128_ps(t9, 0x1);
        tH = _mm256_extractf128_ps(t10, 0x1);

        tX = _mm_load_ss(base + align * offset[0]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[0] + 1));
        tX = _mm_sub_ps(tX, tA);
        _mm_store_ss(base + align * offset[0], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[0] + 1), tX);

        tX = _mm_load_ss(base + align * offset[1]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[1] + 1));
        tX = _mm_sub_ps(tX, tB);
        _mm_store_ss(base + align * offset[1], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[1] + 1), tX);

        tX = _mm_load_ss(base + align * offset[2]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[2] + 1));
        tX = _mm_sub_ps(tX, tC);
        _mm_store_ss(base + align * offset[2], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[2] + 1), tX);

        tX = _mm_load_ss(base + align * offset[3]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[3] + 1));
        tX = _mm_sub_ps(tX, tD);
        _mm_store_ss(base + align * offset[3], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[3] + 1), tX);

        tX = _mm_load_ss(base + align * offset[4]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[4] + 1));
        tX = _mm_sub_ps(tX, tE);
        _mm_store_ss(base + align * offset[4], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[4] + 1), tX);

        tX = _mm_load_ss(base + align * offset[5]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[5] + 1));
        tX = _mm_sub_ps(tX, tF);
        _mm_store_ss(base + align * offset[5], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[5] + 1), tX);

        tX = _mm_load_ss(base + align * offset[6]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[6] + 1));
        tX = _mm_sub_ps(tX, tG);
        _mm_store_ss(base + align * offset[6], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[6] + 1), tX);

        tX = _mm_load_ss(base + align * offset[7]);
        tX = _mm_loadh_pi(tX, reinterpret_cast<__m64*>(base + align * offset[7] + 1));
        tX = _mm_sub_ps(tX, tH);
        _mm_store_ss(base + align * offset[7], tX);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[7] + 1), tX);
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        t1 = _mm256_unpacklo_ps(v0.simdInternal_, v2.simdInternal_);
        t2 = _mm256_unpackhi_ps(v0.simdInternal_, v2.simdInternal_);
        t3 = _mm256_unpacklo_ps(v1.simdInternal_, _mm256_setzero_ps());
        t4 = _mm256_unpackhi_ps(v1.simdInternal_, _mm256_setzero_ps());
        t5 = _mm256_unpacklo_ps(t1, t3); // x0 y0 z0 0 | x4 y4 z4 0
        t6 = _mm256_unpackhi_ps(t1, t3); // x1 y1 z1 0 | x5 y5 z5 0
        t7 = _mm256_unpacklo_ps(t2, t4); // x2 y2 z2 0 | x6 y6 z6 0
        t8 = _mm256_unpackhi_ps(t2, t4); // x3 y3 z3 0 | x7 y7 z7 0
        if (align % 4 == 0)
        {
            // We can use aligned load & store
            _mm_store_ps(base + align * offset[0],
                         _mm_sub_ps(_mm_load_ps(base + align * offset[0]), _mm256_castps256_ps128(t5)));
            _mm_store_ps(base + align * offset[1],
                         _mm_sub_ps(_mm_load_ps(base + align * offset[1]), _mm256_castps256_ps128(t6)));
            _mm_store_ps(base + align * offset[2],
                         _mm_sub_ps(_mm_load_ps(base + align * offset[2]), _mm256_castps256_ps128(t7)));
            _mm_store_ps(base + align * offset[3],
                         _mm_sub_ps(_mm_load_ps(base + align * offset[3]), _mm256_castps256_ps128(t8)));
            _mm_store_ps(base + align * offset[4],
                         _mm_sub_ps(_mm_load_ps(base + align * offset[4]), _mm256_extractf128_ps(t5, 0x1)));
            _mm_store_ps(base + align * offset[5],
                         _mm_sub_ps(_mm_load_ps(base + align * offset[5]), _mm256_extractf128_ps(t6, 0x1)));
            _mm_store_ps(base + align * offset[6],
                         _mm_sub_ps(_mm_load_ps(base + align * offset[6]), _mm256_extractf128_ps(t7, 0x1)));
            _mm_store_ps(base + align * offset[7],
                         _mm_sub_ps(_mm_load_ps(base + align * offset[7]), _mm256_extractf128_ps(t8, 0x1)));
        }
        else
        {
            // alignment >=5, but not a multiple of 4
            _mm_storeu_ps(base + align * offset[0],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[0]), _mm256_castps256_ps128(t5)));
            _mm_storeu_ps(base + align * offset[1],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[1]), _mm256_castps256_ps128(t6)));
            _mm_storeu_ps(base + align * offset[2],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[2]), _mm256_castps256_ps128(t7)));
            _mm_storeu_ps(base + align * offset[3],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[3]), _mm256_castps256_ps128(t8)));
            _mm_storeu_ps(base + align * offset[4],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[4]), _mm256_extractf128_ps(t5, 0x1)));
            _mm_storeu_ps(base + align * offset[5],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[5]), _mm256_extractf128_ps(t6, 0x1)));
            _mm_storeu_ps(base + align * offset[6],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[6]), _mm256_extractf128_ps(t7, 0x1)));
            _mm_storeu_ps(base + align * offset[7],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[7]), _mm256_extractf128_ps(t8, 0x1)));
        }
    }
}
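/* Layout produced by the permutes/blends below (derived by tracing the
 * shuffle masks): for scalar = [s0 s1 s2 s3 s4 s5 s6 s7],
 *   triplets0 = [s0 s0 s0 s1 s1 s1 s2 s2]
 *   triplets1 = [s2 s3 s3 s3 s4 s4 s4 s5]
 *   triplets2 = [s5 s5 s6 s6 s6 s7 s7 s7]
 * i.e. element i of the concatenated 24-element output equals scalar element i/3.
 */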
static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
                                                        SimdFloat* triplets0,
                                                        SimdFloat* triplets1,
                                                        SimdFloat* triplets2)
{
    __m256 t0 = _mm256_permute2f128_ps(scalar.simdInternal_, scalar.simdInternal_, 0x21);
    __m256 t1 = _mm256_permute_ps(scalar.simdInternal_, _MM_SHUFFLE(1, 0, 0, 0));
    __m256 t2 = _mm256_permute_ps(t0, _MM_SHUFFLE(2, 2, 1, 1));
    __m256 t3 = _mm256_permute_ps(scalar.simdInternal_, _MM_SHUFFLE(3, 3, 3, 2));
    triplets0->simdInternal_ = _mm256_blend_ps(t1, t2, 0xF0);
    triplets1->simdInternal_ = _mm256_blend_ps(t3, t1, 0xF0);
    triplets2->simdInternal_ = _mm256_blend_ps(t2, t3, 0xF0);
}
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
                                                             SimdFInt32   simdoffset,
                                                             SimdFloat*   v0,
                                                             SimdFloat*   v1,
                                                             SimdFloat*   v2,
                                                             SimdFloat*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset[GMX_SIMD_FLOAT_WIDTH];
    _mm256_store_si256(reinterpret_cast<__m256i*>(offset), simdoffset.simdInternal_);
    gatherLoadTranspose<align>(base, offset, v0, v1, v2, v3);
}
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 simdoffset, SimdFloat* v0, SimdFloat* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset[GMX_SIMD_FLOAT_WIDTH];
    _mm256_store_si256(reinterpret_cast<__m256i*>(offset), simdoffset.simdInternal_);
    gatherLoadTranspose<align>(base, offset, v0, v1);
}
template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 simdoffset, SimdFloat* v0, SimdFloat* v1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8;
    __m256 tA, tB, tC, tD;

    alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset[GMX_SIMD_FLOAT_WIDTH];
    _mm256_store_si256(reinterpret_cast<__m256i*>(offset), simdoffset.simdInternal_);

    t1 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[0]));
    t2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[1]));
    t3 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[2]));
    t4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[3]));
    t5 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[4]));
    t6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[5]));
    t7 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[6]));
    t8 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base + align * offset[7]));

    tA = _mm256_insertf128_ps(_mm256_castps128_ps256(t1), t5, 0x1);
    tB = _mm256_insertf128_ps(_mm256_castps128_ps256(t2), t6, 0x1);
    tC = _mm256_insertf128_ps(_mm256_castps128_ps256(t3), t7, 0x1);
    tD = _mm256_insertf128_ps(_mm256_castps128_ps256(t4), t8, 0x1);

    tA                = _mm256_unpacklo_ps(tA, tC);
    tB                = _mm256_unpacklo_ps(tB, tD);
    v0->simdInternal_ = _mm256_unpacklo_ps(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_ps(tA, tB);
}
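/* Summary derived from the code below: each of v0-v3 is reduced to a single
 * float, the four sums are added to m[0..3] (m must be 16-byte aligned), and
 * the total of the four sums is returned.
 */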
static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
{
    __m128 t0, t2;

    assert(std::size_t(m) % 16 == 0);

    v0.simdInternal_ = _mm256_hadd_ps(v0.simdInternal_, v1.simdInternal_);
    v2.simdInternal_ = _mm256_hadd_ps(v2.simdInternal_, v3.simdInternal_);
    v0.simdInternal_ = _mm256_hadd_ps(v0.simdInternal_, v2.simdInternal_);
    t0 = _mm_add_ps(_mm256_castps256_ps128(v0.simdInternal_), _mm256_extractf128_ps(v0.simdInternal_, 0x1));

    t2 = _mm_add_ps(t0, _mm_load_ps(m));
    _mm_store_ps(m, t2);

    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
    return *reinterpret_cast<float*>(&t0);
}
/*************************************
 * Half-simd-width utility functions *
 *************************************/
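/* In this AVX-256 float implementation, "half SIMD width" refers to the two
 * 4-float 128-bit lanes of the 8-float register, so the memory arguments of
 * the functions below address blocks of 4 floats.
 */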
static inline SimdFloat gmx_simdcall loadDualHsimd(const float* m0, const float* m1)
{
    assert(std::size_t(m0) % 16 == 0);
    assert(std::size_t(m1) % 16 == 0);

    return { _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(m0)), _mm_load_ps(m1), 0x1) };
}
static inline SimdFloat gmx_simdcall loadDuplicateHsimd(const float* m)
{
    assert(std::size_t(m) % 16 == 0);

    return { _mm256_broadcast_ps(reinterpret_cast<const __m128*>(m)) };
}
static inline SimdFloat gmx_simdcall loadU1DualHsimd(const float* m)
{
    __m128 t0, t1;
    t0 = _mm_broadcast_ss(m);
    t1 = _mm_broadcast_ss(m + 1);
    return { _mm256_insertf128_ps(_mm256_castps128_ps256(t0), t1, 0x1) };
}
static inline void gmx_simdcall storeDualHsimd(float* m0, float* m1, SimdFloat a)
{
    assert(std::size_t(m0) % 16 == 0);
    assert(std::size_t(m1) % 16 == 0);

    _mm_store_ps(m0, _mm256_castps256_ps128(a.simdInternal_));
    _mm_store_ps(m1, _mm256_extractf128_ps(a.simdInternal_, 0x1));
}
static inline void gmx_simdcall incrDualHsimd(float* m0, float* m1, SimdFloat a)
{
    assert(std::size_t(m0) % 16 == 0);
    assert(std::size_t(m1) % 16 == 0);

    _mm_store_ps(m0, _mm_add_ps(_mm256_castps256_ps128(a.simdInternal_), _mm_load_ps(m0)));
    _mm_store_ps(m1, _mm_add_ps(_mm256_extractf128_ps(a.simdInternal_, 0x1), _mm_load_ps(m1)));
}
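/* Summary derived from decrHsimd() above: each of a0, a1 and a2 has its two
 * 128-bit halves added together, and the resulting 4-float sums are
 * subtracted from m[0..3], m[4..7] and m[8..11], respectively.
 */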
static inline void gmx_simdcall decr3Hsimd(float* m, SimdFloat a0, SimdFloat a1, SimdFloat a2)
{
    assert(std::size_t(m) % 16 == 0);
    decrHsimd(m, a0);
    decrHsimd(m + GMX_SIMD_FLOAT_WIDTH / 2, a1);
    decrHsimd(m + GMX_SIMD_FLOAT_WIDTH, a2);
}
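/* Summary derived from the lane layout below: four float pairs are gathered
 * from base0 (low 128-bit half) and four from base1 (high half); v0 receives
 * the first element of each pair and v1 the second.
 */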
template<int align>
static inline void gmx_simdcall gatherLoadTransposeHsimd(const float*       base0,
                                                         const float*       base1,
                                                         const std::int32_t offset[],
                                                         SimdFloat*         v0,
                                                         SimdFloat*         v1)
{
    __m128 t0, t1, t2, t3, t4, t5, t6, t7;
    __m256 tA, tB, tC, tD;

    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base0) % 8 == 0);
    assert(std::size_t(base1) % 8 == 0);
    assert(align % 2 == 0);

    t0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base0 + align * offset[0]));
    t1 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base0 + align * offset[1]));
    t2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base0 + align * offset[2]));
    t3 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base0 + align * offset[3]));
    t4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base1 + align * offset[0]));
    t5 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base1 + align * offset[1]));
    t6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base1 + align * offset[2]));
    t7 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(base1 + align * offset[3]));

    tA = _mm256_insertf128_ps(_mm256_castps128_ps256(t0), t4, 0x1);
    tB = _mm256_insertf128_ps(_mm256_castps128_ps256(t1), t5, 0x1);
    tC = _mm256_insertf128_ps(_mm256_castps128_ps256(t2), t6, 0x1);
    tD = _mm256_insertf128_ps(_mm256_castps128_ps256(t3), t7, 0x1);

    tA                = _mm256_unpacklo_ps(tA, tC);
    tB                = _mm256_unpacklo_ps(tB, tD);
    v0->simdInternal_ = _mm256_unpacklo_ps(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_ps(tA, tB);
}
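/* Half-width analogue of reduceIncr4ReturnSum(): derived from the code below,
 * the four 4-float halves of v0 and v1 are each reduced to a single sum, the
 * four sums are added to m[0..3], and their total is returned.
 */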
static inline float gmx_simdcall reduceIncr4ReturnSumHsimd(float* m, SimdFloat v0, SimdFloat v1)
{
    __m128 t0, t1;

    v0.simdInternal_ = _mm256_hadd_ps(v0.simdInternal_, v1.simdInternal_);
    t0               = _mm256_extractf128_ps(v0.simdInternal_, 0x1);
    t0               = _mm_hadd_ps(_mm256_castps256_ps128(v0.simdInternal_), t0);
    t0               = _mm_permute_ps(t0, _MM_SHUFFLE(3, 1, 2, 0));

    assert(std::size_t(m) % 16 == 0);

    t1 = _mm_add_ps(t0, _mm_load_ps(m));
    _mm_store_ps(m, t1);

    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
    return *reinterpret_cast<float*>(&t0);
}
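/* Loads 4 floats from m into the low 128-bit half and 4 floats from
 * m + offset into the high half; neither address needs to be aligned.
 */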
static inline SimdFloat gmx_simdcall loadU4NOffset(const float* m, int offset)
{
    return { _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_loadu_ps(m)), _mm_loadu_ps(m + offset), 0x1) };
}

} // namespace gmx

#endif // GMX_SIMD_IMPL_X86_AVX_256_UTIL_FLOAT_H