/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <immintrin.h>

#include "gromacs/utility/basedefinitions.h"
#include "impl_x86_avx_256_simd_double.h"

namespace gmx
{
// Internal utility function: Full 4x4 transpose of __m256d
static inline void gmx_simdcall avx256Transpose4By4(__m256d* v0, __m256d* v1, __m256d* v2, __m256d* v3)
{
    __m256d t1 = _mm256_unpacklo_pd(*v0, *v1);
    __m256d t2 = _mm256_unpackhi_pd(*v0, *v1);
    __m256d t3 = _mm256_unpacklo_pd(*v2, *v3);
    __m256d t4 = _mm256_unpackhi_pd(*v2, *v3);
    *v0        = _mm256_permute2f128_pd(t1, t3, 0x20);
    *v1        = _mm256_permute2f128_pd(t2, t4, 0x20);
    *v2        = _mm256_permute2f128_pd(t1, t3, 0x31);
    *v3        = _mm256_permute2f128_pd(t2, t4, 0x31);
}
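
// Illustration of the transpose above (a sketch, not additional API): with
//   v0 = {a0,a1,a2,a3}, v1 = {b0,b1,b2,b3}, v2 = {c0,c1,c2,c3}, v3 = {d0,d1,d2,d3}
// the unpack steps build pairs such as {a0,b0,a2,b2} within each 128-bit lane,
// and the lane permutes recombine them so that on return
//   v0 = {a0,b0,c0,d0}, v1 = {a1,b1,c1,d1}, v2 = {a2,b2,c2,d2}, v3 = {a3,b3,c3,d3}.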
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
                                                    const std::int32_t offset[],
                                                    SimdDouble*        v0,
                                                    SimdDouble*        v1,
                                                    SimdDouble*        v2,
                                                    SimdDouble*        v3)
{
    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 32 == 0);
    assert(align % 4 == 0);

    v0->simdInternal_ = _mm256_load_pd(base + align * offset[0]);
    v1->simdInternal_ = _mm256_load_pd(base + align * offset[1]);
    v2->simdInternal_ = _mm256_load_pd(base + align * offset[2]);
    v3->simdInternal_ = _mm256_load_pd(base + align * offset[3]);

    avx256Transpose4By4(&v0->simdInternal_, &v1->simdInternal_, &v2->simdInternal_, &v3->simdInternal_);
}
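
// Usage sketch (hypothetical names; assumes particle data stored as aligned
// quadruplets, e.g. x/y/z/q with align == 4):
//   alignas(32) double xyzq[4 * N];
//   SimdDouble         x, y, z, q;
//   gatherLoadTranspose<4>(xyzq, idx, &x, &y, &z, &q);
// loads the four particles selected by idx[0..3] into one register per field.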
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2, t3, t4;
    __m256d tA, tB;

    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    t1 = _mm_load_pd(base + align * offset[0]);
    t2 = _mm_load_pd(base + align * offset[1]);
    t3 = _mm_load_pd(base + align * offset[2]);
    t4 = _mm_load_pd(base + align * offset[3]);

    tA = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
    tB = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);

    v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
}
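
// Design note: the pair version above issues only 128-bit loads, so 16-byte
// alignment (two doubles) suffices; that is why two is the best alignment to
// request for pair data, as the constant below records.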
static const int c_simdBestPairAlignmentDouble = 2;
// With the implementation below, thread-sanitizer can detect false positives.
// For loading a triplet, we load 4 doubles and ignore the last. Another thread
// might write to this element, but that will not affect the result.
// On AVX2 we can use a gather intrinsic instead.
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
                                                     const std::int32_t offset[],
                                                     SimdDouble*        v0,
                                                     SimdDouble*        v1,
                                                     SimdDouble*        v2)
{
    assert(std::size_t(offset) % 16 == 0);

    __m256d t1, t2, t3, t4, t5, t6, t7, t8;
    if (align % 4 == 0)
    {
        // we can use aligned loads since base should also be aligned in this case
        assert(std::size_t(base) % 32 == 0);
        t1 = _mm256_load_pd(base + align * offset[0]);
        t2 = _mm256_load_pd(base + align * offset[1]);
        t3 = _mm256_load_pd(base + align * offset[2]);
        t4 = _mm256_load_pd(base + align * offset[3]);
    }
    else
    {
        t1 = _mm256_loadu_pd(base + align * offset[0]);
        t2 = _mm256_loadu_pd(base + align * offset[1]);
        t3 = _mm256_loadu_pd(base + align * offset[2]);
        t4 = _mm256_loadu_pd(base + align * offset[3]);
    }
    t5                = _mm256_unpacklo_pd(t1, t2);
    t6                = _mm256_unpackhi_pd(t1, t2);
    t7                = _mm256_unpacklo_pd(t3, t4);
    t8                = _mm256_unpackhi_pd(t3, t4);
    v0->simdInternal_ = _mm256_permute2f128_pd(t5, t7, 0x20);
    v1->simdInternal_ = _mm256_permute2f128_pd(t6, t8, 0x20);
    v2->simdInternal_ = _mm256_permute2f128_pd(t5, t7, 0x31);
}
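
// Hedged sketch of the AVX2 gather mentioned above (requires an AVX2 target,
// so it cannot be used in this plain-AVX file):
//   __m128i voff = _mm_mullo_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(offset)),
//                                  _mm_set1_epi32(align));
//   __m256d x    = _mm256_i32gather_pd(base + 0, voff, 8);
// with base + 1 and base + 2 gathering the y and z components analogously.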
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(double*            base,
                                                       const std::int32_t offset[],
                                                       SimdDouble         v0,
                                                       SimdDouble         v1,
                                                       SimdDouble         v2)
{
    __m256d t0, t1, t2;

    assert(std::size_t(offset) % 16 == 0);

    t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
    t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
    t2 = _mm256_unpackhi_pd(v2.simdInternal_, v2.simdInternal_); // z1 z1 | z3 z3

    _mm_storeu_pd(base + align * offset[0], _mm256_castpd256_pd128(t0));
    _mm_storeu_pd(base + align * offset[1], _mm256_castpd256_pd128(t1));
    _mm_storeu_pd(base + align * offset[2], _mm256_extractf128_pd(t0, 0x1));
    _mm_storeu_pd(base + align * offset[3], _mm256_extractf128_pd(t1, 0x1));
    _mm_store_sd(base + align * offset[0] + 2, _mm256_castpd256_pd128(v2.simdInternal_));
    _mm_store_sd(base + align * offset[1] + 2, _mm256_castpd256_pd128(t2));
    _mm_store_sd(base + align * offset[2] + 2, _mm256_extractf128_pd(v2.simdInternal_, 0x1));
    _mm_store_sd(base + align * offset[3] + 2, _mm256_extractf128_pd(t2, 0x1));
}
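
// Semantics sketch: for i = 0..3 this writes the triplet
//   base[align * offset[i] + 0..2] = { v0[i], v1[i], v2[i] },
// i.e. it scatters the transpose of three SIMD registers as x/y/z triplets.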
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    __m256d t0, t1;
    __m128d t2, tA, tB;

    assert(std::size_t(offset) % 16 == 0);

    if (align % 4 == 0)
    {
        // we can use aligned load/store
        t0 = _mm256_setzero_pd();
        avx256Transpose4By4(&v0.simdInternal_, &v1.simdInternal_, &v2.simdInternal_, &t0);
        _mm256_store_pd(base + align * offset[0],
                        _mm256_add_pd(_mm256_load_pd(base + align * offset[0]), v0.simdInternal_));
        _mm256_store_pd(base + align * offset[1],
                        _mm256_add_pd(_mm256_load_pd(base + align * offset[1]), v1.simdInternal_));
        _mm256_store_pd(base + align * offset[2],
                        _mm256_add_pd(_mm256_load_pd(base + align * offset[2]), v2.simdInternal_));
        _mm256_store_pd(base + align * offset[3],
                        _mm256_add_pd(_mm256_load_pd(base + align * offset[3]), t0));
    }
    else
    {
        t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
        t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
        t2 = _mm256_extractf128_pd(v2.simdInternal_, 0x1);           // z2 z3

        tA = _mm_loadu_pd(base + align * offset[0]);
        tB = _mm_load_sd(base + align * offset[0] + 2);
        tA = _mm_add_pd(tA, _mm256_castpd256_pd128(t0));
        tB = _mm_add_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[0], tA);
        _mm_store_sd(base + align * offset[0] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[1]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[1] + 2);
        tA = _mm_add_pd(tA, _mm256_castpd256_pd128(t1));
        tB = _mm_add_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[1], tA);
        _mm_storeh_pd(base + align * offset[1] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[2]);
        tB = _mm_load_sd(base + align * offset[2] + 2);
        tA = _mm_add_pd(tA, _mm256_extractf128_pd(t0, 0x1));
        tB = _mm_add_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[2], tA);
        _mm_store_sd(base + align * offset[2] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[3]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[3] + 2);
        tA = _mm_add_pd(tA, _mm256_extractf128_pd(t1, 0x1));
        tB = _mm_add_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[3], tA);
        _mm_storeh_pd(base + align * offset[3] + 2, tB);
    }
}
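
// Semantics sketch: for i = 0..3,
//   base[align * offset[i] + 0] += v0[i];
//   base[align * offset[i] + 1] += v1[i];
//   base[align * offset[i] + 2] += v2[i];
// which is the typical pattern for accumulating x/y/z force contributions.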
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    __m256d t0, t1;
    __m128d t2, tA, tB;

    assert(std::size_t(offset) % 16 == 0);

    if (align % 4 == 0)
    {
        // we can use aligned load/store
        t0 = _mm256_setzero_pd();
        avx256Transpose4By4(&v0.simdInternal_, &v1.simdInternal_, &v2.simdInternal_, &t0);
        _mm256_store_pd(base + align * offset[0],
                        _mm256_sub_pd(_mm256_load_pd(base + align * offset[0]), v0.simdInternal_));
        _mm256_store_pd(base + align * offset[1],
                        _mm256_sub_pd(_mm256_load_pd(base + align * offset[1]), v1.simdInternal_));
        _mm256_store_pd(base + align * offset[2],
                        _mm256_sub_pd(_mm256_load_pd(base + align * offset[2]), v2.simdInternal_));
        _mm256_store_pd(base + align * offset[3],
                        _mm256_sub_pd(_mm256_load_pd(base + align * offset[3]), t0));
    }
    else
    {
        t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
        t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
        t2 = _mm256_extractf128_pd(v2.simdInternal_, 0x1);           // z2 z3

        tA = _mm_loadu_pd(base + align * offset[0]);
        tB = _mm_load_sd(base + align * offset[0] + 2);
        tA = _mm_sub_pd(tA, _mm256_castpd256_pd128(t0));
        tB = _mm_sub_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[0], tA);
        _mm_store_sd(base + align * offset[0] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[1]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[1] + 2);
        tA = _mm_sub_pd(tA, _mm256_castpd256_pd128(t1));
        tB = _mm_sub_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[1], tA);
        _mm_storeh_pd(base + align * offset[1] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[2]);
        tB = _mm_load_sd(base + align * offset[2] + 2);
        tA = _mm_sub_pd(tA, _mm256_extractf128_pd(t0, 0x1));
        tB = _mm_sub_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[2], tA);
        _mm_store_sd(base + align * offset[2] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[3]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[3] + 2);
        tA = _mm_sub_pd(tA, _mm256_extractf128_pd(t1, 0x1));
        tB = _mm_sub_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[3], tA);
        _mm_storeh_pd(base + align * offset[3] + 2, tB);
    }
}
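
// transposeScatterDecrU mirrors transposeScatterIncrU exactly, with
// subtraction replacing addition in every accumulation step.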
static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
                                                        SimdDouble* triplets0,
                                                        SimdDouble* triplets1,
                                                        SimdDouble* triplets2)
{
    __m256d t0 = _mm256_permute2f128_pd(scalar.simdInternal_, scalar.simdInternal_, 0x21);
    __m256d t1 = _mm256_permute_pd(scalar.simdInternal_, 0b0000);
    __m256d t2 = _mm256_permute_pd(scalar.simdInternal_, 0b1111);

    triplets0->simdInternal_ = _mm256_blend_pd(t1, t0, 0b1100);
    triplets1->simdInternal_ = _mm256_blend_pd(t2, t1, 0b1100);
    triplets2->simdInternal_ = _mm256_blend_pd(t0, t2, 0b1100);
}
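
// Worked example of the blends above: scalar = {s0, s1, s2, s3} yields
//   triplets0 = {s0, s0, s0, s1}
//   triplets1 = {s1, s1, s2, s2}
//   triplets2 = {s2, s3, s3, s3}
// so each scalar is replicated three times, matching x/y/z triplet layout.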
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
                                                             SimdDInt32    offset,
                                                             SimdDouble*   v0,
                                                             SimdDouble*   v1,
                                                             SimdDouble*   v2,
                                                             SimdDouble*   v3)
{
    assert(std::size_t(base) % 32 == 0);
    assert(align % 4 == 0);

    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
    _mm_store_si128(reinterpret_cast<__m128i*>(ioffset), offset.simdInternal_);

    v0->simdInternal_ = _mm256_load_pd(base + align * ioffset[0]);
    v1->simdInternal_ = _mm256_load_pd(base + align * ioffset[1]);
    v2->simdInternal_ = _mm256_load_pd(base + align * ioffset[2]);
    v3->simdInternal_ = _mm256_load_pd(base + align * ioffset[3]);

    avx256Transpose4By4(&v0->simdInternal_, &v1->simdInternal_, &v2->simdInternal_, &v3->simdInternal_);
}
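
// Design note: plain AVX has no gather instruction, so the SIMD integer
// offsets are first spilled to an aligned scratch array and then used for
// four ordinary scalar-addressed vector loads.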
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2, t3, t4;
    __m256d tA, tB;

    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
    _mm_store_si128(reinterpret_cast<__m128i*>(ioffset), offset.simdInternal_);

    t1 = _mm_load_pd(base + align * ioffset[0]);
    t2 = _mm_load_pd(base + align * ioffset[1]);
    t3 = _mm_load_pd(base + align * ioffset[2]);
    t4 = _mm_load_pd(base + align * ioffset[3]);

    tA = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
    tB = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);

    v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
}
template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2, t3, t4;
    __m256d tA, tB;

    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
    _mm_store_si128(reinterpret_cast<__m128i*>(ioffset), offset.simdInternal_);

    t1 = _mm_loadu_pd(base + align * ioffset[0]);
    t2 = _mm_loadu_pd(base + align * ioffset[1]);
    t3 = _mm_loadu_pd(base + align * ioffset[2]);
    t4 = _mm_loadu_pd(base + align * ioffset[3]);

    tA = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
    tB = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);

    v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
}
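
// Note: this variant matches the aligned version above except that it uses
// _mm_loadu_pd, and therefore places no alignment requirement on base or align.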
static inline double gmx_simdcall
reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
{
    __m256d t0, t1, t2;
    __m128d a0, a1;

    assert(std::size_t(m) % 32 == 0);

    t0 = _mm256_hadd_pd(v0.simdInternal_, v1.simdInternal_);
    t1 = _mm256_hadd_pd(v2.simdInternal_, v3.simdInternal_);
    t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
    t0 = _mm256_add_pd(t0, t2);
    t1 = _mm256_add_pd(t1, t2);
    t0 = _mm256_blend_pd(t0, t1, 0b1100);

    t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
    _mm256_store_pd(m, t1);

    t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
    a0 = _mm256_castpd256_pd128(t0);
    a1 = _mm256_extractf128_pd(t0, 0x1);
    a0 = _mm_add_sd(a0, a1);

    return *reinterpret_cast<double*>(&a0);
}
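
// Semantics sketch: the hadd/permute/blend steps leave
//   t0 = { sum(v0), sum(v1), sum(v2), sum(v3) },
// so m[i] += sum_over_lanes(vi) for i = 0..3, and the return value is the
// total sum(v0) + sum(v1) + sum(v2) + sum(v3) before the increment of m.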
} // namespace gmx

#endif // GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H