/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <immintrin.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_x86_avx_256_simd_double.h"

namespace gmx
{
// Internal utility function: Full 4x4 transpose of __m256d
static inline void gmx_simdcall
avx256Transpose4By4(__m256d * v0,
                    __m256d * v1,
                    __m256d * v2,
                    __m256d * v3)
{
    __m256d t1 = _mm256_unpacklo_pd(*v0, *v1);
    __m256d t2 = _mm256_unpackhi_pd(*v0, *v1);
    __m256d t3 = _mm256_unpacklo_pd(*v2, *v3);
    __m256d t4 = _mm256_unpackhi_pd(*v2, *v3);
    *v0        = _mm256_permute2f128_pd(t1, t3, 0x20);
    *v1        = _mm256_permute2f128_pd(t2, t4, 0x20);
    *v2        = _mm256_permute2f128_pd(t1, t3, 0x31);
    *v3        = _mm256_permute2f128_pd(t2, t4, 0x31);
}
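
/* Data-flow sketch (illustration only, not part of the original source): with
 * inputs
 *     *v0 = { a0, a1, a2, a3 }
 *     *v1 = { b0, b1, b2, b3 }
 *     *v2 = { c0, c1, c2, c3 }
 *     *v3 = { d0, d1, d2, d3 }
 * the unpack/permute sequence above leaves
 *     *v0 = { a0, b0, c0, d0 }
 *     *v1 = { a1, b1, c1, d1 }
 *     *v2 = { a2, b2, c2, d2 }
 *     *v3 = { a3, b3, c3, d3 }
 * i.e. a full 4x4 transpose carried out entirely in registers.
 */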
template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double *        base,
                    const std::int32_t    offset[],
                    SimdDouble *          v0,
                    SimdDouble *          v1,
                    SimdDouble *          v2,
                    SimdDouble *          v3)
{
    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 32 == 0);
    assert(align % 4 == 0);

    v0->simdInternal_ = _mm256_load_pd( base + align * offset[0] );
    v1->simdInternal_ = _mm256_load_pd( base + align * offset[1] );
    v2->simdInternal_ = _mm256_load_pd( base + align * offset[2] );
    v3->simdInternal_ = _mm256_load_pd( base + align * offset[3] );
    avx256Transpose4By4(&v0->simdInternal_, &v1->simdInternal_, &v2->simdInternal_, &v3->simdInternal_);
}
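
/* Usage sketch (illustration only; the buffer and index names are hypothetical):
 * gather x/y/z/q for four particles stored as contiguous, 32-byte-aligned
 * quadruplets and transpose them into one SIMD variable per component.
 *
 *     // xyzq: 32-byte-aligned array with 4 doubles per particle
 *     // idx : 16-byte-aligned array of 4 particle indices
 *     SimdDouble x, y, z, q;
 *     gatherLoadTranspose<4>(xyzq, idx, &x, &y, &z, &q);
 *     // x now holds { xyzq[4*idx[0]], xyzq[4*idx[1]], xyzq[4*idx[2]], xyzq[4*idx[3]] }, etc.
 */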
template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double *        base,
                    const std::int32_t    offset[],
                    SimdDouble *          v0,
                    SimdDouble *          v1)
{
    __m128d t1, t2, t3, t4;
    __m256d tA, tB;

    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    t1 = _mm_load_pd( base + align * offset[0] );
    t2 = _mm_load_pd( base + align * offset[1] );
    t3 = _mm_load_pd( base + align * offset[2] );
    t4 = _mm_load_pd( base + align * offset[3] );
    tA = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
    tB = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);

    v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
}
static const int c_simdBestPairAlignmentDouble = 2;
template <int align>
static inline void gmx_simdcall
gatherLoadUTranspose(const double *        base,
                     const std::int32_t    offset[],
                     SimdDouble *          v0,
                     SimdDouble *          v1,
                     SimdDouble *          v2)
{
    assert(std::size_t(offset) % 16 == 0);

    __m256d t1, t2, t3, t4, t5, t6, t7, t8;
    if (align % 4 == 0)
    {
        t1 = _mm256_load_pd(base + align * offset[0]);
        t2 = _mm256_load_pd(base + align * offset[1]);
        t3 = _mm256_load_pd(base + align * offset[2]);
        t4 = _mm256_load_pd(base + align * offset[3]);
    }
    else
    {
        t1 = _mm256_loadu_pd(base + align * offset[0]);
        t2 = _mm256_loadu_pd(base + align * offset[1]);
        t3 = _mm256_loadu_pd(base + align * offset[2]);
        t4 = _mm256_loadu_pd(base + align * offset[3]);
    }
    t5                = _mm256_unpacklo_pd(t1, t2);
    t6                = _mm256_unpackhi_pd(t1, t2);
    t7                = _mm256_unpacklo_pd(t3, t4);
    t8                = _mm256_unpackhi_pd(t3, t4);
    v0->simdInternal_ = _mm256_permute2f128_pd(t5, t7, 0x20);
    v1->simdInternal_ = _mm256_permute2f128_pd(t6, t8, 0x20);
    v2->simdInternal_ = _mm256_permute2f128_pd(t5, t7, 0x31);
}
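
/* Usage sketch (illustration only; the buffer and index names are hypothetical):
 * load the coordinates of four particles stored as plain triplets of 3 doubles
 * per particle, with no alignment guarantee, and transpose them into x/y/z
 * SIMD variables.
 *
 *     SimdDouble x, y, z;
 *     gatherLoadUTranspose<3>(xyz, idx, &x, &y, &z);
 *     // x[i] = xyz[3*idx[i] + 0], y[i] = xyz[3*idx[i] + 1], z[i] = xyz[3*idx[i] + 2]
 */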
template <int align>
static inline void gmx_simdcall
transposeScatterStoreU(double *              base,
                       const std::int32_t    offset[],
                       SimdDouble            v0,
                       SimdDouble            v1,
                       SimdDouble            v2)
{
    __m256d t0, t1, t2;

    assert(std::size_t(offset) % 16 == 0);

    t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
    t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
    t2 = _mm256_unpackhi_pd(v2.simdInternal_, v2.simdInternal_); // z1 z1 | z3 z3

    _mm_storeu_pd(base + align * offset[0], _mm256_castpd256_pd128(t0));
    _mm_storeu_pd(base + align * offset[1], _mm256_castpd256_pd128(t1));
    _mm_storeu_pd(base + align * offset[2], _mm256_extractf128_pd(t0, 0x1));
    _mm_storeu_pd(base + align * offset[3], _mm256_extractf128_pd(t1, 0x1));
    _mm_store_sd(base + align * offset[0] + 2, _mm256_castpd256_pd128(v2.simdInternal_));
    _mm_store_sd(base + align * offset[1] + 2, _mm256_castpd256_pd128(t2));
    _mm_store_sd(base + align * offset[2] + 2, _mm256_extractf128_pd(v2.simdInternal_, 0x1));
    _mm_store_sd(base + align * offset[3] + 2, _mm256_extractf128_pd(t2, 0x1));
}
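
/* Usage sketch (illustration only; names are hypothetical): write x/y/z for
 * four particles back to a plain 3-doubles-per-particle array, overwriting
 * what is already stored there.
 *
 *     transposeScatterStoreU<3>(xyz, idx, x, y, z);
 *     // xyz[3*idx[i] + 0..2] = { x[i], y[i], z[i] } for i = 0..3
 */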
template <int align>
static inline void gmx_simdcall
transposeScatterIncrU(double *              base,
                      const std::int32_t    offset[],
                      SimdDouble            v0,
                      SimdDouble            v1,
                      SimdDouble            v2)
{
    __m256d t0, t1;
    __m128d t2, tA, tB;

    assert(std::size_t(offset) % 16 == 0);

    if (align % 4 == 0)
    {
        // we can use aligned load/store
        t0 = _mm256_setzero_pd();
        avx256Transpose4By4(&v0.simdInternal_, &v1.simdInternal_, &v2.simdInternal_, &t0);
        _mm256_store_pd(base + align * offset[0], _mm256_add_pd(_mm256_load_pd(base + align * offset[0]), v0.simdInternal_));
        _mm256_store_pd(base + align * offset[1], _mm256_add_pd(_mm256_load_pd(base + align * offset[1]), v1.simdInternal_));
        _mm256_store_pd(base + align * offset[2], _mm256_add_pd(_mm256_load_pd(base + align * offset[2]), v2.simdInternal_));
        _mm256_store_pd(base + align * offset[3], _mm256_add_pd(_mm256_load_pd(base + align * offset[3]), t0));
    }
    else
    {
        t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
        t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
        t2 = _mm256_extractf128_pd(v2.simdInternal_, 0x1);           // z2 z3

        tA = _mm_loadu_pd(base + align * offset[0]);
        tB = _mm_load_sd(base + align * offset[0] + 2);
        tA = _mm_add_pd(tA, _mm256_castpd256_pd128(t0));
        tB = _mm_add_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[0], tA);
        _mm_store_sd(base + align * offset[0] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[1]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[1] + 2);
        tA = _mm_add_pd(tA, _mm256_castpd256_pd128(t1));
        tB = _mm_add_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[1], tA);
        _mm_storeh_pd(base + align * offset[1] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[2]);
        tB = _mm_load_sd(base + align * offset[2] + 2);
        tA = _mm_add_pd(tA, _mm256_extractf128_pd(t0, 0x1));
        tB = _mm_add_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[2], tA);
        _mm_store_sd(base + align * offset[2] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[3]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[3] + 2);
        tA = _mm_add_pd(tA, _mm256_extractf128_pd(t1, 0x1));
        tB = _mm_add_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[3], tA);
        _mm_storeh_pd(base + align * offset[3] + 2, tB);
    }
}
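
/* Usage sketch (illustration only; names are hypothetical): after computing
 * the forces on four particles in SIMD form, accumulate them into a plain
 * 3-doubles-per-particle force buffer.
 *
 *     SimdDouble fx, fy, fz;   // forces on particles idx[0..3]
 *     transposeScatterIncrU<3>(force, idx, fx, fy, fz);
 *     // force[3*idx[i] + 0..2] += { fx[i], fy[i], fz[i] } for i = 0..3
 */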
template <int align>
static inline void gmx_simdcall
transposeScatterDecrU(double *              base,
                      const std::int32_t    offset[],
                      SimdDouble            v0,
                      SimdDouble            v1,
                      SimdDouble            v2)
{
    __m256d t0, t1;
    __m128d t2, tA, tB;

    assert(std::size_t(offset) % 16 == 0);

    if (align % 4 == 0)
    {
        // we can use aligned load/store
        t0 = _mm256_setzero_pd();
        avx256Transpose4By4(&v0.simdInternal_, &v1.simdInternal_, &v2.simdInternal_, &t0);
        _mm256_store_pd(base + align * offset[0], _mm256_sub_pd(_mm256_load_pd(base + align * offset[0]), v0.simdInternal_));
        _mm256_store_pd(base + align * offset[1], _mm256_sub_pd(_mm256_load_pd(base + align * offset[1]), v1.simdInternal_));
        _mm256_store_pd(base + align * offset[2], _mm256_sub_pd(_mm256_load_pd(base + align * offset[2]), v2.simdInternal_));
        _mm256_store_pd(base + align * offset[3], _mm256_sub_pd(_mm256_load_pd(base + align * offset[3]), t0));
    }
    else
    {
        t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
        t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
        t2 = _mm256_extractf128_pd(v2.simdInternal_, 0x1);           // z2 z3

        tA = _mm_loadu_pd(base + align * offset[0]);
        tB = _mm_load_sd(base + align * offset[0] + 2);
        tA = _mm_sub_pd(tA, _mm256_castpd256_pd128(t0));
        tB = _mm_sub_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[0], tA);
        _mm_store_sd(base + align * offset[0] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[1]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[1] + 2);
        tA = _mm_sub_pd(tA, _mm256_castpd256_pd128(t1));
        tB = _mm_sub_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[1], tA);
        _mm_storeh_pd(base + align * offset[1] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[2]);
        tB = _mm_load_sd(base + align * offset[2] + 2);
        tA = _mm_sub_pd(tA, _mm256_extractf128_pd(t0, 0x1));
        tB = _mm_sub_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[2], tA);
        _mm_store_sd(base + align * offset[2] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[3]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[3] + 2);
        tA = _mm_sub_pd(tA, _mm256_extractf128_pd(t1, 0x1));
        tB = _mm_sub_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[3], tA);
        _mm_storeh_pd(base + align * offset[3] + 2, tB);
    }
}
static inline void gmx_simdcall
expandScalarsToTriplets(SimdDouble    scalar,
                        SimdDouble *  triplets0,
                        SimdDouble *  triplets1,
                        SimdDouble *  triplets2)
{
    __m256d t0 = _mm256_permute2f128_pd(scalar.simdInternal_, scalar.simdInternal_, 0x21);
    __m256d t1 = _mm256_permute_pd(scalar.simdInternal_, 0b0000);
    __m256d t2 = _mm256_permute_pd(scalar.simdInternal_, 0b1111);
    triplets0->simdInternal_ = _mm256_blend_pd(t1, t0, 0b1100);
    triplets1->simdInternal_ = _mm256_blend_pd(t2, t1, 0b1100);
    triplets2->simdInternal_ = _mm256_blend_pd(t0, t2, 0b1100);
}
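
/* Data-flow sketch (illustration only, not part of the original source): with
 *     scalar    = { s0, s1, s2, s3 }
 * the permute/blend sequence above produces
 *     triplets0 = { s0, s0, s0, s1 }
 *     triplets1 = { s1, s1, s2, s2 }
 *     triplets2 = { s2, s3, s3, s3 }
 * i.e. each scalar is repeated three times, so that a per-interaction factor
 * can be multiplied directly onto packed x/y/z triplets.
 */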
template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double *  base,
                             SimdDInt32      offset,
                             SimdDouble *    v0,
                             SimdDouble *    v1,
                             SimdDouble *    v2,
                             SimdDouble *    v3)
{
    assert(std::size_t(base) % 32 == 0);
    assert(align % 4 == 0);

    GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH) ioffset[GMX_SIMD_DINT32_WIDTH];
    _mm_store_si128( reinterpret_cast<__m128i *>(ioffset), offset.simdInternal_);

    v0->simdInternal_ = _mm256_load_pd(base + align * ioffset[0]);
    v1->simdInternal_ = _mm256_load_pd(base + align * ioffset[1]);
    v2->simdInternal_ = _mm256_load_pd(base + align * ioffset[2]);
    v3->simdInternal_ = _mm256_load_pd(base + align * ioffset[3]);
    avx256Transpose4By4(&v0->simdInternal_, &v1->simdInternal_, &v2->simdInternal_, &v3->simdInternal_);
}
template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double *  base,
                             SimdDInt32      offset,
                             SimdDouble *    v0,
                             SimdDouble *    v1)
{
    __m128d t1, t2, t3, t4;
    __m256d tA, tB;

    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH) ioffset[GMX_SIMD_DINT32_WIDTH];
    _mm_store_si128( reinterpret_cast<__m128i *>(ioffset), offset.simdInternal_);

    t1 = _mm_load_pd(base + align * ioffset[0]);
    t2 = _mm_load_pd(base + align * ioffset[1]);
    t3 = _mm_load_pd(base + align * ioffset[2]);
    t4 = _mm_load_pd(base + align * ioffset[3]);

    tA = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
    tB = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);
    v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
}
template <int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double *  base,
                              SimdDInt32      offset,
                              SimdDouble *    v0,
                              SimdDouble *    v1)
{
    __m128d t1, t2, t3, t4;
    __m256d tA, tB;

    GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH) ioffset[GMX_SIMD_DINT32_WIDTH];
    _mm_store_si128( reinterpret_cast<__m128i *>(ioffset), offset.simdInternal_);

    t1 = _mm_loadu_pd(base + align * ioffset[0]);
    t2 = _mm_loadu_pd(base + align * ioffset[1]);
    t3 = _mm_loadu_pd(base + align * ioffset[2]);
    t4 = _mm_loadu_pd(base + align * ioffset[3]);

    tA = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
    tB = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);

    v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
}
static inline double gmx_simdcall
reduceIncr4ReturnSum(double *    m,
                     SimdDouble  v0,
                     SimdDouble  v1,
                     SimdDouble  v2,
                     SimdDouble  v3)
{
    __m256d t0, t1, t2;
    __m128d a0, a1;

    assert(std::size_t(m) % 32 == 0);

    t0 = _mm256_hadd_pd(v0.simdInternal_, v1.simdInternal_);
    t1 = _mm256_hadd_pd(v2.simdInternal_, v3.simdInternal_);
    t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
    t0 = _mm256_add_pd(t0, t2);
    t1 = _mm256_add_pd(t1, t2);
    t0 = _mm256_blend_pd(t0, t1, 0b1100);

    t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
    _mm256_store_pd(m, t1);

    t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
    a0 = _mm256_castpd256_pd128(t0);
    a1 = _mm256_extractf128_pd(t0, 0x1);
    a0 = _mm_add_sd(a0, a1);

    return *reinterpret_cast<double *>(&a0);
}
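
/* Usage sketch (illustration only; names are hypothetical): v0..v3 are four
 * independent accumulators, e.g. shift-force contributions. The call adds the
 * per-vector element sums to m[0..3] and returns the grand total:
 *
 *     // buf points to 32-byte-aligned storage for four doubles
 *     double total = reduceIncr4ReturnSum(buf, acc0, acc1, acc2, acc3);
 *     // buf[i] += sum of the four elements of acc_i; total is the sum of all increments
 */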
}      // namespace gmx

#endif // GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H