/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstdint>

#include <immintrin.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_x86_mic_simd_double.h"

namespace gmx
{
// On MIC it is better to use scatter operations, so we define the load routines
// that use a SIMD offset variable first.
template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double *  base,
                             SimdDInt32      simdoffset,
                             SimdDouble *    v0,
                             SimdDouble *    v1,
                             SimdDouble *    v2,
                             SimdDouble *    v3)
{
    assert(std::size_t(base) % 32 == 0);
    assert(align % 4 == 0);

    // All instructions might be latency ~4 on MIC, so we use shifts where we
    // only need a single instruction (since the shift parameter is an immediate),
    // but multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+2, sizeof(double));
    v3->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+3, sizeof(double));
}
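
// A minimal usage sketch (illustrative only; the function and array names
// below are invented here, not GROMACS API): gather x/y/z/q for one
// SIMD-width batch of particles from packed, 32-byte-aligned xyzq storage
// with 4 doubles per particle.
static inline void gmx_simdcall
exampleGatherXyzq(const double *  xyzq,
                  SimdDInt32      particleIndex,
                  SimdDouble *    x,
                  SimdDouble *    y,
                  SimdDouble *    z,
                  SimdDouble *    q)
{
    // align == 4 takes the single-shift fast path above.
    gatherLoadBySimdIntTranspose<4>(xyzq, particleIndex, x, y, z, q);
}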
template <int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double *  base,
                              SimdDInt32      simdoffset,
                              SimdDouble *    v0,
                              SimdDouble *    v1)
{
    // All instructions might be latency ~4 on MIC, so we use shifts where we
    // only need a single instruction (since the shift parameter is an immediate),
    // but multiplication otherwise.
    if (align == 2)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 1);
    }
    else if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
}
template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double *  base,
                             SimdDInt32      simdoffset,
                             SimdDouble *    v0,
                             SimdDouble *    v1)
{
    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);
    gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
}
template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double *      base,
                    const std::int32_t  offset[],
                    SimdDouble *        v0,
                    SimdDouble *        v1,
                    SimdDouble *        v2,
                    SimdDouble *        v3)
{
    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1, v2, v3);
}
template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double *      base,
                    const std::int32_t  offset[],
                    SimdDouble *        v0,
                    SimdDouble *        v1)
{
    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1);
}

static const int c_simdBestPairAlignmentDouble = 2;
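
// Sketch of the intended use of c_simdBestPairAlignmentDouble (the names
// here are hypothetical): callers pad pair data to this stride so the
// two-output transposed load above stays on its cheapest addressing path.
static inline void gmx_simdcall
exampleLoadParamPairs(const double *      pairParams,
                      const std::int32_t  offset[],
                      SimdDouble *        c6,
                      SimdDouble *        c12)
{
    gatherLoadTranspose<c_simdBestPairAlignmentDouble>(pairParams, offset, c6, c12);
}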
template <int align>
static inline void gmx_simdcall
gatherLoadUTranspose(const double *      base,
                     const std::int32_t  offset[],
                     SimdDouble *        v0,
                     SimdDouble *        v1,
                     SimdDouble *        v2)
{
    SimdDInt32 simdoffset;

    assert(std::size_t(offset) % 32 == 0);

    simdoffset = simdLoad(offset, SimdDInt32Tag());

    // All instructions might be latency ~4 on MIC, so we use shifts where we
    // only need a single instruction (since the shift parameter is an immediate),
    // but multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+2, sizeof(double));
}
template <int align>
static inline void gmx_simdcall
transposeScatterStoreU(double *            base,
                       const std::int32_t  offset[],
                       SimdDouble          v0,
                       SimdDouble          v1,
                       SimdDouble          v2)
{
    SimdDInt32 simdoffset;

    assert(std::size_t(offset) % 32 == 0);

    simdoffset = simdLoad(offset, SimdDInt32Tag());

    // All instructions might be latency ~4 on MIC, so we use shifts where we
    // only need a single instruction (since the shift parameter is an immediate),
    // but multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    _mm512_i32loscatter_pd(base,   simdoffset.simdInternal_, v0.simdInternal_, sizeof(double));
    _mm512_i32loscatter_pd(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(double));
    _mm512_i32loscatter_pd(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(double));
}
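
// Round-trip sketch (illustrative, invented names): read x/y/z triplets,
// scale them, and write them back through the scatter store. align == 3
// exercises the generic multiply path in both routines above.
static inline void gmx_simdcall
exampleScaleCoordinates(double *            xyz,
                        const std::int32_t  offset[],
                        SimdDouble          scale)
{
    SimdDouble x, y, z;

    gatherLoadUTranspose<3>(xyz, offset, &x, &y, &z);
    transposeScatterStoreU<3>(xyz, offset, x * scale, y * scale, z * scale);
}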
template <int align>
static inline void gmx_simdcall
transposeScatterIncrU(double *            base,
                      const std::int32_t  offset[],
                      SimdDouble          v0,
                      SimdDouble          v1,
                      SimdDouble          v2)
{
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata0[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata1[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata2[GMX_SIMD_DOUBLE_WIDTH];

    store(rdata0, v0);
    store(rdata1, v1);
    store(rdata2, v2);

    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        base[align * offset[i] + 0] += rdata0[i];
        base[align * offset[i] + 1] += rdata1[i];
        base[align * offset[i] + 2] += rdata2[i];
    }
}
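
// Typical force-accumulation sketch (hypothetical names): add newly computed
// SIMD force components into a packed 3-doubles-per-atom force array.
static inline void gmx_simdcall
exampleAccumulateForces(double *            force,
                        const std::int32_t  atomIndex[],
                        SimdDouble          fx,
                        SimdDouble          fy,
                        SimdDouble          fz)
{
    transposeScatterIncrU<3>(force, atomIndex, fx, fy, fz);
}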
template <int align>
static inline void gmx_simdcall
transposeScatterDecrU(double *            base,
                      const std::int32_t  offset[],
                      SimdDouble          v0,
                      SimdDouble          v1,
                      SimdDouble          v2)
{
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata0[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata1[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata2[GMX_SIMD_DOUBLE_WIDTH];

    store(rdata0, v0);
    store(rdata1, v1);
    store(rdata2, v2);

    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        base[align * offset[i] + 0] -= rdata0[i];
        base[align * offset[i] + 1] -= rdata1[i];
        base[align * offset[i] + 2] -= rdata2[i];
    }
}
static inline void gmx_simdcall
expandScalarsToTriplets(SimdDouble    scalar,
                        SimdDouble *  triplets0,
                        SimdDouble *  triplets1,
                        SimdDouble *  triplets2)
{
    triplets0->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0),
                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
    triplets1->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(11, 10, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 5, 4),
                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
    triplets2->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 11, 10, 11, 10),
                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
}
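
// Lane illustration for the 32-bit shuffles above (8 doubles per register,
// each double being a pair of 32-bit lanes): for
// scalar = {s0, s1, s2, s3, s4, s5, s6, s7} the outputs are
//
//     triplets0 = {s0, s0, s0, s1, s1, s1, s2, s2}
//     triplets1 = {s2, s3, s3, s3, s4, s4, s4, s5}
//     triplets2 = {s5, s5, s6, s6, s6, s7, s7, s7}
//
// i.e. each scalar is repeated three times in lane order, matching the
// x/y/z triplet layout used by the transposed load/store routines.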
static inline double gmx_simdcall
reduceIncr4ReturnSum(double *    m,
                     SimdDouble  v0,
                     SimdDouble  v1,
                     SimdDouble  v2,
                     SimdDouble  v3)
{
    double  d;
    __m512d t0, t1, t2, t3;

    assert(std::size_t(m) % 32 == 0);

    t0 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v0.simdInternal_, v2.simdInternal_), _MM_SWIZ_REG_BADC);
    t2 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v2.simdInternal_, v0.simdInternal_);
    t1 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v1.simdInternal_, v3.simdInternal_), _MM_SWIZ_REG_BADC);
    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v3.simdInternal_, v1.simdInternal_);
    t0 = _mm512_add_pd(t0, t2);
    t1 = _mm512_add_pd(t1, t3);

    t2 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x55), t0, t1), _MM_SWIZ_REG_CDAB);
    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0x55), t1, t0);
    t2 = _mm512_add_pd(t2, t3);

    t2 = _mm512_add_pd(t2, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(t2), _MM_PERM_BADC)));

    t0 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    t0 = _mm512_add_pd(t0, t2);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t0);

    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_BADC));
    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_CDAB));

    _mm512_mask_packstorelo_pd(&d, _mm512_int2mask(0x01), t2);
    return d;
}
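
// Usage sketch (invented names): fold four accumulators into m[0..3] and
// keep the grand total, e.g. for energy bookkeeping. Each of the four bins
// is incremented by the horizontal sum of the corresponding vector.
static inline double gmx_simdcall
exampleReduceEnergies(double *   energyBins,  // 32-byte aligned, >= 4 doubles
                      SimdDouble e0,
                      SimdDouble e1,
                      SimdDouble e2,
                      SimdDouble e3)
{
    return reduceIncr4ReturnSum(energyBins, e0, e1, e2, e3);
}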
static inline SimdDouble gmx_simdcall
loadDualHsimd(const double * m0,
              const double * m1)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    return _mm512_mask_extload_pd(_mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE), _mm512_int2mask(0xF0),
                                  m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
}
static inline SimdDouble gmx_simdcall
loadDuplicateHsimd(const double * m)
{
    assert(std::size_t(m) % 32 == 0);

    return _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
}
static inline SimdDouble gmx_simdcall
loadU1DualHsimd(const double * m)
{
    return _mm512_mask_extload_pd(_mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE), _mm512_int2mask(0xF0),
                                  m+1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
}
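
// Half-SIMD lane illustration for the three loads above (8 doubles per
// register): the low four lanes come from the first source, the high four
// from the second.
//
//     loadDualHsimd(a, b)   -> {a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]}
//     loadDuplicateHsimd(a) -> {a[0], a[1], a[2], a[3], a[0], a[1], a[2], a[3]}
//     loadU1DualHsimd(a)    -> {a[0], a[0], a[0], a[0], a[1], a[1], a[1], a[1]}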
static inline void gmx_simdcall
storeDualHsimd(double *   m0,
               double *   m1,
               SimdDouble a)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), a.simdInternal_);
    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), a.simdInternal_);
}
static inline void gmx_simdcall
incrDualHsimd(double *   m0,
              double *   m1,
              SimdDouble a)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    __m512d x;

    // Update the lower half from m0
    x = _mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    x = _mm512_add_pd(x, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), x);

    // Update the upper half from m1
    x = _mm512_extload_pd(m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    x = _mm512_add_pd(x, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), x);
}
static inline void gmx_simdcall
decrHsimd(double *   m,
          SimdDouble a)
{
    __m512d t;

    assert(std::size_t(m) % 32 == 0);

    t               = _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    a.simdInternal_ = _mm512_add_pd(a.simdInternal_, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(a.simdInternal_), _MM_PERM_BADC)));
    t               = _mm512_sub_pd(t, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0x0F), t);
}
template <int align>
static inline void gmx_simdcall
gatherLoadTransposeHsimd(const double *      base0,
                         const double *      base1,
                         const std::int32_t  offset[],
                         SimdDouble *        v0,
                         SimdDouble *        v1)
{
    __m512i idx0, idx1, idx;
    __m512d tmp1, tmp2;

    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base0) % 16 == 0);
    assert(std::size_t(base1) % 16 == 0);
    assert(align % 2 == 0);

    idx0 = _mm512_extload_epi32(offset, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);

    idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
    idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));

    idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0x00F0), idx1, _MM_PERM_AAAA);

    tmp1 = _mm512_i32logather_pd(idx, base0, sizeof(double));
    tmp2 = _mm512_i32logather_pd(idx, base1, sizeof(double));

    v0->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(tmp1), _mm512_int2mask(0xFF00), _mm512_castpd_ps(tmp2), _MM_PERM_BABA));
    v1->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(tmp2), _mm512_int2mask(0x00FF), _mm512_castpd_ps(tmp1), _MM_PERM_DCDC));
}
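
// Usage sketch (hypothetical names): gather two-double parameter pairs for
// one half-width set of offsets from two different tables at once; v0 then
// holds the first pair element from base0 (low lanes) and base1 (high
// lanes), and v1 the second pair element.
static inline void gmx_simdcall
exampleGatherPairParamsHsimd(const double *      table0,
                             const double *      table1,
                             const std::int32_t  offset[],
                             SimdDouble *        c6,
                             SimdDouble *        c12)
{
    gatherLoadTransposeHsimd<2>(table0, table1, offset, c6, c12);
}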
static inline double gmx_simdcall
reduceIncr4ReturnSumHsimd(double *   m,
                          SimdDouble v0,
                          SimdDouble v1)
{
    double  d;
    __m512d t0, t1;

    assert(std::size_t(m) % 32 == 0);

    t0 = _mm512_add_pd(v0.simdInternal_, _mm512_swizzle_pd(v0.simdInternal_, _MM_SWIZ_REG_BADC));
    t0 = _mm512_mask_add_pd(t0, _mm512_int2mask(0xCC), v1.simdInternal_, _mm512_swizzle_pd(v1.simdInternal_, _MM_SWIZ_REG_BADC));
    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
    t0 = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(t0), _mm512_int2mask(0xCCCC),
                                                      _mm512_castpd_ps(t0), _MM_PERM_DCDC));

    t1 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    t1 = _mm512_add_pd(t1, t0);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t1);

    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_BADC));
    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));

    _mm512_mask_packstorelo_pd(&d, _mm512_int2mask(0x01), t0);
    return d;
}
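
// Lane semantics of the half-SIMD reduction above (8 doubles per register):
// with v0 = {a0..a3, b0..b3} and v1 = {c0..c3, d0..d3}, it performs
//
//     m[0] += sum(a);  m[1] += sum(b);  m[2] += sum(c);  m[3] += sum(d);
//
// and returns the total of all four sums, i.e. the same contract as
// reduceIncr4ReturnSum but with the four accumulators packed two per
// register.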
}      // namespace gmx

#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H