Add cool quote
[alexxy/gromacs.git] / src / gromacs / simd / impl_x86_mic / impl_x86_mic_util_float.h
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36 #ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
37 #define GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
38
39 #include "config.h"
40
41 #include <cassert>
42 #include <cstdint>
43
44 #include <immintrin.h>
45
46 #include "gromacs/utility/basedefinitions.h"
47
48 #include "impl_x86_mic_simd_float.h"
49
50 namespace gmx
51 {
52
53 // On MIC it is better to use scatter operations, so we define the load routines
54 // that use a SIMD offset variable first.
55
56 template <int align>
57 static inline void gmx_simdcall
58 gatherLoadBySimdIntTranspose(const float *  base,
59                              SimdFInt32     simdoffset,
60                              SimdFloat *    v0,
61                              SimdFloat *    v1,
62                              SimdFloat *    v2,
63                              SimdFloat *    v3)
64 {
65     assert(std::size_t(base) % 16 == 0);
66     assert(align % 4 == 0);
67
68     // All instructions might be latency ~4 on MIC, so we use shifts where we
69     // only need a single instruction (since the shift parameter is an immediate),
70     // but multiplication otherwise.
71     if (align == 4)
72     {
73         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
74     }
75     else if (align == 8)
76     {
77         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
78     }
79     else
80     {
81         simdoffset = simdoffset * SimdFInt32(align);
82     }
83
84     v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
85     v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
86     v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+2, sizeof(float));
87     v3->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+3, sizeof(float));
88 }
89
90 template <int align>
91 static inline void gmx_simdcall
92 gatherLoadUBySimdIntTranspose(const float *  base,
93                               SimdFInt32     simdoffset,
94                               SimdFloat *    v0,
95                               SimdFloat *    v1)
96 {
97     // All instructions might be latency ~4 on MIC, so we use shifts where we
98     // only need a single instruction (since the shift parameter is an immediate),
99     // but multiplication otherwise.
100     // For align == 2 we can merge the constant into the scale parameter,
101     // which can take constants up to 8 in total.
102     if (align == 2)
103     {
104         v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   align * sizeof(float));
105         v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, align * sizeof(float));
106     }
107     else
108     {
109         if (align == 4)
110         {
111             simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
112         }
113         else if (align == 8)
114         {
115             simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
116         }
117         else
118         {
119             simdoffset = simdoffset * SimdFInt32(align);
120         }
121         v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
122         v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
123     }
124 }
125
126 template <int align>
127 static inline void gmx_simdcall
128 gatherLoadBySimdIntTranspose(const float *   base,
129                              SimdFInt32      simdoffset,
130                              SimdFloat *     v0,
131                              SimdFloat *     v1)
132 {
133     assert(std::size_t(base) % 8 == 0);
134     assert(align % 2 == 0);
135     gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
136 }
137
138 template <int align>
139 static inline void gmx_simdcall
140 gatherLoadTranspose(const float *        base,
141                     const std::int32_t   offset[],
142                     SimdFloat *          v0,
143                     SimdFloat *          v1,
144                     SimdFloat *          v2,
145                     SimdFloat *          v3)
146 {
147     gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1, v2, v3);
148 }
149
150 template <int align>
151 static inline void gmx_simdcall
152 gatherLoadTranspose(const float *        base,
153                     const std::int32_t   offset[],
154                     SimdFloat *          v0,
155                     SimdFloat *          v1)
156 {
157     gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1);
158 }
159
160 static const int c_simdBestPairAlignmentFloat = 2;
161
162 template <int align>
163 static inline void gmx_simdcall
164 gatherLoadUTranspose(const float *        base,
165                      const std::int32_t   offset[],
166                      SimdFloat *          v0,
167                      SimdFloat *          v1,
168                      SimdFloat *          v2)
169 {
170     SimdFInt32 simdoffset;
171
172     assert(std::size_t(offset) % 64 == 0);
173
174     simdoffset = simdLoad(offset, SimdFInt32Tag());
175
176     // All instructions might be latency ~4 on MIC, so we use shifts where we
177     // only need a single instruction (since the shift parameter is an immediate),
178     // but multiplication otherwise.
179     if (align == 4)
180     {
181         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
182     }
183     else if (align == 8)
184     {
185         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
186     }
187     else
188     {
189         simdoffset = simdoffset * SimdFInt32(align);
190     }
191
192     v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
193     v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
194     v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+2, sizeof(float));
195 }
196
197
198 template <int align>
199 static inline void gmx_simdcall
200 transposeScatterStoreU(float *              base,
201                        const std::int32_t   offset[],
202                        SimdFloat            v0,
203                        SimdFloat            v1,
204                        SimdFloat            v2)
205 {
206     SimdFInt32 simdoffset;
207
208     assert(std::size_t(offset) % 64 == 0);
209
210     simdoffset = simdLoad(offset, SimdFInt32Tag());
211
212     // All instructions might be latency ~4 on MIC, so we use shifts where we
213     // only need a single instruction (since the shift parameter is an immediate),
214     // but multiplication otherwise.
215     if (align == 4)
216     {
217         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
218     }
219     else if (align == 8)
220     {
221         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
222     }
223     else
224     {
225         simdoffset = simdoffset * SimdFInt32(align);
226     }
227
228     _mm512_i32scatter_ps(base,   simdoffset.simdInternal_, v0.simdInternal_, sizeof(float));
229     _mm512_i32scatter_ps(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(float));
230     _mm512_i32scatter_ps(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(float));
231 }
232
233
234 template <int align>
235 static inline void gmx_simdcall
236 transposeScatterIncrU(float *              base,
237                       const std::int32_t   offset[],
238                       SimdFloat            v0,
239                       SimdFloat            v1,
240                       SimdFloat            v2)
241 {
242     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata0[GMX_SIMD_FLOAT_WIDTH];
243     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata1[GMX_SIMD_FLOAT_WIDTH];
244     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata2[GMX_SIMD_FLOAT_WIDTH];
245
246     store(rdata0, v0);
247     store(rdata1, v1);
248     store(rdata2, v2);
249
250     for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
251     {
252         base[ align * offset[i] + 0] += rdata0[i];
253         base[ align * offset[i] + 1] += rdata1[i];
254         base[ align * offset[i] + 2] += rdata2[i];
255     }
256 }
257
258 template <int align>
259 static inline void gmx_simdcall
260 transposeScatterDecrU(float *              base,
261                       const std::int32_t   offset[],
262                       SimdFloat            v0,
263                       SimdFloat            v1,
264                       SimdFloat            v2)
265 {
266     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata0[GMX_SIMD_FLOAT_WIDTH];
267     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata1[GMX_SIMD_FLOAT_WIDTH];
268     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata2[GMX_SIMD_FLOAT_WIDTH];
269
270     store(rdata0, v0);
271     store(rdata1, v1);
272     store(rdata2, v2);
273
274     for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
275     {
276         base[ align * offset[i] + 0] -= rdata0[i];
277         base[ align * offset[i] + 1] -= rdata1[i];
278         base[ align * offset[i] + 2] -= rdata2[i];
279     }
280 }
281
282 static inline void gmx_simdcall
283 expandScalarsToTriplets(SimdFloat    scalar,
284                         SimdFloat *  triplets0,
285                         SimdFloat *  triplets1,
286                         SimdFloat *  triplets2)
287 {
288     triplets0->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0),
289                                                                            _mm512_castps_si512(scalar.simdInternal_)));
290     triplets1->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5),
291                                                                            _mm512_castps_si512(scalar.simdInternal_)));
292     triplets2->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10),
293                                                                            _mm512_castps_si512(scalar.simdInternal_)));
294 }
295
296
297 static inline float gmx_simdcall
298 reduceIncr4ReturnSum(float *    m,
299                      SimdFloat  v0,
300                      SimdFloat  v1,
301                      SimdFloat  v2,
302                      SimdFloat  v3)
303 {
304     float  f;
305     __m512 t0, t1, t2, t3;
306
307     assert(std::size_t(m) % 16 == 0);
308
309     t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
310     t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v2.simdInternal_, _mm512_swizzle_ps(v2.simdInternal_, _MM_SWIZ_REG_BADC));
311     t1 = _mm512_add_ps(v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
312     t1 = _mm512_mask_add_ps(t1, _mm512_int2mask(0xCCCC), v3.simdInternal_, _mm512_swizzle_ps(v3.simdInternal_, _MM_SWIZ_REG_BADC));
313     t2 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
314     t2 = _mm512_mask_add_ps(t2, _mm512_int2mask(0xAAAA), t1, _mm512_swizzle_ps(t1, _MM_SWIZ_REG_CDAB));
315
316     t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_BADC));
317     t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_CDAB));
318
319     t0 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
320     t0 = _mm512_add_ps(t0, t2);
321     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t0);
322
323     t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_BADC));
324     t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_CDAB));
325
326     _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t2);
327     return f;
328 }
329
330 static inline SimdFloat gmx_simdcall
331 loadDualHsimd(const float * m0,
332               const float * m1)
333 {
334     assert(std::size_t(m0) % 32 == 0);
335     assert(std::size_t(m1) % 32 == 0);
336
337     return _mm512_castpd_ps(_mm512_mask_extload_pd(_mm512_extload_pd(reinterpret_cast<const double *>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE),
338                                                    _mm512_int2mask(0xF0), reinterpret_cast<const double *>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
339 }
340
341 static inline SimdFloat gmx_simdcall
342 loadDuplicateHsimd(const float * m)
343 {
344     assert(std::size_t(m) % 32 == 0);
345
346     return _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
347 }
348
349 static inline SimdFloat gmx_simdcall
350 loadU1DualHsimd(const float * m)
351 {
352     return _mm512_mask_extload_ps(_mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE), _mm512_int2mask(0xFF00),
353                                   m+1, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
354 }
355
356
357 static inline void gmx_simdcall
358 storeDualHsimd(float *     m0,
359                float *     m1,
360                SimdFloat   a)
361 {
362     __m512 t0;
363
364     assert(std::size_t(m0) % 32 == 0);
365     assert(std::size_t(m1) % 32 == 0);
366
367     _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), a.simdInternal_);
368     _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), a.simdInternal_);
369 }
370
371 static inline void gmx_simdcall
372 incrDualHsimd(float *     m0,
373               float *     m1,
374               SimdFloat   a)
375 {
376     assert(std::size_t(m0) % 32 == 0);
377     assert(std::size_t(m1) % 32 == 0);
378
379     __m512 x;
380
381     // Update lower half
382     x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
383     x = _mm512_add_ps(x, a.simdInternal_);
384     _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), x);
385
386     // Update upper half
387     x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
388     x = _mm512_add_ps(x, a.simdInternal_);
389     _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), x);
390 }
391
392 static inline void gmx_simdcall
393 decrHsimd(float *    m,
394           SimdFloat  a)
395 {
396     __m512 t;
397
398     assert(std::size_t(m) % 32 == 0);
399
400     t = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
401     a = _mm512_add_ps(a.simdInternal_, _mm512_permute4f128_ps(a.simdInternal_, _MM_PERM_BADC));
402     t = _mm512_sub_ps(t, a.simdInternal_);
403     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0x00FF), t);
404 }
405
406
407 template <int align>
408 static inline void gmx_simdcall
409 gatherLoadTransposeHsimd(const float *        base0,
410                          const float *        base1,
411                          const std::int32_t   offset[],
412                          SimdFloat *          v0,
413                          SimdFloat *          v1)
414 {
415     __m512i idx0, idx1, idx;
416     __m512  tmp1, tmp2;
417
418     assert(std::size_t(offset) % 32 == 0);
419     assert(std::size_t(base0) % 8 == 0);
420     assert(std::size_t(base1) % 8 == 0);
421     assert(std::size_t(align) % 2 == 0);
422
423     idx0 = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), offset);
424
425     idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
426     idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));
427
428     idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0xFF00), idx1, _MM_PERM_BABA);
429
430     tmp1 = _mm512_i32gather_ps(idx, base0, sizeof(float));
431     tmp2 = _mm512_i32gather_ps(idx, base1, sizeof(float));
432
433     v0->simdInternal_ = _mm512_mask_permute4f128_ps(tmp1, _mm512_int2mask(0xFF00), tmp2, _MM_PERM_BABA);
434     v1->simdInternal_ = _mm512_mask_permute4f128_ps(tmp2, _mm512_int2mask(0x00FF), tmp1, _MM_PERM_DCDC);
435 }
436
437 static inline float gmx_simdcall
438 reduceIncr4ReturnSumHsimd(float *     m,
439                           SimdFloat   v0,
440                           SimdFloat   v1)
441 {
442     float  f;
443     __m512 t0, t1;
444
445     assert(std::size_t(m) % 32 == 0);
446
447     t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
448     t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
449     t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
450     t0 = _mm512_add_ps(t0, _mm512_castpd_ps(_mm512_swizzle_pd(_mm512_castps_pd(t0), _MM_SWIZ_REG_BADC)));
451     t0 = _mm512_mask_permute4f128_ps(t0, _mm512_int2mask(0xAAAA), t0, _MM_PERM_BADC);
452     t1 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
453     t1 = _mm512_add_ps(t1, t0);
454     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t1);
455
456     t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_BADC));
457     t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
458
459     _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t0);
460     return f;
461 }
462
463 }      // namespace gmx
464
465 #endif // GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H