/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstdint>

#include <immintrin.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_x86_mic_simd_double.h"

namespace gmx
{

// On MIC it is better to use gather/scatter operations, so we define the load
// routines that take a SIMD offset variable first.

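// Loads 4 consecutive doubles from each of the 8 addresses
// base + align*offset[i] and transposes them, so v0 receives element 0 from
// every address, v1 element 1, and so on. A rough scalar equivalent
// (illustrative sketch only):
//
//     for (int i = 0; i < 8; i++)
//     {
//         v0[i] = base[align * offset[i] + 0];
//         v1[i] = base[align * offset[i] + 1];
//         v2[i] = base[align * offset[i] + 2];
//         v3[i] = base[align * offset[i] + 3];
//     }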
template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double *  base,
                             SimdDInt32      simdoffset,
                             SimdDouble *    v0,
                             SimdDouble *    v1,
                             SimdDouble *    v2,
                             SimdDouble *    v3)
{
    assert(std::size_t(base) % 32 == 0);
    assert(align % 4 == 0);

    // Instructions on MIC can have a latency of ~4 cycles, so we use a shift
    // where a single instruction suffices (the shift count is an immediate),
    // and a multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+2, sizeof(double));
    v3->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+3, sizeof(double));
}

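// Pair variant of the transposing gather above. The U suffix means the base
// pointer only needs natural element alignment, not full SIMD alignment.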
template <int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double *  base,
                              SimdDInt32      simdoffset,
                              SimdDouble *    v0,
                              SimdDouble *    v1)
{
    // Instructions on MIC can have a latency of ~4 cycles, so we use a shift
    // where a single instruction suffices (the shift count is an immediate),
    // and a multiplication otherwise.
    if (align == 2)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 1);
    }
    else if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
}

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double *    base,
                             SimdDInt32        simdoffset,
                             SimdDouble *      v0,
                             SimdDouble *      v1)
{
    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);
    gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
}

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double *        base,
                    const std::int32_t    offset[],
                    SimdDouble *          v0,
                    SimdDouble *          v1,
                    SimdDouble *          v2,
                    SimdDouble *          v3)
{
    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1, v2, v3);
}

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double *        base,
                    const std::int32_t    offset[],
                    SimdDouble *          v0,
                    SimdDouble *          v1)
{
    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1);
}

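// For pair data, packing the two values contiguously (align = 2) is the
// preferred layout on this architecture.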
static const int c_simdBestPairAlignmentDouble = 2;

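// Gathers triplets (e.g. x/y/z coordinates) starting at base + align*offset[i]
// and transposes them into v0, v1, v2. Only the offset array needs to be
// SIMD-aligned here, not the base pointer.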
template <int align>
static inline void gmx_simdcall
gatherLoadUTranspose(const double *        base,
                     const std::int32_t    offset[],
                     SimdDouble *          v0,
                     SimdDouble *          v1,
                     SimdDouble *          v2)
{
    SimdDInt32 simdoffset;

    assert(std::size_t(offset) % 32 == 0);

    simdoffset = simdLoad(offset, SimdDInt32Tag());

    // Instructions on MIC can have a latency of ~4 cycles, so we use a shift
    // where a single instruction suffices (the shift count is an immediate),
    // and a multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+2, sizeof(double));
}

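// Inverse of gatherLoadUTranspose: transposes v0, v1, v2 back into triplets
// and scatters them to base + align*offset[i].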
template <int align>
static inline void gmx_simdcall
transposeScatterStoreU(double *            base,
                       const std::int32_t  offset[],
                       SimdDouble          v0,
                       SimdDouble          v1,
                       SimdDouble          v2)
{
    SimdDInt32 simdoffset;

    assert(std::size_t(offset) % 32 == 0);

    simdoffset = simdLoad(offset, SimdDInt32Tag());

    // Instructions on MIC can have a latency of ~4 cycles, so we use a shift
    // where a single instruction suffices (the shift count is an immediate),
    // and a multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    _mm512_i32loscatter_pd(base,   simdoffset.simdInternal_, v0.simdInternal_, sizeof(double));
    _mm512_i32loscatter_pd(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(double));
    _mm512_i32loscatter_pd(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(double));
}

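// transposeScatterIncrU/transposeScatterDecrU accumulate into memory rather
// than overwrite it. A plain scalar read-modify-write loop over a stored copy
// is used here; on MIC a gather + add + scatter sequence offers little benefit
// for this access pattern.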
template <int align>
static inline void gmx_simdcall
transposeScatterIncrU(double *            base,
                      const std::int32_t  offset[],
                      SimdDouble          v0,
                      SimdDouble          v1,
                      SimdDouble          v2)
{
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata0[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata1[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata2[GMX_SIMD_DOUBLE_WIDTH];

    store(rdata0, v0);
    store(rdata1, v1);
    store(rdata2, v2);

    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        base[ align * offset[i] + 0] += rdata0[i];
        base[ align * offset[i] + 1] += rdata1[i];
        base[ align * offset[i] + 2] += rdata2[i];
    }
}

template <int align>
static inline void gmx_simdcall
transposeScatterDecrU(double *            base,
                      const std::int32_t  offset[],
                      SimdDouble          v0,
                      SimdDouble          v1,
                      SimdDouble          v2)
{
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata0[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata1[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata2[GMX_SIMD_DOUBLE_WIDTH];

    store(rdata0, v0);
    store(rdata1, v1);
    store(rdata2, v2);

    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        base[ align * offset[i] + 0] -= rdata0[i];
        base[ align * offset[i] + 1] -= rdata1[i];
        base[ align * offset[i] + 2] -= rdata2[i];
    }
}

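// Expands each scalar into three consecutive copies spread over three output
// vectors, i.e. for input {s0, s1, ..., s7}:
//   triplets0 = {s0, s0, s0, s1, s1, s1, s2, s2}
//   triplets1 = {s2, s3, s3, s3, s4, s4, s4, s5}
//   triplets2 = {s5, s5, s6, s6, s6, s7, s7, s7}
// The permute indices are 32-bit lane numbers; each double occupies a pair of
// 32-bit lanes, which is why the indices appear in pairs below.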
static inline void gmx_simdcall
expandScalarsToTriplets(SimdDouble    scalar,
                        SimdDouble *  triplets0,
                        SimdDouble *  triplets1,
                        SimdDouble *  triplets2)
{
    triplets0->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0),
                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
    triplets1->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(11, 10, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 5, 4),
                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
    triplets2->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 11, 10, 11, 10),
                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
}

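// Sums the elements of each of v0..v3, adds the four results to m[0..3], and
// returns the grand total of all four sums.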
static inline double gmx_simdcall
reduceIncr4ReturnSum(double *    m,
                     SimdDouble  v0,
                     SimdDouble  v1,
                     SimdDouble  v2,
                     SimdDouble  v3)
{
    double  d;
    __m512d t0, t1, t2, t3;

    assert(std::size_t(m) % 32 == 0);

    t0 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v0.simdInternal_, v2.simdInternal_), _MM_SWIZ_REG_BADC);
    t2 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v2.simdInternal_, v0.simdInternal_);
    t1 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v1.simdInternal_, v3.simdInternal_), _MM_SWIZ_REG_BADC);
    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v3.simdInternal_, v1.simdInternal_);
    t0 = _mm512_add_pd(t0, t2);
    t1 = _mm512_add_pd(t1, t3);

    t2 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x55), t0, t1), _MM_SWIZ_REG_CDAB);
    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0x55), t1, t0);
    t2 = _mm512_add_pd(t2, t3);

    t2 = _mm512_add_pd(t2, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(t2), _MM_PERM_BADC)));

    t0 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    t0 = _mm512_add_pd(t0, t2);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t0);

    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_BADC));
    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_CDAB));

    _mm512_mask_packstorelo_pd(&d, _mm512_int2mask(0x01), t2);
    return d;
}

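// The Hsimd ("half-SIMD") routines below treat a 512-bit register as two
// independent 4-element halves. loadDualHsimd fills the low half from m0 and
// the high half from m1; loadDuplicateHsimd broadcasts the same 4 elements
// into both halves; loadU1DualHsimd broadcasts m[0] into the low half and
// m[1] into the high half.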
static inline SimdDouble gmx_simdcall
loadDualHsimd(const double * m0,
              const double * m1)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    return _mm512_mask_extload_pd(_mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE), _mm512_int2mask(0xF0),
                                  m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
}

static inline SimdDouble gmx_simdcall
loadDuplicateHsimd(const double * m)
{
    assert(std::size_t(m) % 32 == 0);

    return _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
}

static inline SimdDouble gmx_simdcall
loadU1DualHsimd(const double * m)
{
    return _mm512_mask_extload_pd(_mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE), _mm512_int2mask(0xF0),
                                  m+1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
}

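// storeDualHsimd writes the low half of a to m0 and the high half to m1.
// incrDualHsimd instead adds the two halves to memory, and decrHsimd sums
// both halves of a and subtracts the 4-element result from m.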
static inline void gmx_simdcall
storeDualHsimd(double *     m0,
               double *     m1,
               SimdDouble   a)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), a.simdInternal_);
    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), a.simdInternal_);
}

static inline void gmx_simdcall
incrDualHsimd(double *     m0,
              double *     m1,
              SimdDouble   a)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    __m512d x;

    // Update lower half
    x = _mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    x = _mm512_add_pd(x, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), x);

    // Update upper half
    x = _mm512_extload_pd(m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    x = _mm512_add_pd(x, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), x);
}

static inline void gmx_simdcall
decrHsimd(double *    m,
          SimdDouble  a)
{
    __m512d t;

    assert(std::size_t(m) % 32 == 0);

    t               = _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    a.simdInternal_ = _mm512_add_pd(a.simdInternal_, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(a.simdInternal_), _MM_PERM_BADC)));
    t               = _mm512_sub_pd(t, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0x0F), t);
}

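// Loads pairs from two separate base pointers, sharing one offset array:
// v0 receives element 0 from base0 (low half) and from base1 (high half),
// and v1 the corresponding element-1 values.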
template <int align>
static inline void gmx_simdcall
gatherLoadTransposeHsimd(const double *       base0,
                         const double *       base1,
                         const std::int32_t   offset[],
                         SimdDouble *         v0,
                         SimdDouble *         v1)
{
    __m512i  idx0, idx1, idx;
    __m512d  tmp1, tmp2;

    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base0) % 16 == 0);
    assert(std::size_t(base1) % 16 == 0);
    assert(align % 2 == 0);

    idx0 = _mm512_extload_epi32(offset, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);

    idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
    idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));

    idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0x00F0), idx1, _MM_PERM_AAAA);

    tmp1 = _mm512_i32logather_pd(idx, base0, sizeof(double));
    tmp2 = _mm512_i32logather_pd(idx, base1, sizeof(double));

    v0->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(tmp1), _mm512_int2mask(0xFF00), _mm512_castpd_ps(tmp2), _MM_PERM_BABA));
    v1->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(tmp2), _mm512_int2mask(0x00FF), _mm512_castpd_ps(tmp1), _MM_PERM_DCDC));
}

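// Sums each 4-element half of v0 and v1, adds the four half-sums to m[0..3]
// (low/high half of v0 to m[0]/m[1], low/high half of v1 to m[2]/m[3]), and
// returns the total.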
static inline double gmx_simdcall
reduceIncr4ReturnSumHsimd(double *     m,
                          SimdDouble   v0,
                          SimdDouble   v1)
{
    double   d;
    __m512d  t0, t1;

    assert(std::size_t(m) % 32 == 0);

    t0 = _mm512_add_pd(v0.simdInternal_, _mm512_swizzle_pd(v0.simdInternal_, _MM_SWIZ_REG_BADC));
    t0 = _mm512_mask_add_pd(t0, _mm512_int2mask(0xCC), v1.simdInternal_, _mm512_swizzle_pd(v1.simdInternal_, _MM_SWIZ_REG_BADC));
    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
    t0 = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(t0), _mm512_int2mask(0xCCCC),
                                                      _mm512_castpd_ps(t0), _MM_PERM_DCDC));

    t1 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    t1 = _mm512_add_pd(t1, t0);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t1);

    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_BADC));
    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));

    _mm512_mask_packstorelo_pd(&d, _mm512_int2mask(0x01), t0);
    return d;
}

}      // namespace gmx

#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H