src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
  37 #define GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
  38
  39 #include "config.h"
  40
  41 #include <cassert>
  42 #include <cstdint>
  43
  44 #include <immintrin.h>
  45
  46 #include "gromacs/utility/basedefinitions.h"
  47
  48 #include "impl_x86_mic_simd_float.h"
  49
  50 namespace gmx
  51 {
  52
  53 // On MIC it is better to use scatter operations, so we define the load routines
  54 // that use a SIMD offset variable first.
  55
  56 template <int align>
  57 static inline void gmx_simdcall
  58 gatherLoadBySimdIntTranspose(const float *  base,
  59                              SimdFInt32     simdoffset,
  60                              SimdFloat *    v0,
  61                              SimdFloat *    v1,
  62                              SimdFloat *    v2,
  63                              SimdFloat *    v3)
  64 {
  65     assert(std::size_t(base) % 16 == 0);
  66     assert(align % 4 == 0);
  67
  68     // All instructions might be latency ~4 on MIC, so we use shifts where we
  69     // only need a single instruction (since the shift parameter is an immediate),
  70     // but multiplication otherwise.
  71     if (align == 4)
  72     {
  73         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
  74     }
  75     else if (align == 8)
  76     {
  77         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
  78     }
  79     else
  80     {
  81         simdoffset = simdoffset * SimdFInt32(align);
  82     }
  83
  84     v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
  85     v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
  86     v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+2, sizeof(float));
  87     v3->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+3, sizeof(float));
  88 }
  89
  90 template <int align>
  91 static inline void gmx_simdcall
  92 gatherLoadUBySimdIntTranspose(const float *  base,
  93                               SimdFInt32     simdoffset,
  94                               SimdFloat *    v0,
  95                               SimdFloat *    v1)
  96 {
  97     // All instructions might be latency ~4 on MIC, so we use shifts where we
  98     // only need a single instruction (since the shift parameter is an immediate),
  99     // but multiplication otherwise.
 100     // For align == 2 we can merge the constant into the scale parameter,
 101     // which can take constants up to 8 in total.
 102     if (align == 2)
 103     {
 104         v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   align * sizeof(float));
 105         v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, align * sizeof(float));
 106     }
 107     else
 108     {
 109         if (align == 4)
 110         {
 111             simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
 112         }
 113         else if (align == 8)
 114         {
 115             simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
 116         }
 117         else
 118         {
 119             simdoffset = simdoffset * SimdFInt32(align);
 120         }
 121         v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
 122         v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
 123     }
 124 }
 125
 126 template <int align>
 127 static inline void gmx_simdcall
 128 gatherLoadBySimdIntTranspose(const float *   base,
 129                              SimdFInt32      simdoffset,
 130                              SimdFloat *     v0,
 131                              SimdFloat *     v1)
 132 {
 133     assert(std::size_t(base) % 8 == 0);
 134     assert(align % 2 == 0);
 135     gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
 136 }
 137
 138 template <int align>
 139 static inline void gmx_simdcall
 140 gatherLoadTranspose(const float *        base,
 141                     const std::int32_t   offset[],
 142                     SimdFloat *          v0,
 143                     SimdFloat *          v1,
 144                     SimdFloat *          v2,
 145                     SimdFloat *          v3)
 146 {
 147     gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1, v2, v3);
 148 }
 149
 150 template <int align>
 151 static inline void gmx_simdcall
 152 gatherLoadTranspose(const float *        base,
 153                     const std::int32_t   offset[],
 154                     SimdFloat *          v0,
 155                     SimdFloat *          v1)
 156 {
 157     gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1);
 158 }
 159
 160 static const int c_simdBestPairAlignmentFloat = 2;
 161
 162 template <int align>
 163 static inline void gmx_simdcall
 164 gatherLoadUTranspose(const float *        base,
 165                      const std::int32_t   offset[],
 166                      SimdFloat *          v0,
 167                      SimdFloat *          v1,
 168                      SimdFloat *          v2)
 169 {
 170     SimdFInt32 simdoffset;
 171
 172     assert(std::size_t(offset) % 64 == 0);
 173
 174     simdoffset = simdLoad(offset, SimdFInt32Tag());
 175
 176     // All instructions might be latency ~4 on MIC, so we use shifts where we
 177     // only need a single instruction (since the shift parameter is an immediate),
 178     // but multiplication otherwise.
 179     if (align == 4)
 180     {
 181         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
 182     }
 183     else if (align == 8)
 184     {
 185         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
 186     }
 187     else
 188     {
 189         simdoffset = simdoffset * SimdFInt32(align);
 190     }
 191
 192     v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
 193     v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
 194     v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+2, sizeof(float));
 195 }
 196
 197
 198 template <int align>
 199 static inline void gmx_simdcall
 200 transposeScatterStoreU(float *              base,
 201                        const std::int32_t   offset[],
 202                        SimdFloat            v0,
 203                        SimdFloat            v1,
 204                        SimdFloat            v2)
 205 {
 206     SimdFInt32 simdoffset;
 207
 208     assert(std::size_t(offset) % 64 == 0);
 209
 210     simdoffset = simdLoad(offset, SimdFInt32Tag());
 211
 212     // All instructions might be latency ~4 on MIC, so we use shifts where we
 213     // only need a single instruction (since the shift parameter is an immediate),
 214     // but multiplication otherwise.
 215     if (align == 4)
 216     {
 217         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
 218     }
 219     else if (align == 8)
 220     {
 221         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
 222     }
 223     else
 224     {
 225         simdoffset = simdoffset * SimdFInt32(align);
 226     }
 227
 228     _mm512_i32scatter_ps(base,   simdoffset.simdInternal_, v0.simdInternal_, sizeof(float));
 229     _mm512_i32scatter_ps(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(float));
 230     _mm512_i32scatter_ps(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(float));
 231 }
 232
 233
 234 template <int align>
 235 static inline void gmx_simdcall
 236 transposeScatterIncrU(float *              base,
 237                       const std::int32_t   offset[],
 238                       SimdFloat            v0,
 239                       SimdFloat            v1,
 240                       SimdFloat            v2)
 241 {
 242     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata0[GMX_SIMD_FLOAT_WIDTH];
 243     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata1[GMX_SIMD_FLOAT_WIDTH];
 244     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata2[GMX_SIMD_FLOAT_WIDTH];
 245
 246     store(rdata0, v0);
 247     store(rdata1, v1);
 248     store(rdata2, v2);
 249
 250     for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 251     {
 252         base[ align * offset[i] + 0] += rdata0[i];
 253         base[ align * offset[i] + 1] += rdata1[i];
 254         base[ align * offset[i] + 2] += rdata2[i];
 255     }
 256 }
 257
 258 template <int align>
 259 static inline void gmx_simdcall
 260 transposeScatterDecrU(float *              base,
 261                       const std::int32_t   offset[],
 262                       SimdFloat            v0,
 263                       SimdFloat            v1,
 264                       SimdFloat            v2)
 265 {
 266     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata0[GMX_SIMD_FLOAT_WIDTH];
 267     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata1[GMX_SIMD_FLOAT_WIDTH];
 268     GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata2[GMX_SIMD_FLOAT_WIDTH];
 269
 270     store(rdata0, v0);
 271     store(rdata1, v1);
 272     store(rdata2, v2);
 273
 274     for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 275     {
 276         base[ align * offset[i] + 0] -= rdata0[i];
 277         base[ align * offset[i] + 1] -= rdata1[i];
 278         base[ align * offset[i] + 2] -= rdata2[i];
 279     }
 280 }
 281
 282 static inline void gmx_simdcall
 283 expandScalarsToTriplets(SimdFloat    scalar,
 284                         SimdFloat *  triplets0,
 285                         SimdFloat *  triplets1,
 286                         SimdFloat *  triplets2)
 287 {
 288     triplets0->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0),
 289                                                                            _mm512_castps_si512(scalar.simdInternal_)));
 290     triplets1->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5),
 291                                                                            _mm512_castps_si512(scalar.simdInternal_)));
 292     triplets2->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10),
 293                                                                            _mm512_castps_si512(scalar.simdInternal_)));
 294 }
 295
 296
 297 static inline float gmx_simdcall
 298 reduceIncr4ReturnSum(float *    m,
 299                      SimdFloat  v0,
 300                      SimdFloat  v1,
 301                      SimdFloat  v2,
 302                      SimdFloat  v3)
 303 {
 304     float  f;
 305     __m512 t0, t1, t2, t3;
 306
 307     assert(std::size_t(m) % 16 == 0);
 308
 309     t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
 310     t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v2.simdInternal_, _mm512_swizzle_ps(v2.simdInternal_, _MM_SWIZ_REG_BADC));
 311     t1 = _mm512_add_ps(v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
 312     t1 = _mm512_mask_add_ps(t1, _mm512_int2mask(0xCCCC), v3.simdInternal_, _mm512_swizzle_ps(v3.simdInternal_, _MM_SWIZ_REG_BADC));
 313     t2 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
 314     t2 = _mm512_mask_add_ps(t2, _mm512_int2mask(0xAAAA), t1, _mm512_swizzle_ps(t1, _MM_SWIZ_REG_CDAB));
 315
 316     t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_BADC));
 317     t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_CDAB));
 318
 319     t0 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
 320     t0 = _mm512_add_ps(t0, t2);
 321     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t0);
 322
 323     t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_BADC));
 324     t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_CDAB));
 325
 326     _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t2);
 327     return f;
 328 }
 329
 330 static inline SimdFloat gmx_simdcall
 331 loadDualHsimd(const float * m0,
 332               const float * m1)
 333 {
 334     assert(std::size_t(m0) % 32 == 0);
 335     assert(std::size_t(m1) % 32 == 0);
 336
 337     return _mm512_castpd_ps(_mm512_mask_extload_pd(_mm512_extload_pd(reinterpret_cast<const double *>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE),
 338                                                    _mm512_int2mask(0xF0), reinterpret_cast<const double *>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
 339 }
 340
 341 static inline SimdFloat gmx_simdcall
 342 loadDuplicateHsimd(const float * m)
 343 {
 344     assert(std::size_t(m) % 32 == 0);
 345
 346     return _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
 347 }
 348
 349 static inline SimdFloat gmx_simdcall
 350 loadU1DualHsimd(const float * m)
 351 {
 352     return _mm512_mask_extload_ps(_mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE), _mm512_int2mask(0xFF00),
 353                                   m+1, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
 354 }
 355
 356
 357 static inline void gmx_simdcall
 358 storeDualHsimd(float *     m0,
 359                float *     m1,
 360                SimdFloat   a)
 361 {
 362     __m512 t0;
 363
 364     assert(std::size_t(m0) % 32 == 0);
 365     assert(std::size_t(m1) % 32 == 0);
 366
 367     _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), a.simdInternal_);
 368     _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), a.simdInternal_);
 369 }
 370
 371 static inline void gmx_simdcall
 372 incrDualHsimd(float *     m0,
 373               float *     m1,
 374               SimdFloat   a)
 375 {
 376     assert(std::size_t(m0) % 32 == 0);
 377     assert(std::size_t(m1) % 32 == 0);
 378
 379     __m512 x;
 380
 381     // Update lower half
 382     x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
 383     x = _mm512_add_ps(x, a.simdInternal_);
 384     _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), x);
 385
 386     // Update upper half
 387     x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
 388     x = _mm512_add_ps(x, a.simdInternal_);
 389     _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), x);
 390 }
 391
 392 static inline void gmx_simdcall
 393 decrHsimd(float *    m,
 394           SimdFloat  a)
 395 {
 396     __m512 t;
 397
 398     assert(std::size_t(m) % 32 == 0);
 399
 400     t = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
 401     a = _mm512_add_ps(a.simdInternal_, _mm512_permute4f128_ps(a.simdInternal_, _MM_PERM_BADC));
 402     t = _mm512_sub_ps(t, a.simdInternal_);
 403     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0x00FF), t);
 404 }
 405
 406
 407 template <int align>
 408 static inline void gmx_simdcall
 409 gatherLoadTransposeHsimd(const float *        base0,
 410                          const float *        base1,
 411                          const std::int32_t   offset[],
 412                          SimdFloat *          v0,
 413                          SimdFloat *          v1)
 414 {
 415     __m512i idx0, idx1, idx;
 416     __m512  tmp1, tmp2;
 417
 418     assert(std::size_t(offset) % 32 == 0);
 419     assert(std::size_t(base0) % 8 == 0);
 420     assert(std::size_t(base1) % 8 == 0);
 421     assert(std::size_t(align) % 2 == 0);
 422
 423     idx0 = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), offset);
 424
 425     idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
 426     idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));
 427
 428     idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0xFF00), idx1, _MM_PERM_BABA);
 429
 430     tmp1 = _mm512_i32gather_ps(idx, base0, sizeof(float));
 431     tmp2 = _mm512_i32gather_ps(idx, base1, sizeof(float));
 432
 433     v0->simdInternal_ = _mm512_mask_permute4f128_ps(tmp1, _mm512_int2mask(0xFF00), tmp2, _MM_PERM_BABA);
 434     v1->simdInternal_ = _mm512_mask_permute4f128_ps(tmp2, _mm512_int2mask(0x00FF), tmp1, _MM_PERM_DCDC);
 435 }
 436
 437 static inline float gmx_simdcall
 438 reduceIncr4ReturnSumHsimd(float *     m,
 439                           SimdFloat   v0,
 440                           SimdFloat   v1)
 441 {
 442     float  f;
 443     __m512 t0, t1;
 444
 445     assert(std::size_t(m) % 32 == 0);
 446
 447     t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
 448     t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
 449     t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
 450     t0 = _mm512_add_ps(t0, _mm512_castpd_ps(_mm512_swizzle_pd(_mm512_castps_pd(t0), _MM_SWIZ_REG_BADC)));
 451     t0 = _mm512_mask_permute4f128_ps(t0, _mm512_int2mask(0xAAAA), t0, _MM_PERM_BADC);
 452     t1 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
 453     t1 = _mm512_add_ps(t1, t0);
 454     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t1);
 455
 456     t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_BADC));
 457     t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
 458
 459     _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t0);
 460     return f;
 461 }
 462
 463 }      // namespace gmx
 464
 465 #endif // GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H