src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
   5  * Copyright (c) 2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 #ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
  38 #define GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
  39
  40 #include "config.h"
  41
  42 #include <cassert>
  43 #include <cstdint>
  44
  45 #include <immintrin.h>
  46
  47 #include "gromacs/utility/basedefinitions.h"
  48
  49 #include "impl_x86_mic_simd_float.h"
  50
  51 namespace gmx
  52 {
  53
  54 // On MIC it is better to use scatter operations, so we define the load routines
  55 // that use a SIMD offset variable first.
  56
  57 template<int align>
  58 static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
  59                                                              SimdFInt32   simdoffset,
  60                                                              SimdFloat*   v0,
  61                                                              SimdFloat*   v1,
  62                                                              SimdFloat*   v2,
  63                                                              SimdFloat*   v3)
  64 {
  65     assert(std::size_t(base) % 16 == 0);
  66     assert(align % 4 == 0);
  67
  68     // All instructions might be latency ~4 on MIC, so we use shifts where we
  69     // only need a single instruction (since the shift parameter is an immediate),
  70     // but multiplication otherwise.
  71     if (align == 4)
  72     {
  73         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
  74     }
  75     else if (align == 8)
  76     {
  77         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
  78     }
  79     else
  80     {
  81         simdoffset = simdoffset * SimdFInt32(align);
  82     }
  83
  84     v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
  85     v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
  86     v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 2, sizeof(float));
  87     v3->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 3, sizeof(float));
  88 }
  89
  90 template<int align>
  91 static inline void gmx_simdcall
  92                    gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 simdoffset, SimdFloat* v0, SimdFloat* v1)
  93 {
  94     // All instructions might be latency ~4 on MIC, so we use shifts where we
  95     // only need a single instruction (since the shift parameter is an immediate),
  96     // but multiplication otherwise.
  97     // For align == 2 we can merge the constant into the scale parameter,
  98     // which can take constants up to 8 in total.
  99     if (align == 2)
 100     {
 101         v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, align * sizeof(float));
 102         v1->simdInternal_ =
 103                 _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, align * sizeof(float));
 104     }
 105     else
 106     {
 107         if (align == 4)
 108         {
 109             simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
 110         }
 111         else if (align == 8)
 112         {
 113             simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
 114         }
 115         else
 116         {
 117             simdoffset = simdoffset * SimdFInt32(align);
 118         }
 119         v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
 120         v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
 121     }
 122 }
 123
 124 template<int align>
 125 static inline void gmx_simdcall
 126                    gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 simdoffset, SimdFloat* v0, SimdFloat* v1)
 127 {
 128     assert(std::size_t(base) % 8 == 0);
 129     assert(align % 2 == 0);
 130     gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
 131 }
 132
 133 template<int align>
 134 static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
 135                                                     const std::int32_t offset[],
 136                                                     SimdFloat*         v0,
 137                                                     SimdFloat*         v1,
 138                                                     SimdFloat*         v2,
 139                                                     SimdFloat*         v3)
 140 {
 141     gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1, v2, v3);
 142 }
 143
 144 template<int align>
 145 static inline void gmx_simdcall
 146                    gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
 147 {
 148     gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1);
 149 }
 150
 151 static const int c_simdBestPairAlignmentFloat = 2;
 152
 153 template<int align>
 154 static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
 155                                                      const std::int32_t offset[],
 156                                                      SimdFloat*         v0,
 157                                                      SimdFloat*         v1,
 158                                                      SimdFloat*         v2)
 159 {
 160     SimdFInt32 simdoffset;
 161
 162     assert(std::size_t(offset) % 64 == 0);
 163
 164     simdoffset = simdLoad(offset, SimdFInt32Tag());
 165
 166     // All instructions might be latency ~4 on MIC, so we use shifts where we
 167     // only need a single instruction (since the shift parameter is an immediate),
 168     // but multiplication otherwise.
 169     if (align == 4)
 170     {
 171         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
 172     }
 173     else if (align == 8)
 174     {
 175         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
 176     }
 177     else
 178     {
 179         simdoffset = simdoffset * SimdFInt32(align);
 180     }
 181
 182     v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
 183     v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
 184     v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 2, sizeof(float));
 185 }
 186
 187
 188 template<int align>
 189 static inline void gmx_simdcall
 190                    transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 191 {
 192     SimdFInt32 simdoffset;
 193
 194     assert(std::size_t(offset) % 64 == 0);
 195
 196     simdoffset = simdLoad(offset, SimdFInt32Tag());
 197
 198     // All instructions might be latency ~4 on MIC, so we use shifts where we
 199     // only need a single instruction (since the shift parameter is an immediate),
 200     // but multiplication otherwise.
 201     if (align == 4)
 202     {
 203         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
 204     }
 205     else if (align == 8)
 206     {
 207         simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
 208     }
 209     else
 210     {
 211         simdoffset = simdoffset * SimdFInt32(align);
 212     }
 213
 214     _mm512_i32scatter_ps(base, simdoffset.simdInternal_, v0.simdInternal_, sizeof(float));
 215     _mm512_i32scatter_ps(base + 1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(float));
 216     _mm512_i32scatter_ps(base + 2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(float));
 217 }
 218
 219
 220 template<int align>
 221 static inline void gmx_simdcall
 222                    transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 223 {
 224     alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
 225     alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
 226     alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];
 227
 228     store(rdata0, v0);
 229     store(rdata1, v1);
 230     store(rdata2, v2);
 231
 232     for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 233     {
 234         base[align * offset[i] + 0] += rdata0[i];
 235         base[align * offset[i] + 1] += rdata1[i];
 236         base[align * offset[i] + 2] += rdata2[i];
 237     }
 238 }
 239
 240 template<int align>
 241 static inline void gmx_simdcall
 242                    transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 243 {
 244     alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
 245     alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
 246     alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];
 247
 248     store(rdata0, v0);
 249     store(rdata1, v1);
 250     store(rdata2, v2);
 251
 252     for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 253     {
 254         base[align * offset[i] + 0] -= rdata0[i];
 255         base[align * offset[i] + 1] -= rdata1[i];
 256         base[align * offset[i] + 2] -= rdata2[i];
 257     }
 258 }
 259
 260 static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
 261                                                         SimdFloat* triplets0,
 262                                                         SimdFloat* triplets1,
 263                                                         SimdFloat* triplets2)
 264 {
 265     triplets0->simdInternal_ = _mm512_castsi512_ps(
 266             _mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0),
 267                                     _mm512_castps_si512(scalar.simdInternal_)));
 268     triplets1->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(
 269             _mm512_set_epi32(10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5),
 270             _mm512_castps_si512(scalar.simdInternal_)));
 271     triplets2->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(
 272             _mm512_set_epi32(15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10),
 273             _mm512_castps_si512(scalar.simdInternal_)));
 274 }
 275
 276
 277 static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
 278 {
 279     float  f;
 280     __m512 t0, t1, t2, t3;
 281
 282     assert(std::size_t(m) % 16 == 0);
 283
 284     t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
 285     t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v2.simdInternal_,
 286                             _mm512_swizzle_ps(v2.simdInternal_, _MM_SWIZ_REG_BADC));
 287     t1 = _mm512_add_ps(v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
 288     t1 = _mm512_mask_add_ps(t1, _mm512_int2mask(0xCCCC), v3.simdInternal_,
 289                             _mm512_swizzle_ps(v3.simdInternal_, _MM_SWIZ_REG_BADC));
 290     t2 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
 291     t2 = _mm512_mask_add_ps(t2, _mm512_int2mask(0xAAAA), t1, _mm512_swizzle_ps(t1, _MM_SWIZ_REG_CDAB));
 292
 293     t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_BADC));
 294     t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_CDAB));
 295
 296     t0 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE,
 297                                 _MM_BROADCAST_4X16, _MM_HINT_NONE);
 298     t0 = _mm512_add_ps(t0, t2);
 299     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t0);
 300
 301     t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_BADC));
 302     t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_CDAB));
 303
 304     _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t2);
 305     return f;
 306 }
 307
 308 static inline SimdFloat gmx_simdcall loadDualHsimd(const float* m0, const float* m1)
 309 {
 310     assert(std::size_t(m0) % 32 == 0);
 311     assert(std::size_t(m1) % 32 == 0);
 312
 313     return _mm512_castpd_ps(_mm512_mask_extload_pd(
 314             _mm512_extload_pd(reinterpret_cast<const double*>(m0), _MM_UPCONV_PD_NONE,
 315                               _MM_BROADCAST_4X8, _MM_HINT_NONE),
 316             _mm512_int2mask(0xF0), reinterpret_cast<const double*>(m1), _MM_UPCONV_PD_NONE,
 317             _MM_BROADCAST_4X8, _MM_HINT_NONE));
 318 }
 319
 320 static inline SimdFloat gmx_simdcall loadDuplicateHsimd(const float* m)
 321 {
 322     assert(std::size_t(m) % 32 == 0);
 323
 324     return _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m),
 325                                               _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
 326 }
 327
 328 static inline SimdFloat gmx_simdcall loadU1DualHsimd(const float* m)
 329 {
 330     return _mm512_mask_extload_ps(
 331             _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE),
 332             _mm512_int2mask(0xFF00), m + 1, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
 333 }
 334
 335
 336 static inline void gmx_simdcall storeDualHsimd(float* m0, float* m1, SimdFloat a)
 337 {
 338     __m512 t0;
 339
 340     assert(std::size_t(m0) % 32 == 0);
 341     assert(std::size_t(m1) % 32 == 0);
 342
 343     _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), a.simdInternal_);
 344     _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), a.simdInternal_);
 345 }
 346
 347 static inline void gmx_simdcall incrDualHsimd(float* m0, float* m1, SimdFloat a)
 348 {
 349     assert(std::size_t(m0) % 32 == 0);
 350     assert(std::size_t(m1) % 32 == 0);
 351
 352     __m512 x;
 353
 354     // Update lower half
 355     x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m0), _MM_UPCONV_PD_NONE,
 356                                            _MM_BROADCAST_4X8, _MM_HINT_NONE));
 357     x = _mm512_add_ps(x, a.simdInternal_);
 358     _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), x);
 359
 360     // Update upper half
 361     x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m1), _MM_UPCONV_PD_NONE,
 362                                            _MM_BROADCAST_4X8, _MM_HINT_NONE));
 363     x = _mm512_add_ps(x, a.simdInternal_);
 364     _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), x);
 365 }
 366
 367 static inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
 368 {
 369     __m512 t;
 370
 371     assert(std::size_t(m) % 32 == 0);
 372
 373     t = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m), _MM_UPCONV_PD_NONE,
 374                                            _MM_BROADCAST_4X8, _MM_HINT_NONE));
 375     a = _mm512_add_ps(a.simdInternal_, _mm512_permute4f128_ps(a.simdInternal_, _MM_PERM_BADC));
 376     t = _mm512_sub_ps(t, a.simdInternal_);
 377     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0x00FF), t);
 378 }
 379
 380
 381 template<int align>
 382 static inline void gmx_simdcall gatherLoadTransposeHsimd(const float*       base0,
 383                                                          const float*       base1,
 384                                                          const std::int32_t offset[],
 385                                                          SimdFloat*         v0,
 386                                                          SimdFloat*         v1)
 387 {
 388     __m512i idx0, idx1, idx;
 389     __m512  tmp1, tmp2;
 390
 391     assert(std::size_t(offset) % 32 == 0);
 392     assert(std::size_t(base0) % 8 == 0);
 393     assert(std::size_t(base1) % 8 == 0);
 394     assert(std::size_t(align) % 2 == 0);
 395
 396     idx0 = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), offset);
 397
 398     idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
 399     idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));
 400
 401     idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0xFF00), idx1, _MM_PERM_BABA);
 402
 403     tmp1 = _mm512_i32gather_ps(idx, base0, sizeof(float));
 404     tmp2 = _mm512_i32gather_ps(idx, base1, sizeof(float));
 405
 406     v0->simdInternal_ = _mm512_mask_permute4f128_ps(tmp1, _mm512_int2mask(0xFF00), tmp2, _MM_PERM_BABA);
 407     v1->simdInternal_ = _mm512_mask_permute4f128_ps(tmp2, _mm512_int2mask(0x00FF), tmp1, _MM_PERM_DCDC);
 408 }
 409
 410 static inline float gmx_simdcall reduceIncr4ReturnSumHsimd(float* m, SimdFloat v0, SimdFloat v1)
 411 {
 412     float  f;
 413     __m512 t0, t1;
 414
 415     assert(std::size_t(m) % 32 == 0);
 416
 417     t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
 418     t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v1.simdInternal_,
 419                             _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
 420     t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
 421     t0 = _mm512_add_ps(t0, _mm512_castpd_ps(_mm512_swizzle_pd(_mm512_castps_pd(t0), _MM_SWIZ_REG_BADC)));
 422     t0 = _mm512_mask_permute4f128_ps(t0, _mm512_int2mask(0xAAAA), t0, _MM_PERM_BADC);
 423     t1 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE,
 424                                 _MM_BROADCAST_4X16, _MM_HINT_NONE);
 425     t1 = _mm512_add_ps(t1, t0);
 426     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t1);
 427
 428     t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_BADC));
 429     t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
 430
 431     _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t0);
 432     return f;
 433 }
 434
 435 } // namespace gmx
 436
 437 #endif // GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H