src/gromacs/simd/impl_x86_avx_512/impl_x86_avx_512_simd_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_X86_AVX_512_SIMD_FLOAT_H
  37 #define GMX_SIMD_IMPL_X86_AVX_512_SIMD_FLOAT_H
  38
  39 #include "config.h"
  40
  41 #include <cassert>
  42 #include <cstdint>
  43
  44 #include <immintrin.h>
  45
  46 #include "gromacs/math/utilities.h"
  47 #include "gromacs/utility/real.h"
  48
  49 #include "impl_x86_avx_512_general.h"
  50
  51 namespace gmx
  52 {
  53
  54 class SimdFloat
  55 {
  56 public:
  57     SimdFloat() {}
  58
  59     SimdFloat(float f) : simdInternal_(_mm512_set1_ps(f)) {}
  60
  61     // Internal utility constructor to simplify return statements
  62     SimdFloat(__m512 simd) : simdInternal_(simd) {}
  63
  64     __m512 simdInternal_;
  65 };
  66
  67 class SimdFInt32
  68 {
  69 public:
  70     SimdFInt32() {}
  71
  72     SimdFInt32(std::int32_t i) : simdInternal_(_mm512_set1_epi32(i)) {}
  73
  74     // Internal utility constructor to simplify return statements
  75     SimdFInt32(__m512i simd) : simdInternal_(simd) {}
  76
  77     __m512i simdInternal_;
  78 };
  79
  80 class SimdFBool
  81 {
  82 public:
  83     SimdFBool() {}
  84
  85     // Internal utility constructor to simplify return statements
  86     SimdFBool(__mmask16 simd) : simdInternal_(simd) {}
  87
  88     __mmask16 simdInternal_;
  89 };
  90
  91 class SimdFIBool
  92 {
  93 public:
  94     SimdFIBool() {}
  95
  96     // Internal utility constructor to simplify return statements
  97     SimdFIBool(__mmask16 simd) : simdInternal_(simd) {}
  98
  99     __mmask16 simdInternal_;
 100 };
 101
 102 static inline SimdFloat gmx_simdcall simdLoad(const float* m, SimdFloatTag = {})
 103 {
 104     assert(std::size_t(m) % 64 == 0);
 105     return { _mm512_load_ps(m) };
 106 }
 107
 108 static inline void gmx_simdcall store(float* m, SimdFloat a)
 109 {
 110     assert(std::size_t(m) % 64 == 0);
 111     _mm512_store_ps(m, a.simdInternal_);
 112 }
 113
 114 static inline SimdFloat gmx_simdcall simdLoadU(const float* m, SimdFloatTag = {})
 115 {
 116     return { _mm512_loadu_ps(m) };
 117 }
 118
 119 static inline void gmx_simdcall storeU(float* m, SimdFloat a)
 120 {
 121     _mm512_storeu_ps(m, a.simdInternal_);
 122 }
 123
 124 static inline SimdFloat gmx_simdcall setZeroF()
 125 {
 126     return { _mm512_setzero_ps() };
 127 }
 128
 129 static inline SimdFInt32 gmx_simdcall simdLoad(const std::int32_t* m, SimdFInt32Tag)
 130 {
 131     assert(std::size_t(m) % 64 == 0);
 132     return { _mm512_load_si512(m) };
 133 }
 134
 135 static inline void gmx_simdcall store(std::int32_t* m, SimdFInt32 a)
 136 {
 137     assert(std::size_t(m) % 64 == 0);
 138     _mm512_store_si512(m, a.simdInternal_);
 139 }
 140
 141 static inline SimdFInt32 gmx_simdcall simdLoadU(const std::int32_t* m, SimdFInt32Tag)
 142 {
 143     return { _mm512_loadu_si512(m) };
 144 }
 145
 146 static inline void gmx_simdcall storeU(std::int32_t* m, SimdFInt32 a)
 147 {
 148     _mm512_storeu_si512(m, a.simdInternal_);
 149 }
 150
 151 static inline SimdFInt32 gmx_simdcall setZeroFI()
 152 {
 153     return { _mm512_setzero_si512() };
 154 }
 155
 156
 157 static inline SimdFloat gmx_simdcall operator&(SimdFloat a, SimdFloat b)
 158 {
 159     return { _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a.simdInternal_),
 160                                                   _mm512_castps_si512(b.simdInternal_))) };
 161 }
 162
 163 static inline SimdFloat gmx_simdcall andNot(SimdFloat a, SimdFloat b)
 164 {
 165     return { _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a.simdInternal_),
 166                                                      _mm512_castps_si512(b.simdInternal_))) };
 167 }
 168
 169 static inline SimdFloat gmx_simdcall operator|(SimdFloat a, SimdFloat b)
 170 {
 171     return { _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a.simdInternal_),
 172                                                  _mm512_castps_si512(b.simdInternal_))) };
 173 }
 174
 175 static inline SimdFloat gmx_simdcall operator^(SimdFloat a, SimdFloat b)
 176 {
 177     return { _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a.simdInternal_),
 178                                                   _mm512_castps_si512(b.simdInternal_))) };
 179 }
 180
 181 static inline SimdFloat gmx_simdcall operator+(SimdFloat a, SimdFloat b)
 182 {
 183     return { _mm512_add_ps(a.simdInternal_, b.simdInternal_) };
 184 }
 185
 186 static inline SimdFloat gmx_simdcall operator-(SimdFloat a, SimdFloat b)
 187 {
 188     return { _mm512_sub_ps(a.simdInternal_, b.simdInternal_) };
 189 }
 190
 191 static inline SimdFloat gmx_simdcall operator-(SimdFloat x)
 192 {
 193     return { _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(x.simdInternal_),
 194                                                   _mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)))) };
 195 }
 196
 197 static inline SimdFloat gmx_simdcall operator*(SimdFloat a, SimdFloat b)
 198 {
 199     return { _mm512_mul_ps(a.simdInternal_, b.simdInternal_) };
 200 }
 201
 202 static inline SimdFloat gmx_simdcall fma(SimdFloat a, SimdFloat b, SimdFloat c)
 203 {
 204     return { _mm512_fmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
 205 }
 206
 207 static inline SimdFloat gmx_simdcall fms(SimdFloat a, SimdFloat b, SimdFloat c)
 208 {
 209     return { _mm512_fmsub_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
 210 }
 211
 212 static inline SimdFloat gmx_simdcall fnma(SimdFloat a, SimdFloat b, SimdFloat c)
 213 {
 214     return { _mm512_fnmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
 215 }
 216
 217 static inline SimdFloat gmx_simdcall fnms(SimdFloat a, SimdFloat b, SimdFloat c)
 218 {
 219     return { _mm512_fnmsub_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
 220 }
 221
 222 // Override for AVX-512-KNL
 223 #if GMX_SIMD_X86_AVX_512
 224 static inline SimdFloat gmx_simdcall rsqrt(SimdFloat x)
 225 {
 226     return { _mm512_rsqrt14_ps(x.simdInternal_) };
 227 }
 228
 229 static inline SimdFloat gmx_simdcall rcp(SimdFloat x)
 230 {
 231     return { _mm512_rcp14_ps(x.simdInternal_) };
 232 }
 233 #endif
 234
 235 static inline SimdFloat gmx_simdcall maskAdd(SimdFloat a, SimdFloat b, SimdFBool m)
 236 {
 237     return { _mm512_mask_add_ps(a.simdInternal_, m.simdInternal_, a.simdInternal_, b.simdInternal_) };
 238 }
 239
 240 static inline SimdFloat gmx_simdcall maskzMul(SimdFloat a, SimdFloat b, SimdFBool m)
 241 {
 242     return { _mm512_maskz_mul_ps(m.simdInternal_, a.simdInternal_, b.simdInternal_) };
 243 }
 244
 245 static inline SimdFloat gmx_simdcall maskzFma(SimdFloat a, SimdFloat b, SimdFloat c, SimdFBool m)
 246 {
 247     return { _mm512_maskz_fmadd_ps(m.simdInternal_, a.simdInternal_, b.simdInternal_, c.simdInternal_) };
 248 }
 249
 250 // Override for AVX-512-KNL
 251 #if GMX_SIMD_X86_AVX_512
 252 static inline SimdFloat gmx_simdcall maskzRsqrt(SimdFloat x, SimdFBool m)
 253 {
 254     return { _mm512_maskz_rsqrt14_ps(m.simdInternal_, x.simdInternal_) };
 255 }
 256
 257 static inline SimdFloat gmx_simdcall maskzRcp(SimdFloat x, SimdFBool m)
 258 {
 259     return { _mm512_maskz_rcp14_ps(m.simdInternal_, x.simdInternal_) };
 260 }
 261 #endif
 262
 263 static inline SimdFloat gmx_simdcall abs(SimdFloat x)
 264 {
 265     return { _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)),
 266                                                      _mm512_castps_si512(x.simdInternal_))) };
 267 }
 268
 269 static inline SimdFloat gmx_simdcall max(SimdFloat a, SimdFloat b)
 270 {
 271     return { _mm512_max_ps(a.simdInternal_, b.simdInternal_) };
 272 }
 273
 274 static inline SimdFloat gmx_simdcall min(SimdFloat a, SimdFloat b)
 275 {
 276     return { _mm512_min_ps(a.simdInternal_, b.simdInternal_) };
 277 }
 278
 279 static inline SimdFloat gmx_simdcall round(SimdFloat x)
 280 {
 281     return { _mm512_roundscale_ps(x.simdInternal_, 0) };
 282 }
 283
 284 static inline SimdFloat gmx_simdcall trunc(SimdFloat x)
 285 {
 286 #if defined(__INTEL_COMPILER) || defined(__ECC)
 287     return { _mm512_trunc_ps(x.simdInternal_) };
 288 #else
 289     return { _mm512_cvtepi32_ps(_mm512_cvttps_epi32(x.simdInternal_)) };
 290 #endif
 291 }
 292
 293 static inline SimdFloat gmx_simdcall frexp(SimdFloat value, SimdFInt32* exponent)
 294 {
 295     __m512  rExponent = _mm512_getexp_ps(value.simdInternal_);
 296     __m512i iExponent = _mm512_cvtps_epi32(rExponent);
 297
 298     exponent->simdInternal_ = _mm512_add_epi32(iExponent, _mm512_set1_epi32(1));
 299
 300     return { _mm512_getmant_ps(value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src) };
 301 }
 302
 303 template<MathOptimization opt = MathOptimization::Safe>
 304 static inline SimdFloat gmx_simdcall ldexp(SimdFloat value, SimdFInt32 exponent)
 305 {
 306     const __m512i exponentBias = _mm512_set1_epi32(127);
 307     __m512i       iExponent    = _mm512_add_epi32(exponent.simdInternal_, exponentBias);
 308
 309     if (opt == MathOptimization::Safe)
 310     {
 311         // Make sure biased argument is not negative
 312         iExponent = _mm512_max_epi32(iExponent, _mm512_setzero_epi32());
 313     }
 314
 315     iExponent = _mm512_slli_epi32(iExponent, 23);
 316
 317     return { _mm512_mul_ps(value.simdInternal_, _mm512_castsi512_ps(iExponent)) };
 318 }
 319
 320 static inline float gmx_simdcall reduce(SimdFloat a)
 321 {
 322     __m512 x = a.simdInternal_;
 323     x        = _mm512_add_ps(x, _mm512_shuffle_f32x4(x, x, 0xEE));
 324     x        = _mm512_add_ps(x, _mm512_shuffle_f32x4(x, x, 0x11));
 325     x        = _mm512_add_ps(x, _mm512_permute_ps(x, 0xEE));
 326     x        = _mm512_add_ps(x, _mm512_permute_ps(x, 0x11));
 327     return *reinterpret_cast<float*>(&x);
 328 }
 329
 330 static inline SimdFBool gmx_simdcall operator==(SimdFloat a, SimdFloat b)
 331 {
 332     return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ) };
 333 }
 334
 335 static inline SimdFBool gmx_simdcall operator!=(SimdFloat a, SimdFloat b)
 336 {
 337     return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_NEQ_OQ) };
 338 }
 339
 340 static inline SimdFBool gmx_simdcall operator<(SimdFloat a, SimdFloat b)
 341 {
 342     return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_LT_OQ) };
 343 }
 344
 345 static inline SimdFBool gmx_simdcall operator<=(SimdFloat a, SimdFloat b)
 346 {
 347     return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_LE_OQ) };
 348 }
 349
 350 static inline SimdFBool gmx_simdcall testBits(SimdFloat a)
 351 {
 352     return { _mm512_test_epi32_mask(_mm512_castps_si512(a.simdInternal_),
 353                                     _mm512_castps_si512(a.simdInternal_)) };
 354 }
 355
 356 static inline SimdFBool gmx_simdcall operator&&(SimdFBool a, SimdFBool b)
 357 {
 358     return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
 359 }
 360
 361 static inline SimdFBool gmx_simdcall operator||(SimdFBool a, SimdFBool b)
 362 {
 363     return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
 364 }
 365
 366 static inline bool gmx_simdcall anyTrue(SimdFBool a)
 367 {
 368     return (avx512Mask2Int(a.simdInternal_) != 0);
 369 }
 370
 371 static inline SimdFloat gmx_simdcall selectByMask(SimdFloat a, SimdFBool m)
 372 {
 373     return { _mm512_mask_mov_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_) };
 374 }
 375
 376 static inline SimdFloat gmx_simdcall selectByNotMask(SimdFloat a, SimdFBool m)
 377 {
 378     return { _mm512_mask_mov_ps(a.simdInternal_, m.simdInternal_, _mm512_setzero_ps()) };
 379 }
 380
 381 static inline SimdFloat gmx_simdcall blend(SimdFloat a, SimdFloat b, SimdFBool sel)
 382 {
 383     return { _mm512_mask_blend_ps(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
 384 }
 385
 386 static inline SimdFloat gmx_simdcall copysign(SimdFloat a, SimdFloat b)
 387 {
 388     return { _mm512_castsi512_ps(_mm512_ternarylogic_epi32(_mm512_castps_si512(a.simdInternal_),
 389                                                            _mm512_castps_si512(b.simdInternal_),
 390                                                            _mm512_set1_epi32(INT32_MIN), 0xD8)) };
 391 }
 392
 393 static inline SimdFInt32 gmx_simdcall operator&(SimdFInt32 a, SimdFInt32 b)
 394 {
 395     return { _mm512_and_epi32(a.simdInternal_, b.simdInternal_) };
 396 }
 397
 398 static inline SimdFInt32 gmx_simdcall andNot(SimdFInt32 a, SimdFInt32 b)
 399 {
 400     return { _mm512_andnot_epi32(a.simdInternal_, b.simdInternal_) };
 401 }
 402
 403 static inline SimdFInt32 gmx_simdcall operator|(SimdFInt32 a, SimdFInt32 b)
 404 {
 405     return { _mm512_or_epi32(a.simdInternal_, b.simdInternal_) };
 406 }
 407
 408 static inline SimdFInt32 gmx_simdcall operator^(SimdFInt32 a, SimdFInt32 b)
 409 {
 410     return { _mm512_xor_epi32(a.simdInternal_, b.simdInternal_) };
 411 }
 412
 413 static inline SimdFInt32 gmx_simdcall operator+(SimdFInt32 a, SimdFInt32 b)
 414 {
 415     return { _mm512_add_epi32(a.simdInternal_, b.simdInternal_) };
 416 }
 417
 418 static inline SimdFInt32 gmx_simdcall operator-(SimdFInt32 a, SimdFInt32 b)
 419 {
 420     return { _mm512_sub_epi32(a.simdInternal_, b.simdInternal_) };
 421 }
 422
 423 static inline SimdFInt32 gmx_simdcall operator*(SimdFInt32 a, SimdFInt32 b)
 424 {
 425     return { _mm512_mullo_epi32(a.simdInternal_, b.simdInternal_) };
 426 }
 427
 428 static inline SimdFIBool gmx_simdcall operator==(SimdFInt32 a, SimdFInt32 b)
 429 {
 430     return { _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_EQ) };
 431 }
 432
 433 static inline SimdFIBool gmx_simdcall testBits(SimdFInt32 a)
 434 {
 435     return { _mm512_test_epi32_mask(a.simdInternal_, a.simdInternal_) };
 436 }
 437
 438 static inline SimdFIBool gmx_simdcall operator<(SimdFInt32 a, SimdFInt32 b)
 439 {
 440     return { _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_LT) };
 441 }
 442
 443 static inline SimdFIBool gmx_simdcall operator&&(SimdFIBool a, SimdFIBool b)
 444 {
 445     return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
 446 }
 447
 448 static inline SimdFIBool gmx_simdcall operator||(SimdFIBool a, SimdFIBool b)
 449 {
 450     return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
 451 }
 452
 453 static inline bool gmx_simdcall anyTrue(SimdFIBool a)
 454 {
 455     return (avx512Mask2Int(a.simdInternal_) != 0);
 456 }
 457
 458 static inline SimdFInt32 gmx_simdcall selectByMask(SimdFInt32 a, SimdFIBool m)
 459 {
 460     return { _mm512_mask_mov_epi32(_mm512_setzero_epi32(), m.simdInternal_, a.simdInternal_) };
 461 }
 462
 463 static inline SimdFInt32 gmx_simdcall selectByNotMask(SimdFInt32 a, SimdFIBool m)
 464 {
 465     return { _mm512_mask_mov_epi32(a.simdInternal_, m.simdInternal_, _mm512_setzero_epi32()) };
 466 }
 467
 468 static inline SimdFInt32 gmx_simdcall blend(SimdFInt32 a, SimdFInt32 b, SimdFIBool sel)
 469 {
 470     return { _mm512_mask_blend_epi32(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
 471 }
 472
 473 static inline SimdFInt32 gmx_simdcall cvtR2I(SimdFloat a)
 474 {
 475     return { _mm512_cvtps_epi32(a.simdInternal_) };
 476 }
 477
 478 static inline SimdFInt32 gmx_simdcall cvttR2I(SimdFloat a)
 479 {
 480     return { _mm512_cvttps_epi32(a.simdInternal_) };
 481 }
 482
 483 static inline SimdFloat gmx_simdcall cvtI2R(SimdFInt32 a)
 484 {
 485     return { _mm512_cvtepi32_ps(a.simdInternal_) };
 486 }
 487
 488 static inline SimdFIBool gmx_simdcall cvtB2IB(SimdFBool a)
 489 {
 490     return { a.simdInternal_ };
 491 }
 492
 493 static inline SimdFBool gmx_simdcall cvtIB2B(SimdFIBool a)
 494 {
 495     return { a.simdInternal_ };
 496 }
 497
 498 } // namespace gmx
 499
 500 #endif // GMX_SIMD_IMPL_X86_AVX_512_SIMD_FLOAT_H