src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2016,2017,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H
  36 #define GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H
  37
  38 #include "config.h"
  39
  40 #include <cassert>
  41 #include <cstddef>
  42 #include <cstdint>
  43
  44 #include <arm_neon.h>
  45
  46 #include "gromacs/math/utilities.h"
  47
  48 namespace gmx
  49 {
  50
  51 class SimdFloat
  52 {
  53     public:
  54         SimdFloat() {}
  55
  56         SimdFloat(float f) : simdInternal_(vdupq_n_f32(f)) {}
  57
  58         // Internal utility constructor to simplify return statements
  59         SimdFloat(float32x4_t simd) : simdInternal_(simd) {}
  60
  61         float32x4_t  simdInternal_;
  62 };
  63
  64 class SimdFInt32
  65 {
  66     public:
  67         SimdFInt32() {}
  68
  69         SimdFInt32(std::int32_t i) : simdInternal_(vdupq_n_s32(i)) {}
  70
  71         // Internal utility constructor to simplify return statements
  72         SimdFInt32(int32x4_t simd) : simdInternal_(simd) {}
  73
  74         int32x4_t  simdInternal_;
  75 };
  76
  77 class SimdFBool
  78 {
  79     public:
  80         SimdFBool() {}
  81
  82         SimdFBool(bool b) : simdInternal_(vdupq_n_u32( b ? 0xFFFFFFFF : 0)) {}
  83
  84         // Internal utility constructor to simplify return statements
  85         SimdFBool(uint32x4_t simd) : simdInternal_(simd) {}
  86
  87         uint32x4_t  simdInternal_;
  88 };
  89
  90 class SimdFIBool
  91 {
  92     public:
  93         SimdFIBool() {}
  94
  95         SimdFIBool(bool b) : simdInternal_(vdupq_n_u32( b ? 0xFFFFFFFF : 0)) {}
  96
  97         // Internal utility constructor to simplify return statements
  98         SimdFIBool(uint32x4_t simd) : simdInternal_(simd) {}
  99
 100         uint32x4_t  simdInternal_;
 101 };
 102
 103 static inline SimdFloat gmx_simdcall
 104 simdLoad(const float *m, SimdFloatTag = {})
 105 {
 106     assert(std::size_t(m) % 16 == 0);
 107     return {
 108                vld1q_f32(m)
 109     };
 110 }
 111
 112 static inline void gmx_simdcall
 113 store(float *m, SimdFloat a)
 114 {
 115     assert(std::size_t(m) % 16 == 0);
 116     vst1q_f32(m, a.simdInternal_);
 117 }
 118
 119 static inline SimdFloat gmx_simdcall
 120 simdLoadU(const float *m, SimdFloatTag = {})
 121 {
 122     return {
 123                vld1q_f32(m)
 124     };
 125 }
 126
 127 static inline void gmx_simdcall
 128 storeU(float *m, SimdFloat a)
 129 {
 130     vst1q_f32(m, a.simdInternal_);
 131 }
 132
 133 static inline SimdFloat gmx_simdcall
 134 setZeroF()
 135 {
 136     return {
 137                vdupq_n_f32(0.0F)
 138     };
 139 }
 140
 141 static inline SimdFInt32 gmx_simdcall
 142 simdLoad(const std::int32_t * m, SimdFInt32Tag)
 143 {
 144     assert(std::size_t(m) % 16 == 0);
 145     return {
 146                vld1q_s32(m)
 147     };
 148 }
 149
 150 static inline void gmx_simdcall
 151 store(std::int32_t * m, SimdFInt32 a)
 152 {
 153     assert(std::size_t(m) % 16 == 0);
 154     vst1q_s32(m, a.simdInternal_);
 155 }
 156
 157 static inline SimdFInt32 gmx_simdcall
 158 simdLoadU(const std::int32_t *m, SimdFInt32Tag)
 159 {
 160     return {
 161                vld1q_s32(m)
 162     };
 163 }
 164
 165 static inline void gmx_simdcall
 166 storeU(std::int32_t * m, SimdFInt32 a)
 167 {
 168     vst1q_s32(m, a.simdInternal_);
 169 }
 170
 171 static inline SimdFInt32 gmx_simdcall
 172 setZeroFI()
 173 {
 174     return {
 175                vdupq_n_s32(0)
 176     };
 177 }
 178
 179 template<int index> gmx_simdcall
 180 static inline std::int32_t
 181 extract(SimdFInt32 a)
 182 {
 183     return vgetq_lane_s32(a.simdInternal_, index);
 184 }
 185
 186 static inline SimdFloat gmx_simdcall
 187 operator&(SimdFloat a, SimdFloat b)
 188 {
 189     return {
 190                vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a.simdInternal_),
 191                                                vreinterpretq_s32_f32(b.simdInternal_)))
 192     };
 193 }
 194
 195 static inline SimdFloat gmx_simdcall
 196 andNot(SimdFloat a, SimdFloat b)
 197 {
 198     return {
 199                vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b.simdInternal_),
 200                                                vreinterpretq_s32_f32(a.simdInternal_)))
 201     };
 202 }
 203
 204 static inline SimdFloat gmx_simdcall
 205 operator|(SimdFloat a, SimdFloat b)
 206 {
 207     return {
 208                vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a.simdInternal_),
 209                                                vreinterpretq_s32_f32(b.simdInternal_)))
 210     };
 211 }
 212
 213 static inline SimdFloat gmx_simdcall
 214 operator^(SimdFloat a, SimdFloat b)
 215 {
 216     return {
 217                vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a.simdInternal_),
 218                                                vreinterpretq_s32_f32(b.simdInternal_)))
 219     };
 220 }
 221
 222 static inline SimdFloat gmx_simdcall
 223 operator+(SimdFloat a, SimdFloat b)
 224 {
 225     return {
 226                vaddq_f32(a.simdInternal_, b.simdInternal_)
 227     };
 228 }
 229
 230 static inline SimdFloat gmx_simdcall
 231 operator-(SimdFloat a, SimdFloat b)
 232 {
 233     return {
 234                vsubq_f32(a.simdInternal_, b.simdInternal_)
 235     };
 236 }
 237
 238 static inline SimdFloat gmx_simdcall
 239 operator-(SimdFloat x)
 240 {
 241     return {
 242                vnegq_f32(x.simdInternal_)
 243     };
 244 }
 245
 246 static inline SimdFloat gmx_simdcall
 247 operator*(SimdFloat a, SimdFloat b)
 248 {
 249     return {
 250                vmulq_f32(a.simdInternal_, b.simdInternal_)
 251     };
 252 }
 253
 254 // Override for Neon-Asimd
 255 #if GMX_SIMD_ARM_NEON
 256 static inline SimdFloat gmx_simdcall
 257 fma(SimdFloat a, SimdFloat b, SimdFloat c)
 258 {
 259     return {
 260 #ifdef __ARM_FEATURE_FMA
 261                vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
 262 #else
 263                vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
 264 #endif
 265     };
 266 }
 267
 268 static inline SimdFloat gmx_simdcall
 269 fms(SimdFloat a, SimdFloat b, SimdFloat c)
 270 {
 271     return {
 272 #ifdef __ARM_FEATURE_FMA
 273                vnegq_f32(vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
 274 #else
 275                vnegq_f32(vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
 276 #endif
 277     };
 278 }
 279
 280 static inline SimdFloat gmx_simdcall
 281 fnma(SimdFloat a, SimdFloat b, SimdFloat c)
 282 {
 283     return {
 284 #ifdef __ARM_FEATURE_FMA
 285                vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
 286 #else
 287                vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
 288 #endif
 289     };
 290 }
 291
 292 static inline SimdFloat gmx_simdcall
 293 fnms(SimdFloat a, SimdFloat b, SimdFloat c)
 294 {
 295     return {
 296 #ifdef __ARM_FEATURE_FMA
 297                vnegq_f32(vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
 298 #else
 299                vnegq_f32(vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
 300 #endif
 301     };
 302 }
 303 #endif
 304
 305 static inline SimdFloat gmx_simdcall
 306 rsqrt(SimdFloat x)
 307 {
 308     return {
 309                vrsqrteq_f32(x.simdInternal_)
 310     };
 311 }
 312
 313 // The SIMD implementation seems to overflow when we square lu for
 314 // values close to FLOAT_MAX, so we fall back on the version in
 315 // simd_math.h, which is probably slightly slower.
 316 #if GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT
 317 static inline SimdFloat gmx_simdcall
 318 rsqrtIter(SimdFloat lu, SimdFloat x)
 319 {
 320     return {
 321                vmulq_f32(lu.simdInternal_, vrsqrtsq_f32(vmulq_f32(lu.simdInternal_, lu.simdInternal_), x.simdInternal_))
 322     };
 323 }
 324 #endif
 325
 326 static inline SimdFloat gmx_simdcall
 327 rcp(SimdFloat x)
 328 {
 329     return {
 330                vrecpeq_f32(x.simdInternal_)
 331     };
 332 }
 333
 334 static inline SimdFloat gmx_simdcall
 335 rcpIter(SimdFloat lu, SimdFloat x)
 336 {
 337     return {
 338                vmulq_f32(lu.simdInternal_, vrecpsq_f32(lu.simdInternal_, x.simdInternal_))
 339     };
 340 }
 341
 342 static inline SimdFloat gmx_simdcall
 343 maskAdd(SimdFloat a, SimdFloat b, SimdFBool m)
 344 {
 345     b.simdInternal_ = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(b.simdInternal_),
 346                                                       m.simdInternal_));
 347
 348     return {
 349                vaddq_f32(a.simdInternal_, b.simdInternal_)
 350     };
 351 }
 352
 353 static inline SimdFloat gmx_simdcall
 354 maskzMul(SimdFloat a, SimdFloat b, SimdFBool m)
 355 {
 356     SimdFloat tmp = a * b;
 357
 358     return {
 359                vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(tmp.simdInternal_),
 360                                                m.simdInternal_))
 361     };
 362 }
 363
 364 static inline SimdFloat gmx_simdcall
 365 maskzFma(SimdFloat a, SimdFloat b, SimdFloat c, SimdFBool m)
 366 {
 367 #ifdef __ARM_FEATURE_FMA
 368     float32x4_t tmp = vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_);
 369 #else
 370     float32x4_t tmp = vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_);
 371 #endif
 372
 373     return {
 374                vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(tmp),
 375                                                m.simdInternal_))
 376     };
 377 }
 378
 379 static inline SimdFloat gmx_simdcall
 380 maskzRsqrt(SimdFloat x, SimdFBool m)
 381 {
 382     // The result will always be correct since we mask the result with m, but
 383     // for debug builds we also want to make sure not to generate FP exceptions
 384 #ifndef NDEBUG
 385     x.simdInternal_ = vbslq_f32(m.simdInternal_, x.simdInternal_, vdupq_n_f32(1.0F));
 386 #endif
 387     return {
 388                vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vrsqrteq_f32(x.simdInternal_)),
 389                                                m.simdInternal_))
 390     };
 391 }
 392
 393 static inline SimdFloat gmx_simdcall
 394 maskzRcp(SimdFloat x, SimdFBool m)
 395 {
 396     // The result will always be correct since we mask the result with m, but
 397     // for debug builds we also want to make sure not to generate FP exceptions
 398 #ifndef NDEBUG
 399     x.simdInternal_ = vbslq_f32(m.simdInternal_, x.simdInternal_, vdupq_n_f32(1.0F));
 400 #endif
 401     return {
 402                vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vrecpeq_f32(x.simdInternal_)),
 403                                                m.simdInternal_))
 404     };
 405 }
 406
 407 static inline SimdFloat gmx_simdcall
 408 abs(SimdFloat x)
 409 {
 410     return {
 411                vabsq_f32( x.simdInternal_ )
 412     };
 413 }
 414
 415 static inline SimdFloat gmx_simdcall
 416 max(SimdFloat a, SimdFloat b)
 417 {
 418     return {
 419                vmaxq_f32(a.simdInternal_, b.simdInternal_)
 420     };
 421 }
 422
 423 static inline SimdFloat gmx_simdcall
 424 min(SimdFloat a, SimdFloat b)
 425 {
 426     return {
 427                vminq_f32(a.simdInternal_, b.simdInternal_)
 428     };
 429 }
 430
 431 // Round and trunc operations are defined at the end of this file, since they
 432 // need to use float-to-integer and integer-to-float conversions.
 433
 434 static inline SimdFloat gmx_simdcall
 435 frexp(SimdFloat value, SimdFInt32 * exponent)
 436 {
 437     const int32x4_t    exponentMask   = vdupq_n_s32(0x7F800000);
 438     const int32x4_t    mantissaMask   = vdupq_n_s32(0x807FFFFF);
 439     const int32x4_t    exponentBias   = vdupq_n_s32(126); // add 1 to make our definition identical to frexp()
 440     const float32x4_t  half           = vdupq_n_f32(0.5F);
 441     int32x4_t          iExponent;
 442
 443     iExponent               = vandq_s32(vreinterpretq_s32_f32(value.simdInternal_), exponentMask);
 444     iExponent               = vsubq_s32(vshrq_n_s32(iExponent, 23), exponentBias);
 445     exponent->simdInternal_ = iExponent;
 446
 447     return {
 448                vreinterpretq_f32_s32(vorrq_s32(vandq_s32(vreinterpretq_s32_f32(value.simdInternal_),
 449                                                          mantissaMask),
 450                                                vreinterpretq_s32_f32(half)))
 451     };
 452 }
 453
 454 template <MathOptimization opt = MathOptimization::Safe>
 455 static inline SimdFloat gmx_simdcall
 456 ldexp(SimdFloat value, SimdFInt32 exponent)
 457 {
 458     const int32x4_t exponentBias = vdupq_n_s32(127);
 459     int32x4_t       iExponent    = vaddq_s32(exponent.simdInternal_, exponentBias);
 460
 461     if (opt == MathOptimization::Safe)
 462     {
 463         // Make sure biased argument is not negative
 464         iExponent = vmaxq_s32(iExponent, vdupq_n_s32(0));
 465     }
 466
 467     iExponent = vshlq_n_s32( iExponent, 23);
 468
 469     return {
 470                vmulq_f32(value.simdInternal_, vreinterpretq_f32_s32(iExponent))
 471     };
 472 }
 473
 474 // Override for Neon-Asimd
 475 #if GMX_SIMD_ARM_NEON
 476 static inline float gmx_simdcall
 477 reduce(SimdFloat a)
 478 {
 479     float32x4_t x = a.simdInternal_;
 480     float32x4_t y = vextq_f32(x, x, 2);
 481
 482     x = vaddq_f32(x, y);
 483     y = vextq_f32(x, x, 1);
 484     x = vaddq_f32(x, y);
 485     return vgetq_lane_f32(x, 0);
 486 }
 487 #endif
 488
 489 static inline SimdFBool gmx_simdcall
 490 operator==(SimdFloat a, SimdFloat b)
 491 {
 492     return {
 493                vceqq_f32(a.simdInternal_, b.simdInternal_)
 494     };
 495 }
 496
 497 static inline SimdFBool gmx_simdcall
 498 operator!=(SimdFloat a, SimdFloat b)
 499 {
 500     return {
 501                vmvnq_u32(vceqq_f32(a.simdInternal_, b.simdInternal_))
 502     };
 503 }
 504
 505 static inline SimdFBool gmx_simdcall
 506 operator<(SimdFloat a, SimdFloat b)
 507 {
 508     return {
 509                vcltq_f32(a.simdInternal_, b.simdInternal_)
 510     };
 511 }
 512
 513 static inline SimdFBool gmx_simdcall
 514 operator<=(SimdFloat a, SimdFloat b)
 515 {
 516     return {
 517                vcleq_f32(a.simdInternal_, b.simdInternal_)
 518     };
 519 }
 520
 521 static inline SimdFBool gmx_simdcall
 522 testBits(SimdFloat a)
 523 {
 524     uint32x4_t tmp = vreinterpretq_u32_f32(a.simdInternal_);
 525
 526     return {
 527                vtstq_u32(tmp, tmp)
 528     };
 529 }
 530
 531 static inline SimdFBool gmx_simdcall
 532 operator&&(SimdFBool a, SimdFBool b)
 533 {
 534
 535     return {
 536                vandq_u32(a.simdInternal_, b.simdInternal_)
 537     };
 538 }
 539
 540 static inline SimdFBool gmx_simdcall
 541 operator||(SimdFBool a, SimdFBool b)
 542 {
 543     return {
 544                vorrq_u32(a.simdInternal_, b.simdInternal_)
 545     };
 546 }
 547
 548 // Override for Neon-Asimd
 549 #if GMX_SIMD_ARM_NEON
 550 static inline bool gmx_simdcall
 551 anyTrue(SimdFBool a)
 552 {
 553     uint32x4_t x = a.simdInternal_;
 554     uint32x4_t y = vextq_u32(x, x, 2);
 555
 556     x = vorrq_u32(x, y);
 557     y = vextq_u32(x, x, 1);
 558     x = vorrq_u32(x, y);
 559     return (vgetq_lane_u32(x, 0) != 0);
 560 }
 561 #endif
 562
 563 static inline SimdFloat gmx_simdcall
 564 selectByMask(SimdFloat a, SimdFBool m)
 565 {
 566     return {
 567                vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.simdInternal_),
 568                                                m.simdInternal_))
 569     };
 570 }
 571
 572 static inline SimdFloat gmx_simdcall
 573 selectByNotMask(SimdFloat a, SimdFBool m)
 574 {
 575     return {
 576                vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.simdInternal_),
 577                                                m.simdInternal_))
 578     };
 579 }
 580
 581 static inline SimdFloat gmx_simdcall
 582 blend(SimdFloat a, SimdFloat b, SimdFBool sel)
 583 {
 584     return {
 585                vbslq_f32(sel.simdInternal_, b.simdInternal_, a.simdInternal_)
 586     };
 587 }
 588
 589 static inline SimdFInt32 gmx_simdcall
 590 operator&(SimdFInt32 a, SimdFInt32 b)
 591 {
 592     return {
 593                vandq_s32(a.simdInternal_, b.simdInternal_)
 594     };
 595 }
 596
 597 static inline SimdFInt32 gmx_simdcall
 598 andNot(SimdFInt32 a, SimdFInt32 b)
 599 {
 600     return {
 601                vbicq_s32(b.simdInternal_, a.simdInternal_)
 602     };
 603 }
 604
 605 static inline SimdFInt32 gmx_simdcall
 606 operator|(SimdFInt32 a, SimdFInt32 b)
 607 {
 608     return {
 609                vorrq_s32(a.simdInternal_, b.simdInternal_)
 610     };
 611 }
 612
 613 static inline SimdFInt32 gmx_simdcall
 614 operator^(SimdFInt32 a, SimdFInt32 b)
 615 {
 616     return {
 617                veorq_s32(a.simdInternal_, b.simdInternal_)
 618     };
 619 }
 620
 621 static inline SimdFInt32 gmx_simdcall
 622 operator+(SimdFInt32 a, SimdFInt32 b)
 623 {
 624     return {
 625                vaddq_s32(a.simdInternal_, b.simdInternal_)
 626     };
 627 }
 628
 629 static inline SimdFInt32 gmx_simdcall
 630 operator-(SimdFInt32 a, SimdFInt32 b)
 631 {
 632     return {
 633                vsubq_s32(a.simdInternal_, b.simdInternal_)
 634     };
 635 }
 636
 637 static inline SimdFInt32 gmx_simdcall
 638 operator*(SimdFInt32 a, SimdFInt32 b)
 639 {
 640     return {
 641                vmulq_s32(a.simdInternal_, b.simdInternal_)
 642     };
 643 }
 644
 645 static inline SimdFIBool gmx_simdcall
 646 operator==(SimdFInt32 a, SimdFInt32 b)
 647 {
 648     return {
 649                vceqq_s32(a.simdInternal_, b.simdInternal_)
 650     };
 651 }
 652
 653 static inline SimdFIBool gmx_simdcall
 654 testBits(SimdFInt32 a)
 655 {
 656     return {
 657                vtstq_s32(a.simdInternal_, a.simdInternal_)
 658     };
 659 }
 660
 661 static inline SimdFIBool gmx_simdcall
 662 operator<(SimdFInt32 a, SimdFInt32 b)
 663 {
 664     return {
 665                vcltq_s32(a.simdInternal_, b.simdInternal_)
 666     };
 667 }
 668
 669 static inline SimdFIBool gmx_simdcall
 670 operator&&(SimdFIBool a, SimdFIBool b)
 671 {
 672     return {
 673                vandq_u32(a.simdInternal_, b.simdInternal_)
 674     };
 675 }
 676
 677 static inline SimdFIBool gmx_simdcall
 678 operator||(SimdFIBool a, SimdFIBool b)
 679 {
 680     return {
 681                vorrq_u32(a.simdInternal_, b.simdInternal_)
 682     };
 683 }
 684
 685 // Override for Neon-Asimd
 686 #if GMX_SIMD_ARM_NEON
 687 static inline bool gmx_simdcall
 688 anyTrue(SimdFIBool a)
 689 {
 690     uint32x4_t x = a.simdInternal_;
 691     uint32x4_t y = vextq_u32(x, x, 2);
 692
 693     x = vorrq_u32(x, y);
 694     y = vextq_u32(x, x, 1);
 695     x = vorrq_u32(x, y);
 696     return (vgetq_lane_u32(x, 0) != 0);
 697 }
 698 #endif
 699
 700 static inline SimdFInt32 gmx_simdcall
 701 selectByMask(SimdFInt32 a, SimdFIBool m)
 702 {
 703     return {
 704                vandq_s32(a.simdInternal_, vreinterpretq_s32_u32(m.simdInternal_))
 705     };
 706 }
 707
 708 static inline SimdFInt32 gmx_simdcall
 709 selectByNotMask(SimdFInt32 a, SimdFIBool m)
 710 {
 711     return {
 712                vbicq_s32(a.simdInternal_, vreinterpretq_s32_u32(m.simdInternal_))
 713     };
 714 }
 715
 716 static inline SimdFInt32 gmx_simdcall
 717 blend(SimdFInt32 a, SimdFInt32 b, SimdFIBool sel)
 718 {
 719     return {
 720                vbslq_s32(sel.simdInternal_, b.simdInternal_, a.simdInternal_)
 721     };
 722 }
 723
 724 // Override for Neon-Asimd
 725 #if GMX_SIMD_ARM_NEON
 726 static inline SimdFInt32 gmx_simdcall
 727 cvtR2I(SimdFloat a)
 728 {
 729     float32x4_t signBitOfA = vreinterpretq_f32_u32(vandq_u32(vdupq_n_u32(0x80000000), vreinterpretq_u32_f32(a.simdInternal_)));
 730     float32x4_t half       = vdupq_n_f32(0.5F);
 731     float32x4_t corr       = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(half), vreinterpretq_u32_f32(signBitOfA)));
 732
 733     return {
 734                vcvtq_s32_f32(vaddq_f32(a.simdInternal_, corr))
 735     };
 736 }
 737 #endif
 738
 739 static inline SimdFInt32 gmx_simdcall
 740 cvttR2I(SimdFloat a)
 741 {
 742     return {
 743                vcvtq_s32_f32(a.simdInternal_)
 744     };
 745 }
 746
 747 static inline SimdFloat gmx_simdcall
 748 cvtI2R(SimdFInt32 a)
 749 {
 750     return {
 751                vcvtq_f32_s32(a.simdInternal_)
 752     };
 753 }
 754
 755 static inline SimdFIBool gmx_simdcall
 756 cvtB2IB(SimdFBool a)
 757 {
 758     return {
 759                a.simdInternal_
 760     };
 761 }
 762
 763 static inline SimdFBool gmx_simdcall
 764 cvtIB2B(SimdFIBool a)
 765 {
 766     return {
 767                a.simdInternal_
 768     };
 769 }
 770
 771 // Override for Neon-Asimd
 772 #if GMX_SIMD_ARM_NEON
 773 static inline SimdFloat gmx_simdcall
 774 round(SimdFloat x)
 775 {
 776     return cvtI2R(cvtR2I(x));
 777 }
 778
 779 static inline SimdFloat gmx_simdcall
 780 trunc(SimdFloat x)
 781 {
 782     return cvtI2R(cvttR2I(x));
 783 }
 784 #endif
 785
 786 }      // namespace gmx
 787
 788 #endif // GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H