src/gromacs/simd/impl_reference/impl_reference_simd4_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2019,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H
  37 #define GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H
  38
  39 /*! \libinternal \file
  40  *
  41  * \brief Reference implementation, SIMD4 single precision.
  42  *
  43  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  44  *
  45  * \ingroup module_simd
  46  */
  47
#include "config.h"

#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include <algorithm>
#include <array>

#include "impl_reference_definitions.h"
  59
  60 namespace gmx
  61 {
  62
  63 /*! \cond libapi */
  64 /*! \addtogroup module_simd */
  65 /*! \{ */
  66
  67 /*! \name Constant width-4 single precision SIMD types and instructions
  68  * \{
  69  */
  70
  71 /*! \libinternal \brief SIMD4 float type.
  72  *
  73  * Available if \ref GMX_SIMD4_HAVE_FLOAT is 1.
  74  *
  75  * \note This variable cannot be placed inside other structures or classes, since
  76  *       some compilers (including at least clang-3.7) appear to lose the
  77  *       alignment. This is likely particularly severe when allocating such
  78  *       memory on the heap, but it occurs for stack structures too.
  79  */
  80 class Simd4Float
  81 {
  82 public:
  83     Simd4Float() {}
  84
  85     //! \brief Construct from scalar
  86     Simd4Float(float f) { simdInternal_.fill(f); }
  87
  88     /*! \brief Internal SIMD data. Implementation dependent, don't touch.
  89      *
  90      * This has to be public to enable usage in combination with static inline
  91      * functions, but it should never, EVER, be accessed by any code outside
  92      * the corresponding implementation directory since the type will depend
  93      * on the architecture.
  94      */
  95     std::array<float, GMX_SIMD4_WIDTH> simdInternal_;
  96 };
  97
  98 /*! \libinternal  \brief SIMD4 variable type to use for logical comparisons on floats.
  99  *
 100  * Available if \ref GMX_SIMD4_HAVE_FLOAT is 1.
 101  *
 102  * \note This variable cannot be placed inside other structures or classes, since
 103  *       some compilers (including at least clang-3.7) appear to lose the
 104  *       alignment. This is likely particularly severe when allocating such
 105  *       memory on the heap, but it occurs for stack structures too.
 106  */
 107 class Simd4FBool
 108 {
 109 public:
 110     Simd4FBool() {}
 111
 112     //! \brief Construct from scalar bool
 113     Simd4FBool(bool b) { simdInternal_.fill(b); }
 114
 115     /*! \brief Internal SIMD data. Implementation dependent, don't touch.
 116      *
 117      * This has to be public to enable usage in combination with static inline
 118      * functions, but it should never, EVER, be accessed by any code outside
 119      * the corresponding implementation directory since the type will depend
 120      * on the architecture.
 121      */
 122     std::array<bool, GMX_SIMD4_WIDTH> simdInternal_;
 123 };
 124
 125 /*! \brief Load 4 float values from aligned memory into SIMD4 variable.
 126  *
 127  * \param m Pointer to memory aligned to 4 elements.
 128  * \return SIMD4 variable with data loaded.
 129  */
 130 static inline Simd4Float gmx_simdcall load4(const float* m)
 131 {
 132     Simd4Float a;
 133
 134     assert(std::size_t(m) % (a.simdInternal_.size() * sizeof(float)) == 0);
 135
 136     std::copy(m, m + a.simdInternal_.size(), a.simdInternal_.begin());
 137     return a;
 138 }
 139
 140 /*! \brief Store the contents of SIMD4 float to aligned memory m.
 141  *
 142  * \param[out] m Pointer to memory, aligned to 4 elements.
 143  * \param a SIMD4 variable to store
 144  */
 145 static inline void gmx_simdcall store4(float* m, Simd4Float a)
 146 {
 147     assert(std::size_t(m) % (a.simdInternal_.size() * sizeof(float)) == 0);
 148
 149     std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
 150 }
 151
 152 /*! \brief Load SIMD4 float from unaligned memory.
 153  *
 154  * Available if \ref GMX_SIMD_HAVE_LOADU is 1.
 155  *
 156  * \param m Pointer to memory, no alignment requirement.
 157  * \return SIMD4 variable with data loaded.
 158  */
 159 static inline Simd4Float gmx_simdcall load4U(const float* m)
 160 {
 161     Simd4Float a;
 162     std::copy(m, m + a.simdInternal_.size(), a.simdInternal_.begin());
 163     return a;
 164 }
 165
 166 /*! \brief Store SIMD4 float to unaligned memory.
 167  *
 168  * Available if \ref GMX_SIMD_HAVE_STOREU is 1.
 169  *
 170  * \param[out] m Pointer to memory, no alignment requirement.
 171  * \param a SIMD4 variable to store.
 172  */
 173 static inline void gmx_simdcall store4U(float* m, Simd4Float a)
 174 {
 175     std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
 176 }
 177
 178 /*! \brief Set all SIMD4 float elements to 0.
 179  *
 180  * You should typically just call \ref gmx::setZero(), which uses proxy objects
 181  * internally to handle all types rather than adding the suffix used here.
 182  *
 183  * \return SIMD4 0.0
 184  */
 185 static inline Simd4Float gmx_simdcall simd4SetZeroF()
 186 {
 187     return Simd4Float(0.0F);
 188 }
 189
 190
 191 /*! \brief Bitwise and for two SIMD4 float variables.
 192  *
 193  * Supported if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 194  *
 195  * \param a data1
 196  * \param b data2
 197  * \return data1 & data2
 198  */
 199 static inline Simd4Float gmx_simdcall operator&(Simd4Float a, Simd4Float b)
 200 {
 201     Simd4Float res;
 202
 203     union
 204     {
 205         float        r;
 206         std::int32_t i;
 207     } conv1, conv2;
 208
 209     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 210     {
 211         conv1.r              = a.simdInternal_[i];
 212         conv2.r              = b.simdInternal_[i];
 213         conv1.i              = conv1.i & conv2.i;
 214         res.simdInternal_[i] = conv1.r;
 215     }
 216     return res;
 217 }
 218
 219
 220 /*! \brief Bitwise andnot for two SIMD4 float variables. c=(~a) & b.
 221  *
 222  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 223  *
 224  * \param a data1
 225  * \param b data2
 226  * \return (~data1) & data2
 227  */
 228 static inline Simd4Float gmx_simdcall andNot(Simd4Float a, Simd4Float b)
 229 {
 230     Simd4Float res;
 231
 232     union
 233     {
 234         float        r;
 235         std::int32_t i;
 236     } conv1, conv2;
 237
 238     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 239     {
 240         conv1.r              = a.simdInternal_[i];
 241         conv2.r              = b.simdInternal_[i];
 242         conv1.i              = ~conv1.i & conv2.i;
 243         res.simdInternal_[i] = conv1.r;
 244     }
 245     return res;
 246 }
 247
 248
 249 /*! \brief Bitwise or for two SIMD4 floats.
 250  *
 251  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 252  *
 253  * \param a data1
 254  * \param b data2
 255  * \return data1 | data2
 256  */
 257 static inline Simd4Float gmx_simdcall operator|(Simd4Float a, Simd4Float b)
 258 {
 259     Simd4Float res;
 260
 261     union
 262     {
 263         float        r;
 264         std::int32_t i;
 265     } conv1, conv2;
 266
 267     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 268     {
 269         conv1.r              = a.simdInternal_[i];
 270         conv2.r              = b.simdInternal_[i];
 271         conv1.i              = conv1.i | conv2.i;
 272         res.simdInternal_[i] = conv1.r;
 273     }
 274     return res;
 275 }
 276
 277 /*! \brief Bitwise xor for two SIMD4 float variables.
 278  *
 279  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 280  *
 281  * \param a data1
 282  * \param b data2
 283  * \return data1 ^ data2
 284  */
 285 static inline Simd4Float gmx_simdcall operator^(Simd4Float a, Simd4Float b)
 286 {
 287     Simd4Float res;
 288
 289     union
 290     {
 291         float        r;
 292         std::int32_t i;
 293     } conv1, conv2;
 294
 295     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 296     {
 297         conv1.r              = a.simdInternal_[i];
 298         conv2.r              = b.simdInternal_[i];
 299         conv1.i              = conv1.i ^ conv2.i;
 300         res.simdInternal_[i] = conv1.r;
 301     }
 302     return res;
 303 }
 304
 305 /*! \brief Add two float SIMD4 variables.
 306  *
 307  * \param a term1
 308  * \param b term2
 309  * \return a+b
 310  */
 311 static inline Simd4Float gmx_simdcall operator+(Simd4Float a, Simd4Float b)
 312 {
 313     Simd4Float res;
 314
 315     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 316     {
 317         res.simdInternal_[i] = a.simdInternal_[i] + b.simdInternal_[i];
 318     }
 319     return res;
 320 }
 321
 322 /*! \brief Subtract two SIMD4 variables.
 323  *
 324  * \param a term1
 325  * \param b term2
 326  * \return a-b
 327  */
 328 static inline Simd4Float gmx_simdcall operator-(Simd4Float a, Simd4Float b)
 329 {
 330     Simd4Float res;
 331
 332     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 333     {
 334         res.simdInternal_[i] = a.simdInternal_[i] - b.simdInternal_[i];
 335     }
 336     return res;
 337 }
 338
 339 /*! \brief SIMD4 floating-point negate.
 340  *
 341  * \param a SIMD4 floating-point value
 342  * \return -a
 343  */
 344 static inline Simd4Float gmx_simdcall operator-(Simd4Float a)
 345 {
 346     Simd4Float res;
 347
 348     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 349     {
 350         res.simdInternal_[i] = -a.simdInternal_[i];
 351     }
 352     return res;
 353 }
 354
 355 /*! \brief Multiply two SIMD4 variables.
 356  *
 357  * \param a factor1
 358  * \param b factor2
 359  * \return a*b.
 360  */
 361 static inline Simd4Float gmx_simdcall operator*(Simd4Float a, Simd4Float b)
 362 {
 363     Simd4Float res;
 364
 365     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 366     {
 367         res.simdInternal_[i] = a.simdInternal_[i] * b.simdInternal_[i];
 368     }
 369     return res;
 370 }
 371
 372 /*! \brief SIMD4 Fused-multiply-add. Result is a*b+c.
 373  *
 374  * \param a factor1
 375  * \param b factor2
 376  * \param c term
 377  * \return a*b+c
 378  */
 379 static inline Simd4Float gmx_simdcall fma(Simd4Float a, Simd4Float b, Simd4Float c)
 380 {
 381     return a * b + c;
 382 }
 383
 384 /*! \brief SIMD4 Fused-multiply-subtract. Result is a*b-c.
 385  *
 386  * \param a factor1
 387  * \param b factor2
 388  * \param c term
 389  * \return a*b-c
 390  */
 391 static inline Simd4Float gmx_simdcall fms(Simd4Float a, Simd4Float b, Simd4Float c)
 392 {
 393     return a * b - c;
 394 }
 395
 396 /*! \brief SIMD4 Fused-negated-multiply-add. Result is -a*b+c.
 397  *
 398  * \param a factor1
 399  * \param b factor2
 400  * \param c term
 401  * \return -a*b+c
 402  */
 403 static inline Simd4Float gmx_simdcall fnma(Simd4Float a, Simd4Float b, Simd4Float c)
 404 {
 405     return c - a * b;
 406 }
 407
 408 /*! \brief SIMD4 Fused-negated-multiply-subtract. Result is -a*b-c.
 409  *
 410  * \param a factor1
 411  * \param b factor2
 412  * \param c term
 413  * \return -a*b-c
 414  */
 415 static inline Simd4Float gmx_simdcall fnms(Simd4Float a, Simd4Float b, Simd4Float c)
 416 {
 417     return -a * b - c;
 418 }
 419
 420 /*! \brief SIMD4 1.0/sqrt(x) lookup.
 421  *
 422  * This is a low-level instruction that should only be called from routines
 423  * implementing the inverse square root in simd_math.h.
 424  *
 425  * \param x Argument, x>0
 426  * \return Approximation of 1/sqrt(x), accuracy is \ref GMX_SIMD_RSQRT_BITS.
 427  */
 428 static inline Simd4Float gmx_simdcall rsqrt(Simd4Float x)
 429 {
 430     Simd4Float res;
 431
 432     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 433     {
 434         res.simdInternal_[i] = 1.0F / std::sqrt(x.simdInternal_[i]);
 435     }
 436     return res;
 437 };
 438
 439
 440 /*! \brief SIMD4 Floating-point fabs().
 441  *
 442  * \param a any floating point values
 443  * \return fabs(a) for each element.
 444  */
 445 static inline Simd4Float gmx_simdcall abs(Simd4Float a)
 446 {
 447     Simd4Float res;
 448
 449     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 450     {
 451         res.simdInternal_[i] = std::abs(a.simdInternal_[i]);
 452     }
 453     return res;
 454 }
 455
 456 /*! \brief Set each SIMD4 element to the largest from two variables.
 457  *
 458  * \param a Any floating-point value
 459  * \param b Any floating-point value
 460  * \return max(a,b) for each element.
 461  */
 462 static inline Simd4Float gmx_simdcall max(Simd4Float a, Simd4Float b)
 463 {
 464     Simd4Float res;
 465
 466     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 467     {
 468         res.simdInternal_[i] = std::max(a.simdInternal_[i], b.simdInternal_[i]);
 469     }
 470     return res;
 471 }
 472
 473
 474 /*! \brief Set each SIMD4 element to the largest from two variables.
 475  *
 476  * \param a Any floating-point value
 477  * \param b Any floating-point value
 478  * \return max(a,b) for each element.
 479  */
 480 static inline Simd4Float gmx_simdcall min(Simd4Float a, Simd4Float b)
 481 {
 482     Simd4Float res;
 483
 484     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 485     {
 486         res.simdInternal_[i] = std::min(a.simdInternal_[i], b.simdInternal_[i]);
 487     }
 488     return res;
 489 }
 490
 491
 492 /*! \brief SIMD4 Round to nearest integer value (in floating-point format).
 493  *
 494  * \param a Any floating-point value
 495  * \return The nearest integer, represented in floating-point format.
 496  */
 497 static inline Simd4Float gmx_simdcall round(Simd4Float a)
 498 {
 499     Simd4Float res;
 500
 501     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 502     {
 503         res.simdInternal_[i] = std::round(a.simdInternal_[i]);
 504     }
 505     return res;
 506 }
 507
 508
 509 /*! \brief Truncate SIMD4, i.e. round towards zero - common hardware instruction.
 510  *
 511  * \param a Any floating-point value
 512  * \return Integer rounded towards zero, represented in floating-point format.
 513  *
 514  * \note This is truncation towards zero, not floor(). The reason for this
 515  * is that truncation is virtually always present as a dedicated hardware
 516  * instruction, but floor() frequently isn't.
 517  */
 518 static inline Simd4Float gmx_simdcall trunc(Simd4Float a)
 519 {
 520     Simd4Float res;
 521
 522     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 523     {
 524         res.simdInternal_[i] = std::trunc(a.simdInternal_[i]);
 525     }
 526     return res;
 527 }
 528
 529 /*! \brief Return dot product of two single precision SIMD4 variables.
 530  *
 531  * The dot product is calculated between the first three elements in the two
 532  * vectors, while the fourth is ignored. The result is returned as a scalar.
 533  *
 534  * \param a vector1
 535  * \param b vector2
 536  * \result a[0]*b[0]+a[1]*b[1]+a[2]*b[2], returned as scalar. Last element is ignored.
 537  */
 538 static inline float gmx_simdcall dotProduct(Simd4Float a, Simd4Float b)
 539 {
 540     return (a.simdInternal_[0] * b.simdInternal_[0] + a.simdInternal_[1] * b.simdInternal_[1]
 541             + a.simdInternal_[2] * b.simdInternal_[2]);
 542 }
 543
 544 /*! \brief SIMD4 float transpose
 545  *
 546  * \param[in,out] v0  Row 0 on input, column 0 on output
 547  * \param[in,out] v1  Row 1 on input, column 1 on output
 548  * \param[in,out] v2  Row 2 on input, column 2 on output
 549  * \param[in,out] v3  Row 3 on input, column 3 on output
 550  */
 551 static inline void gmx_simdcall transpose(Simd4Float* v0, Simd4Float* v1, Simd4Float* v2, Simd4Float* v3)
 552 {
 553     Simd4Float t0        = *v0;
 554     Simd4Float t1        = *v1;
 555     Simd4Float t2        = *v2;
 556     Simd4Float t3        = *v3;
 557     v0->simdInternal_[0] = t0.simdInternal_[0];
 558     v0->simdInternal_[1] = t1.simdInternal_[0];
 559     v0->simdInternal_[2] = t2.simdInternal_[0];
 560     v0->simdInternal_[3] = t3.simdInternal_[0];
 561     v1->simdInternal_[0] = t0.simdInternal_[1];
 562     v1->simdInternal_[1] = t1.simdInternal_[1];
 563     v1->simdInternal_[2] = t2.simdInternal_[1];
 564     v1->simdInternal_[3] = t3.simdInternal_[1];
 565     v2->simdInternal_[0] = t0.simdInternal_[2];
 566     v2->simdInternal_[1] = t1.simdInternal_[2];
 567     v2->simdInternal_[2] = t2.simdInternal_[2];
 568     v2->simdInternal_[3] = t3.simdInternal_[2];
 569     v3->simdInternal_[0] = t0.simdInternal_[3];
 570     v3->simdInternal_[1] = t1.simdInternal_[3];
 571     v3->simdInternal_[2] = t2.simdInternal_[3];
 572     v3->simdInternal_[3] = t3.simdInternal_[3];
 573 }
 574
 575 /*! \brief a==b for SIMD4 float
 576  *
 577  * \param a value1
 578  * \param b value2
 579  * \return Each element of the boolean will be set to true if a==b.
 580  */
 581 static inline Simd4FBool gmx_simdcall operator==(Simd4Float a, Simd4Float b)
 582 {
 583     Simd4FBool res;
 584
 585     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 586     {
 587         res.simdInternal_[i] = (a.simdInternal_[i] == b.simdInternal_[i]);
 588     }
 589     return res;
 590 }
 591
 592 /*! \brief a!=b for SIMD4 float
 593  *
 594  * \param a value1
 595  * \param b value2
 596  * \return Each element of the boolean will be set to true if a!=b.
 597  */
 598 static inline Simd4FBool gmx_simdcall operator!=(Simd4Float a, Simd4Float b)
 599 {
 600     Simd4FBool res;
 601
 602     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 603     {
 604         res.simdInternal_[i] = (a.simdInternal_[i] != b.simdInternal_[i]);
 605     }
 606     return res;
 607 }
 608
 609 /*! \brief a<b for SIMD4 float
 610  *
 611  * \param a value1
 612  * \param b value2
 613  * \return Each element of the boolean will be set to true if a<b.
 614  */
 615 static inline Simd4FBool gmx_simdcall operator<(Simd4Float a, Simd4Float b)
 616 {
 617     Simd4FBool res;
 618
 619     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 620     {
 621         res.simdInternal_[i] = (a.simdInternal_[i] < b.simdInternal_[i]);
 622     }
 623     return res;
 624 }
 625
 626
 627 /*! \brief a<=b for SIMD4 float.
 628  *
 629  * \param a value1
 630  * \param b value2
 631  * \return Each element of the boolean will be set to true if a<=b.
 632  */
 633 static inline Simd4FBool gmx_simdcall operator<=(Simd4Float a, Simd4Float b)
 634 {
 635     Simd4FBool res;
 636
 637     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 638     {
 639         res.simdInternal_[i] = (a.simdInternal_[i] <= b.simdInternal_[i]);
 640     }
 641     return res;
 642 }
 643
 644 /*! \brief Logical \a and on single precision SIMD4 booleans.
 645  *
 646  * \param a logical vars 1
 647  * \param b logical vars 2
 648  * \return For each element, the result boolean is true if a \& b are true.
 649  *
 650  * \note This is not necessarily a bitwise operation - the storage format
 651  * of booleans is implementation-dependent.
 652  */
 653 static inline Simd4FBool gmx_simdcall operator&&(Simd4FBool a, Simd4FBool b)
 654 {
 655     Simd4FBool res;
 656
 657     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 658     {
 659         res.simdInternal_[i] = (a.simdInternal_[i] && b.simdInternal_[i]);
 660     }
 661     return res;
 662 }
 663
 664 /*! \brief Logical \a or on single precision SIMD4 booleans.
 665  *
 666  * \param a logical vars 1
 667  * \param b logical vars 2
 668  * \return For each element, the result boolean is true if a or b is true.
 669  *
 670  * Note that this is not necessarily a bitwise operation - the storage format
 671  * of booleans is implementation-dependent.
 672  */
 673 static inline Simd4FBool gmx_simdcall operator||(Simd4FBool a, Simd4FBool b)
 674 {
 675     Simd4FBool res;
 676
 677     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 678     {
 679         res.simdInternal_[i] = (a.simdInternal_[i] || b.simdInternal_[i]);
 680     }
 681     return res;
 682 }
 683
 684 /*! \brief Returns non-zero if any of the boolean in SIMD4 a is True, otherwise 0.
 685  *
 686  * \param a Logical variable.
 687  * \return true if any element in a is true, otherwise false.
 688  *
 689  * The actual return value for truth will depend on the architecture,
 690  * so any non-zero value is considered truth.
 691  */
 692 static inline bool gmx_simdcall anyTrue(Simd4FBool a)
 693 {
 694     bool res = false;
 695
 696     for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
 697     {
 698         res = res || a.simdInternal_[i];
 699     }
 700     return res;
 701 }
 702
 703 /*! \brief Select from single precision SIMD4 variable where boolean is true.
 704  *
 705  * \param a Floating-point variable to select from
 706  * \param mask Boolean selector
 707  * \return  For each element, a is selected for true, 0 for false.
 708  */
 709 static inline Simd4Float gmx_simdcall selectByMask(Simd4Float a, Simd4FBool mask)
 710 {
 711     Simd4Float res;
 712
 713     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 714     {
 715         res.simdInternal_[i] = mask.simdInternal_[i] ? a.simdInternal_[i] : 0.0F;
 716     }
 717     return res;
 718 }
 719
 720 /*! \brief Select from single precision SIMD4 variable where boolean is false.
 721  *
 722  * \param a Floating-point variable to select from
 723  * \param mask Boolean selector
 724  * \return  For each element, a is selected for false, 0 for true (sic).
 725  */
 726 static inline Simd4Float gmx_simdcall selectByNotMask(Simd4Float a, Simd4FBool mask)
 727 {
 728     Simd4Float res;
 729
 730     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 731     {
 732         res.simdInternal_[i] = mask.simdInternal_[i] ? 0.0F : a.simdInternal_[i];
 733     }
 734     return res;
 735 }
 736
 737
 738 /*! \brief Vector-blend SIMD4 selection.
 739  *
 740  * \param a First source
 741  * \param b Second source
 742  * \param sel Boolean selector
 743  * \return For each element, select b if sel is true, a otherwise.
 744  */
 745 static inline Simd4Float gmx_simdcall blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
 746 {
 747     Simd4Float res;
 748
 749     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 750     {
 751         res.simdInternal_[i] = sel.simdInternal_[i] ? b.simdInternal_[i] : a.simdInternal_[i];
 752     }
 753     return res;
 754 }
 755
 756
 757 /*! \brief Return sum of all elements in SIMD4 float variable.
 758  *
 759  * \param a SIMD4 variable to reduce/sum.
 760  * \return The sum of all elements in the argument variable.
 761  *
 762  */
 763 static inline float gmx_simdcall reduce(Simd4Float a)
 764 {
 765     float sum = 0.0F;
 766
 767     for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
 768     {
 769         sum += a.simdInternal_[i];
 770     }
 771     return sum;
 772 }
 773
 774 /*! \} */
 775
 776 /*! \} */
 777 /*! \endcond */
 778
 779 } // namespace gmx
 780
 781 #endif // GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H