src/gromacs/simd/impl_reference/impl_reference_simd4_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H
  37 #define GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H
  38
  39 /*! \libinternal \file
  40  *
  41  * \brief Reference implementation, SIMD4 single precision.
  42  *
  43  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  44  *
  45  * \ingroup module_simd
  46  */
  47
#include "config.h"

#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include <algorithm>
#include <array>

#include "impl_reference_definitions.h"
  59
  60 namespace gmx
  61 {
  62
  63 /*! \cond libapi */
  64 /*! \addtogroup module_simd */
  65 /*! \{ */
  66
  67 /*! \name Constant width-4 single precision SIMD types and instructions
  68  * \{
  69  */
  70
  71 /*! \libinternal \brief SIMD4 float type.
  72  *
  73  * Available if \ref GMX_SIMD4_HAVE_FLOAT is 1.
  74  *
  75  * \note This variable cannot be placed inside other structures or classes, since
  76  *       some compilers (including at least clang-3.7) appear to lose the
  77  *       alignment. This is likely particularly severe when allocating such
  78  *       memory on the heap, but it occurs for stack structures too.
  79  */
  80 class Simd4Float
  81 {
  82 public:
  83     Simd4Float() {}
  84
  85     //! \brief Construct from scalar
  86     Simd4Float(float f) { simdInternal_.fill(f); }
  87
  88     /*! \brief Internal SIMD data. Implementation dependent, don't touch.
  89      *
  90      * This has to be public to enable usage in combination with static inline
  91      * functions, but it should never, EVER, be accessed by any code outside
  92      * the corresponding implementation directory since the type will depend
  93      * on the architecture.
  94      */
  95     std::array<float, GMX_SIMD4_WIDTH> simdInternal_;
  96 };
  97
  98 /*! \libinternal  \brief SIMD4 variable type to use for logical comparisons on floats.
  99  *
 100  * Available if \ref GMX_SIMD4_HAVE_FLOAT is 1.
 101  *
 102  * \note This variable cannot be placed inside other structures or classes, since
 103  *       some compilers (including at least clang-3.7) appear to lose the
 104  *       alignment. This is likely particularly severe when allocating such
 105  *       memory on the heap, but it occurs for stack structures too.
 106  */
 107 class Simd4FBool
 108 {
 109 public:
 110     Simd4FBool() {}
 111
 112     //! \brief Construct from scalar bool
 113     Simd4FBool(bool b) { simdInternal_.fill(b); }
 114
 115     /*! \brief Internal SIMD data. Implementation dependent, don't touch.
 116      *
 117      * This has to be public to enable usage in combination with static inline
 118      * functions, but it should never, EVER, be accessed by any code outside
 119      * the corresponding implementation directory since the type will depend
 120      * on the architecture.
 121      */
 122     std::array<bool, GMX_SIMD4_WIDTH> simdInternal_;
 123 };
 124
 125 /*! \brief Load 4 float values from aligned memory into SIMD4 variable.
 126  *
 127  * \param m Pointer to memory aligned to 4 elements.
 128  * \return SIMD4 variable with data loaded.
 129  */
 130 static inline Simd4Float gmx_simdcall load4(const float* m)
 131 {
 132     Simd4Float a;
 133
 134     assert(std::size_t(m) % (a.simdInternal_.size() * sizeof(float)) == 0);
 135
 136     std::copy(m, m + a.simdInternal_.size(), a.simdInternal_.begin());
 137     return a;
 138 }
 139
 140 /*! \brief Store the contents of SIMD4 float to aligned memory m.
 141  *
 142  * \param[out] m Pointer to memory, aligned to 4 elements.
 143  * \param a SIMD4 variable to store
 144  */
 145 static inline void gmx_simdcall store4(float* m, Simd4Float a)
 146 {
 147     assert(std::size_t(m) % (a.simdInternal_.size() * sizeof(float)) == 0);
 148
 149     std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
 150 }
 151
 152 /*! \brief Load SIMD4 float from unaligned memory.
 153  *
 154  * Available if \ref GMX_SIMD_HAVE_LOADU is 1.
 155  *
 156  * \param m Pointer to memory, no alignment requirement.
 157  * \return SIMD4 variable with data loaded.
 158  */
 159 static inline Simd4Float gmx_simdcall load4U(const float* m)
 160 {
 161     Simd4Float a;
 162     std::copy(m, m + a.simdInternal_.size(), a.simdInternal_.begin());
 163     return a;
 164 }
 165
 166 /*! \brief Store SIMD4 float to unaligned memory.
 167  *
 168  * Available if \ref GMX_SIMD_HAVE_STOREU is 1.
 169  *
 170  * \param[out] m Pointer to memory, no alignment requirement.
 171  * \param a SIMD4 variable to store.
 172  */
 173 static inline void gmx_simdcall store4U(float* m, Simd4Float a)
 174 {
 175     std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
 176 }
 177
 178 /*! \brief Set all SIMD4 float elements to 0.
 179  *
 180  * You should typically just call \ref gmx::setZero(), which uses proxy objects
 181  * internally to handle all types rather than adding the suffix used here.
 182  *
 183  * \return SIMD4 0.0
 184  */
 185 static inline Simd4Float gmx_simdcall simd4SetZeroF()
 186 {
 187     return Simd4Float(0.0F);
 188 }
 189
 190
 191 /*! \brief Bitwise and for two SIMD4 float variables.
 192  *
 193  * Supported if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 194  *
 195  * \param a data1
 196  * \param b data2
 197  * \return data1 & data2
 198  */
 199 static inline Simd4Float gmx_simdcall operator&(Simd4Float a, Simd4Float b)
 200 {
 201     Simd4Float res;
 202
 203     union {
 204         float        r;
 205         std::int32_t i;
 206     } conv1, conv2;
 207
 208     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 209     {
 210         conv1.r              = a.simdInternal_[i];
 211         conv2.r              = b.simdInternal_[i];
 212         conv1.i              = conv1.i & conv2.i;
 213         res.simdInternal_[i] = conv1.r;
 214     }
 215     return res;
 216 }
 217
 218
 219 /*! \brief Bitwise andnot for two SIMD4 float variables. c=(~a) & b.
 220  *
 221  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 222  *
 223  * \param a data1
 224  * \param b data2
 225  * \return (~data1) & data2
 226  */
 227 static inline Simd4Float gmx_simdcall andNot(Simd4Float a, Simd4Float b)
 228 {
 229     Simd4Float res;
 230
 231     union {
 232         float        r;
 233         std::int32_t i;
 234     } conv1, conv2;
 235
 236     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 237     {
 238         conv1.r              = a.simdInternal_[i];
 239         conv2.r              = b.simdInternal_[i];
 240         conv1.i              = ~conv1.i & conv2.i;
 241         res.simdInternal_[i] = conv1.r;
 242     }
 243     return res;
 244 }
 245
 246
 247 /*! \brief Bitwise or for two SIMD4 floats.
 248  *
 249  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 250  *
 251  * \param a data1
 252  * \param b data2
 253  * \return data1 | data2
 254  */
 255 static inline Simd4Float gmx_simdcall operator|(Simd4Float a, Simd4Float b)
 256 {
 257     Simd4Float res;
 258
 259     union {
 260         float        r;
 261         std::int32_t i;
 262     } conv1, conv2;
 263
 264     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 265     {
 266         conv1.r              = a.simdInternal_[i];
 267         conv2.r              = b.simdInternal_[i];
 268         conv1.i              = conv1.i | conv2.i;
 269         res.simdInternal_[i] = conv1.r;
 270     }
 271     return res;
 272 }
 273
 274 /*! \brief Bitwise xor for two SIMD4 float variables.
 275  *
 276  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 277  *
 278  * \param a data1
 279  * \param b data2
 280  * \return data1 ^ data2
 281  */
 282 static inline Simd4Float gmx_simdcall operator^(Simd4Float a, Simd4Float b)
 283 {
 284     Simd4Float res;
 285
 286     union {
 287         float        r;
 288         std::int32_t i;
 289     } conv1, conv2;
 290
 291     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 292     {
 293         conv1.r              = a.simdInternal_[i];
 294         conv2.r              = b.simdInternal_[i];
 295         conv1.i              = conv1.i ^ conv2.i;
 296         res.simdInternal_[i] = conv1.r;
 297     }
 298     return res;
 299 }
 300
 301 /*! \brief Add two float SIMD4 variables.
 302  *
 303  * \param a term1
 304  * \param b term2
 305  * \return a+b
 306  */
 307 static inline Simd4Float gmx_simdcall operator+(Simd4Float a, Simd4Float b)
 308 {
 309     Simd4Float res;
 310
 311     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 312     {
 313         res.simdInternal_[i] = a.simdInternal_[i] + b.simdInternal_[i];
 314     }
 315     return res;
 316 }
 317
 318 /*! \brief Subtract two SIMD4 variables.
 319  *
 320  * \param a term1
 321  * \param b term2
 322  * \return a-b
 323  */
 324 static inline Simd4Float gmx_simdcall operator-(Simd4Float a, Simd4Float b)
 325 {
 326     Simd4Float res;
 327
 328     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 329     {
 330         res.simdInternal_[i] = a.simdInternal_[i] - b.simdInternal_[i];
 331     }
 332     return res;
 333 }
 334
 335 /*! \brief SIMD4 floating-point negate.
 336  *
 337  * \param a SIMD4 floating-point value
 338  * \return -a
 339  */
 340 static inline Simd4Float gmx_simdcall operator-(Simd4Float a)
 341 {
 342     Simd4Float res;
 343
 344     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 345     {
 346         res.simdInternal_[i] = -a.simdInternal_[i];
 347     }
 348     return res;
 349 }
 350
 351 /*! \brief Multiply two SIMD4 variables.
 352  *
 353  * \param a factor1
 354  * \param b factor2
 355  * \return a*b.
 356  */
 357 static inline Simd4Float gmx_simdcall operator*(Simd4Float a, Simd4Float b)
 358 {
 359     Simd4Float res;
 360
 361     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 362     {
 363         res.simdInternal_[i] = a.simdInternal_[i] * b.simdInternal_[i];
 364     }
 365     return res;
 366 }
 367
 368 /*! \brief SIMD4 Fused-multiply-add. Result is a*b+c.
 369  *
 370  * \param a factor1
 371  * \param b factor2
 372  * \param c term
 373  * \return a*b+c
 374  */
 375 static inline Simd4Float gmx_simdcall fma(Simd4Float a, Simd4Float b, Simd4Float c)
 376 {
 377     return a * b + c;
 378 }
 379
 380 /*! \brief SIMD4 Fused-multiply-subtract. Result is a*b-c.
 381  *
 382  * \param a factor1
 383  * \param b factor2
 384  * \param c term
 385  * \return a*b-c
 386  */
 387 static inline Simd4Float gmx_simdcall fms(Simd4Float a, Simd4Float b, Simd4Float c)
 388 {
 389     return a * b - c;
 390 }
 391
 392 /*! \brief SIMD4 Fused-negated-multiply-add. Result is -a*b+c.
 393  *
 394  * \param a factor1
 395  * \param b factor2
 396  * \param c term
 397  * \return -a*b+c
 398  */
 399 static inline Simd4Float gmx_simdcall fnma(Simd4Float a, Simd4Float b, Simd4Float c)
 400 {
 401     return c - a * b;
 402 }
 403
 404 /*! \brief SIMD4 Fused-negated-multiply-subtract. Result is -a*b-c.
 405  *
 406  * \param a factor1
 407  * \param b factor2
 408  * \param c term
 409  * \return -a*b-c
 410  */
 411 static inline Simd4Float gmx_simdcall fnms(Simd4Float a, Simd4Float b, Simd4Float c)
 412 {
 413     return -a * b - c;
 414 }
 415
 416 /*! \brief SIMD4 1.0/sqrt(x) lookup.
 417  *
 418  * This is a low-level instruction that should only be called from routines
 419  * implementing the inverse square root in simd_math.h.
 420  *
 421  * \param x Argument, x>0
 422  * \return Approximation of 1/sqrt(x), accuracy is \ref GMX_SIMD_RSQRT_BITS.
 423  */
 424 static inline Simd4Float gmx_simdcall rsqrt(Simd4Float x)
 425 {
 426     Simd4Float res;
 427
 428     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 429     {
 430         res.simdInternal_[i] = 1.0F / std::sqrt(x.simdInternal_[i]);
 431     }
 432     return res;
 433 };
 434
 435
 436 /*! \brief SIMD4 Floating-point fabs().
 437  *
 438  * \param a any floating point values
 439  * \return fabs(a) for each element.
 440  */
 441 static inline Simd4Float gmx_simdcall abs(Simd4Float a)
 442 {
 443     Simd4Float res;
 444
 445     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 446     {
 447         res.simdInternal_[i] = std::abs(a.simdInternal_[i]);
 448     }
 449     return res;
 450 }
 451
 452 /*! \brief Set each SIMD4 element to the largest from two variables.
 453  *
 454  * \param a Any floating-point value
 455  * \param b Any floating-point value
 456  * \return max(a,b) for each element.
 457  */
 458 static inline Simd4Float gmx_simdcall max(Simd4Float a, Simd4Float b)
 459 {
 460     Simd4Float res;
 461
 462     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 463     {
 464         res.simdInternal_[i] = std::max(a.simdInternal_[i], b.simdInternal_[i]);
 465     }
 466     return res;
 467 }
 468
 469
 470 /*! \brief Set each SIMD4 element to the largest from two variables.
 471  *
 472  * \param a Any floating-point value
 473  * \param b Any floating-point value
 474  * \return max(a,b) for each element.
 475  */
 476 static inline Simd4Float gmx_simdcall min(Simd4Float a, Simd4Float b)
 477 {
 478     Simd4Float res;
 479
 480     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 481     {
 482         res.simdInternal_[i] = std::min(a.simdInternal_[i], b.simdInternal_[i]);
 483     }
 484     return res;
 485 }
 486
 487
 488 /*! \brief SIMD4 Round to nearest integer value (in floating-point format).
 489  *
 490  * \param a Any floating-point value
 491  * \return The nearest integer, represented in floating-point format.
 492  */
 493 static inline Simd4Float gmx_simdcall round(Simd4Float a)
 494 {
 495     Simd4Float res;
 496
 497     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 498     {
 499         res.simdInternal_[i] = std::round(a.simdInternal_[i]);
 500     }
 501     return res;
 502 }
 503
 504
 505 /*! \brief Truncate SIMD4, i.e. round towards zero - common hardware instruction.
 506  *
 507  * \param a Any floating-point value
 508  * \return Integer rounded towards zero, represented in floating-point format.
 509  *
 510  * \note This is truncation towards zero, not floor(). The reason for this
 511  * is that truncation is virtually always present as a dedicated hardware
 512  * instruction, but floor() frequently isn't.
 513  */
 514 static inline Simd4Float gmx_simdcall trunc(Simd4Float a)
 515 {
 516     Simd4Float res;
 517
 518     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 519     {
 520         res.simdInternal_[i] = std::trunc(a.simdInternal_[i]);
 521     }
 522     return res;
 523 }
 524
 525 /*! \brief Return dot product of two single precision SIMD4 variables.
 526  *
 527  * The dot product is calculated between the first three elements in the two
 528  * vectors, while the fourth is ignored. The result is returned as a scalar.
 529  *
 530  * \param a vector1
 531  * \param b vector2
 532  * \result a[0]*b[0]+a[1]*b[1]+a[2]*b[2], returned as scalar. Last element is ignored.
 533  */
 534 static inline float gmx_simdcall dotProduct(Simd4Float a, Simd4Float b)
 535 {
 536     return (a.simdInternal_[0] * b.simdInternal_[0] + a.simdInternal_[1] * b.simdInternal_[1]
 537             + a.simdInternal_[2] * b.simdInternal_[2]);
 538 }
 539
 540 /*! \brief SIMD4 float transpose
 541  *
 542  * \param[in,out] v0  Row 0 on input, column 0 on output
 543  * \param[in,out] v1  Row 1 on input, column 1 on output
 544  * \param[in,out] v2  Row 2 on input, column 2 on output
 545  * \param[in,out] v3  Row 3 on input, column 3 on output
 546  */
 547 static inline void gmx_simdcall transpose(Simd4Float* v0, Simd4Float* v1, Simd4Float* v2, Simd4Float* v3)
 548 {
 549     Simd4Float t0        = *v0;
 550     Simd4Float t1        = *v1;
 551     Simd4Float t2        = *v2;
 552     Simd4Float t3        = *v3;
 553     v0->simdInternal_[0] = t0.simdInternal_[0];
 554     v0->simdInternal_[1] = t1.simdInternal_[0];
 555     v0->simdInternal_[2] = t2.simdInternal_[0];
 556     v0->simdInternal_[3] = t3.simdInternal_[0];
 557     v1->simdInternal_[0] = t0.simdInternal_[1];
 558     v1->simdInternal_[1] = t1.simdInternal_[1];
 559     v1->simdInternal_[2] = t2.simdInternal_[1];
 560     v1->simdInternal_[3] = t3.simdInternal_[1];
 561     v2->simdInternal_[0] = t0.simdInternal_[2];
 562     v2->simdInternal_[1] = t1.simdInternal_[2];
 563     v2->simdInternal_[2] = t2.simdInternal_[2];
 564     v2->simdInternal_[3] = t3.simdInternal_[2];
 565     v3->simdInternal_[0] = t0.simdInternal_[3];
 566     v3->simdInternal_[1] = t1.simdInternal_[3];
 567     v3->simdInternal_[2] = t2.simdInternal_[3];
 568     v3->simdInternal_[3] = t3.simdInternal_[3];
 569 }
 570
 571 /*! \brief a==b for SIMD4 float
 572  *
 573  * \param a value1
 574  * \param b value2
 575  * \return Each element of the boolean will be set to true if a==b.
 576  */
 577 static inline Simd4FBool gmx_simdcall operator==(Simd4Float a, Simd4Float b)
 578 {
 579     Simd4FBool res;
 580
 581     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 582     {
 583         res.simdInternal_[i] = (a.simdInternal_[i] == b.simdInternal_[i]);
 584     }
 585     return res;
 586 }
 587
 588 /*! \brief a!=b for SIMD4 float
 589  *
 590  * \param a value1
 591  * \param b value2
 592  * \return Each element of the boolean will be set to true if a!=b.
 593  */
 594 static inline Simd4FBool gmx_simdcall operator!=(Simd4Float a, Simd4Float b)
 595 {
 596     Simd4FBool res;
 597
 598     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 599     {
 600         res.simdInternal_[i] = (a.simdInternal_[i] != b.simdInternal_[i]);
 601     }
 602     return res;
 603 }
 604
 605 /*! \brief a<b for SIMD4 float
 606  *
 607  * \param a value1
 608  * \param b value2
 609  * \return Each element of the boolean will be set to true if a<b.
 610  */
 611 static inline Simd4FBool gmx_simdcall operator<(Simd4Float a, Simd4Float b)
 612 {
 613     Simd4FBool res;
 614
 615     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 616     {
 617         res.simdInternal_[i] = (a.simdInternal_[i] < b.simdInternal_[i]);
 618     }
 619     return res;
 620 }
 621
 622
 623 /*! \brief a<=b for SIMD4 float.
 624  *
 625  * \param a value1
 626  * \param b value2
 627  * \return Each element of the boolean will be set to true if a<=b.
 628  */
 629 static inline Simd4FBool gmx_simdcall operator<=(Simd4Float a, Simd4Float b)
 630 {
 631     Simd4FBool res;
 632
 633     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 634     {
 635         res.simdInternal_[i] = (a.simdInternal_[i] <= b.simdInternal_[i]);
 636     }
 637     return res;
 638 }
 639
 640 /*! \brief Logical \a and on single precision SIMD4 booleans.
 641  *
 642  * \param a logical vars 1
 643  * \param b logical vars 2
 644  * \return For each element, the result boolean is true if a \& b are true.
 645  *
 646  * \note This is not necessarily a bitwise operation - the storage format
 647  * of booleans is implementation-dependent.
 648  */
 649 static inline Simd4FBool gmx_simdcall operator&&(Simd4FBool a, Simd4FBool b)
 650 {
 651     Simd4FBool res;
 652
 653     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 654     {
 655         res.simdInternal_[i] = (a.simdInternal_[i] && b.simdInternal_[i]);
 656     }
 657     return res;
 658 }
 659
 660 /*! \brief Logical \a or on single precision SIMD4 booleans.
 661  *
 662  * \param a logical vars 1
 663  * \param b logical vars 2
 664  * \return For each element, the result boolean is true if a or b is true.
 665  *
 666  * Note that this is not necessarily a bitwise operation - the storage format
 667  * of booleans is implementation-dependent.
 668  */
 669 static inline Simd4FBool gmx_simdcall operator||(Simd4FBool a, Simd4FBool b)
 670 {
 671     Simd4FBool res;
 672
 673     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 674     {
 675         res.simdInternal_[i] = (a.simdInternal_[i] || b.simdInternal_[i]);
 676     }
 677     return res;
 678 }
 679
 680 /*! \brief Returns non-zero if any of the boolean in SIMD4 a is True, otherwise 0.
 681  *
 682  * \param a Logical variable.
 683  * \return true if any element in a is true, otherwise false.
 684  *
 685  * The actual return value for truth will depend on the architecture,
 686  * so any non-zero value is considered truth.
 687  */
 688 static inline bool gmx_simdcall anyTrue(Simd4FBool a)
 689 {
 690     bool res = false;
 691
 692     for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
 693     {
 694         res = res || a.simdInternal_[i];
 695     }
 696     return res;
 697 }
 698
 699 /*! \brief Select from single precision SIMD4 variable where boolean is true.
 700  *
 701  * \param a Floating-point variable to select from
 702  * \param mask Boolean selector
 703  * \return  For each element, a is selected for true, 0 for false.
 704  */
 705 static inline Simd4Float gmx_simdcall selectByMask(Simd4Float a, Simd4FBool mask)
 706 {
 707     Simd4Float res;
 708
 709     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 710     {
 711         res.simdInternal_[i] = mask.simdInternal_[i] ? a.simdInternal_[i] : 0.0F;
 712     }
 713     return res;
 714 }
 715
 716 /*! \brief Select from single precision SIMD4 variable where boolean is false.
 717  *
 718  * \param a Floating-point variable to select from
 719  * \param mask Boolean selector
 720  * \return  For each element, a is selected for false, 0 for true (sic).
 721  */
 722 static inline Simd4Float gmx_simdcall selectByNotMask(Simd4Float a, Simd4FBool mask)
 723 {
 724     Simd4Float res;
 725
 726     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 727     {
 728         res.simdInternal_[i] = mask.simdInternal_[i] ? 0.0F : a.simdInternal_[i];
 729     }
 730     return res;
 731 }
 732
 733
 734 /*! \brief Vector-blend SIMD4 selection.
 735  *
 736  * \param a First source
 737  * \param b Second source
 738  * \param sel Boolean selector
 739  * \return For each element, select b if sel is true, a otherwise.
 740  */
 741 static inline Simd4Float gmx_simdcall blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
 742 {
 743     Simd4Float res;
 744
 745     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 746     {
 747         res.simdInternal_[i] = sel.simdInternal_[i] ? b.simdInternal_[i] : a.simdInternal_[i];
 748     }
 749     return res;
 750 }
 751
 752
 753 /*! \brief Return sum of all elements in SIMD4 float variable.
 754  *
 755  * \param a SIMD4 variable to reduce/sum.
 756  * \return The sum of all elements in the argument variable.
 757  *
 758  */
 759 static inline float gmx_simdcall reduce(Simd4Float a)
 760 {
 761     float sum = 0.0F;
 762
 763     for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
 764     {
 765         sum += a.simdInternal_[i];
 766     }
 767     return sum;
 768 }
 769
 770 /*! \} */
 771
 772 /*! \} */
 773 /*! \endcond */
 774
 775 } // namespace gmx
 776
 777 #endif // GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H