2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2019,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H
37 #define GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H
39 /*! \libinternal \file
41 * \brief Reference implementation, SIMD4 single precision.
43 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
45 * \ingroup module_simd
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>

#include <algorithm>
#include <array>

#include "impl_reference_definitions.h"
64 /*! \addtogroup module_simd */
67 /*! \name Constant width-4 single precision SIMD types and instructions
71 /*! \libinternal \brief SIMD4 float type.
73 * Available if \ref GMX_SIMD4_HAVE_FLOAT is 1.
75 * \note This variable cannot be placed inside other structures or classes, since
76 * some compilers (including at least clang-3.7) appear to lose the
77 * alignment. This is likely particularly severe when allocating such
78 * memory on the heap, but it occurs for stack structures too.
85 //! \brief Construct from scalar
86 Simd4Float(float f) { simdInternal_.fill(f); }
88 /*! \brief Internal SIMD data. Implementation dependent, don't touch.
90 * This has to be public to enable usage in combination with static inline
91 * functions, but it should never, EVER, be accessed by any code outside
92 * the corresponding implementation directory since the type will depend
93 * on the architecture.
95 std::array<float, GMX_SIMD4_WIDTH> simdInternal_;
98 /*! \libinternal \brief SIMD4 variable type to use for logical comparisons on floats.
100 * Available if \ref GMX_SIMD4_HAVE_FLOAT is 1.
102 * \note This variable cannot be placed inside other structures or classes, since
103 * some compilers (including at least clang-3.7) appear to lose the
104 * alignment. This is likely particularly severe when allocating such
105 * memory on the heap, but it occurs for stack structures too.
112 //! \brief Construct from scalar bool
113 Simd4FBool(bool b) { simdInternal_.fill(b); }
115 /*! \brief Internal SIMD data. Implementation dependent, don't touch.
117 * This has to be public to enable usage in combination with static inline
118 * functions, but it should never, EVER, be accessed by any code outside
119 * the corresponding implementation directory since the type will depend
120 * on the architecture.
122 std::array<bool, GMX_SIMD4_WIDTH> simdInternal_;
125 /*! \brief Load 4 float values from aligned memory into SIMD4 variable.
127 * \param m Pointer to memory aligned to 4 elements.
128 * \return SIMD4 variable with data loaded.
130 static inline Simd4Float gmx_simdcall load4(const float* m)
134 assert(std::size_t(m) % (a.simdInternal_.size() * sizeof(float)) == 0);
136 std::copy(m, m + a.simdInternal_.size(), a.simdInternal_.begin());
140 /*! \brief Store the contents of SIMD4 float to aligned memory m.
142 * \param[out] m Pointer to memory, aligned to 4 elements.
143 * \param a SIMD4 variable to store
145 static inline void gmx_simdcall store4(float* m, Simd4Float a)
147 assert(std::size_t(m) % (a.simdInternal_.size() * sizeof(float)) == 0);
149 std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
152 /*! \brief Load SIMD4 float from unaligned memory.
154 * Available if \ref GMX_SIMD_HAVE_LOADU is 1.
156 * \param m Pointer to memory, no alignment requirement.
157 * \return SIMD4 variable with data loaded.
159 static inline Simd4Float gmx_simdcall load4U(const float* m)
162 std::copy(m, m + a.simdInternal_.size(), a.simdInternal_.begin());
166 /*! \brief Store SIMD4 float to unaligned memory.
168 * Available if \ref GMX_SIMD_HAVE_STOREU is 1.
170 * \param[out] m Pointer to memory, no alignment requirement.
171 * \param a SIMD4 variable to store.
173 static inline void gmx_simdcall store4U(float* m, Simd4Float a)
175 std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
178 /*! \brief Set all SIMD4 float elements to 0.
180 * You should typically just call \ref gmx::setZero(), which uses proxy objects
181 * internally to handle all types rather than adding the suffix used here.
185 static inline Simd4Float gmx_simdcall simd4SetZeroF()
187 return Simd4Float(0.0F);
191 /*! \brief Bitwise and for two SIMD4 float variables.
193 * Supported if \ref GMX_SIMD_HAVE_LOGICAL is 1.
197 * \return data1 & data2
199 static inline Simd4Float gmx_simdcall operator&(Simd4Float a, Simd4Float b)
209 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
211 conv1.r = a.simdInternal_[i];
212 conv2.r = b.simdInternal_[i];
213 conv1.i = conv1.i & conv2.i;
214 res.simdInternal_[i] = conv1.r;
220 /*! \brief Bitwise andnot for two SIMD4 float variables. c=(~a) & b.
222 * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
226 * \return (~data1) & data2
228 static inline Simd4Float gmx_simdcall andNot(Simd4Float a, Simd4Float b)
238 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
240 conv1.r = a.simdInternal_[i];
241 conv2.r = b.simdInternal_[i];
242 conv1.i = ~conv1.i & conv2.i;
243 res.simdInternal_[i] = conv1.r;
249 /*! \brief Bitwise or for two SIMD4 floats.
251 * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
255 * \return data1 | data2
257 static inline Simd4Float gmx_simdcall operator|(Simd4Float a, Simd4Float b)
267 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
269 conv1.r = a.simdInternal_[i];
270 conv2.r = b.simdInternal_[i];
271 conv1.i = conv1.i | conv2.i;
272 res.simdInternal_[i] = conv1.r;
277 /*! \brief Bitwise xor for two SIMD4 float variables.
279 * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
283 * \return data1 ^ data2
285 static inline Simd4Float gmx_simdcall operator^(Simd4Float a, Simd4Float b)
295 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
297 conv1.r = a.simdInternal_[i];
298 conv2.r = b.simdInternal_[i];
299 conv1.i = conv1.i ^ conv2.i;
300 res.simdInternal_[i] = conv1.r;
305 /*! \brief Add two float SIMD4 variables.
311 static inline Simd4Float gmx_simdcall operator+(Simd4Float a, Simd4Float b)
315 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
317 res.simdInternal_[i] = a.simdInternal_[i] + b.simdInternal_[i];
322 /*! \brief Subtract two SIMD4 variables.
328 static inline Simd4Float gmx_simdcall operator-(Simd4Float a, Simd4Float b)
332 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
334 res.simdInternal_[i] = a.simdInternal_[i] - b.simdInternal_[i];
339 /*! \brief SIMD4 floating-point negate.
341 * \param a SIMD4 floating-point value
344 static inline Simd4Float gmx_simdcall operator-(Simd4Float a)
348 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
350 res.simdInternal_[i] = -a.simdInternal_[i];
355 /*! \brief Multiply two SIMD4 variables.
361 static inline Simd4Float gmx_simdcall operator*(Simd4Float a, Simd4Float b)
365 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
367 res.simdInternal_[i] = a.simdInternal_[i] * b.simdInternal_[i];
372 /*! \brief SIMD4 Fused-multiply-add. Result is a*b+c.
379 static inline Simd4Float gmx_simdcall fma(Simd4Float a, Simd4Float b, Simd4Float c)
384 /*! \brief SIMD4 Fused-multiply-subtract. Result is a*b-c.
391 static inline Simd4Float gmx_simdcall fms(Simd4Float a, Simd4Float b, Simd4Float c)
396 /*! \brief SIMD4 Fused-negated-multiply-add. Result is -a*b+c.
403 static inline Simd4Float gmx_simdcall fnma(Simd4Float a, Simd4Float b, Simd4Float c)
408 /*! \brief SIMD4 Fused-negated-multiply-subtract. Result is -a*b-c.
415 static inline Simd4Float gmx_simdcall fnms(Simd4Float a, Simd4Float b, Simd4Float c)
420 /*! \brief SIMD4 1.0/sqrt(x) lookup.
422 * This is a low-level instruction that should only be called from routines
423 * implementing the inverse square root in simd_math.h.
425 * \param x Argument, x>0
426 * \return Approximation of 1/sqrt(x), accuracy is \ref GMX_SIMD_RSQRT_BITS.
428 static inline Simd4Float gmx_simdcall rsqrt(Simd4Float x)
432 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
434 res.simdInternal_[i] = 1.0F / std::sqrt(x.simdInternal_[i]);
440 /*! \brief SIMD4 Floating-point fabs().
442 * \param a any floating point values
443 * \return fabs(a) for each element.
445 static inline Simd4Float gmx_simdcall abs(Simd4Float a)
449 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
451 res.simdInternal_[i] = std::abs(a.simdInternal_[i]);
456 /*! \brief Set each SIMD4 element to the largest from two variables.
458 * \param a Any floating-point value
459 * \param b Any floating-point value
460 * \return max(a,b) for each element.
462 static inline Simd4Float gmx_simdcall max(Simd4Float a, Simd4Float b)
466 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
468 res.simdInternal_[i] = std::max(a.simdInternal_[i], b.simdInternal_[i]);
474 /*! \brief Set each SIMD4 element to the largest from two variables.
476 * \param a Any floating-point value
477 * \param b Any floating-point value
478 * \return max(a,b) for each element.
480 static inline Simd4Float gmx_simdcall min(Simd4Float a, Simd4Float b)
484 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
486 res.simdInternal_[i] = std::min(a.simdInternal_[i], b.simdInternal_[i]);
492 /*! \brief SIMD4 Round to nearest integer value (in floating-point format).
494 * \param a Any floating-point value
495 * \return The nearest integer, represented in floating-point format.
497 static inline Simd4Float gmx_simdcall round(Simd4Float a)
501 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
503 res.simdInternal_[i] = std::round(a.simdInternal_[i]);
509 /*! \brief Truncate SIMD4, i.e. round towards zero - common hardware instruction.
511 * \param a Any floating-point value
512 * \return Integer rounded towards zero, represented in floating-point format.
514 * \note This is truncation towards zero, not floor(). The reason for this
515 * is that truncation is virtually always present as a dedicated hardware
516 * instruction, but floor() frequently isn't.
518 static inline Simd4Float gmx_simdcall trunc(Simd4Float a)
522 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
524 res.simdInternal_[i] = std::trunc(a.simdInternal_[i]);
529 /*! \brief Return dot product of two single precision SIMD4 variables.
531 * The dot product is calculated between the first three elements in the two
532 * vectors, while the fourth is ignored. The result is returned as a scalar.
536 * \result a[0]*b[0]+a[1]*b[1]+a[2]*b[2], returned as scalar. Last element is ignored.
538 static inline float gmx_simdcall dotProduct(Simd4Float a, Simd4Float b)
540 return (a.simdInternal_[0] * b.simdInternal_[0] + a.simdInternal_[1] * b.simdInternal_[1]
541 + a.simdInternal_[2] * b.simdInternal_[2]);
544 /*! \brief SIMD4 float transpose
546 * \param[in,out] v0 Row 0 on input, column 0 on output
547 * \param[in,out] v1 Row 1 on input, column 1 on output
548 * \param[in,out] v2 Row 2 on input, column 2 on output
549 * \param[in,out] v3 Row 3 on input, column 3 on output
551 static inline void gmx_simdcall transpose(Simd4Float* v0, Simd4Float* v1, Simd4Float* v2, Simd4Float* v3)
557 v0->simdInternal_[0] = t0.simdInternal_[0];
558 v0->simdInternal_[1] = t1.simdInternal_[0];
559 v0->simdInternal_[2] = t2.simdInternal_[0];
560 v0->simdInternal_[3] = t3.simdInternal_[0];
561 v1->simdInternal_[0] = t0.simdInternal_[1];
562 v1->simdInternal_[1] = t1.simdInternal_[1];
563 v1->simdInternal_[2] = t2.simdInternal_[1];
564 v1->simdInternal_[3] = t3.simdInternal_[1];
565 v2->simdInternal_[0] = t0.simdInternal_[2];
566 v2->simdInternal_[1] = t1.simdInternal_[2];
567 v2->simdInternal_[2] = t2.simdInternal_[2];
568 v2->simdInternal_[3] = t3.simdInternal_[2];
569 v3->simdInternal_[0] = t0.simdInternal_[3];
570 v3->simdInternal_[1] = t1.simdInternal_[3];
571 v3->simdInternal_[2] = t2.simdInternal_[3];
572 v3->simdInternal_[3] = t3.simdInternal_[3];
575 /*! \brief a==b for SIMD4 float
579 * \return Each element of the boolean will be set to true if a==b.
581 static inline Simd4FBool gmx_simdcall operator==(Simd4Float a, Simd4Float b)
585 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
587 res.simdInternal_[i] = (a.simdInternal_[i] == b.simdInternal_[i]);
592 /*! \brief a!=b for SIMD4 float
596 * \return Each element of the boolean will be set to true if a!=b.
598 static inline Simd4FBool gmx_simdcall operator!=(Simd4Float a, Simd4Float b)
602 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
604 res.simdInternal_[i] = (a.simdInternal_[i] != b.simdInternal_[i]);
609 /*! \brief a<b for SIMD4 float
613 * \return Each element of the boolean will be set to true if a<b.
615 static inline Simd4FBool gmx_simdcall operator<(Simd4Float a, Simd4Float b)
619 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
621 res.simdInternal_[i] = (a.simdInternal_[i] < b.simdInternal_[i]);
627 /*! \brief a<=b for SIMD4 float.
631 * \return Each element of the boolean will be set to true if a<=b.
633 static inline Simd4FBool gmx_simdcall operator<=(Simd4Float a, Simd4Float b)
637 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
639 res.simdInternal_[i] = (a.simdInternal_[i] <= b.simdInternal_[i]);
644 /*! \brief Logical \a and on single precision SIMD4 booleans.
646 * \param a logical vars 1
647 * \param b logical vars 2
648 * \return For each element, the result boolean is true if a \& b are true.
650 * \note This is not necessarily a bitwise operation - the storage format
651 * of booleans is implementation-dependent.
653 static inline Simd4FBool gmx_simdcall operator&&(Simd4FBool a, Simd4FBool b)
657 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
659 res.simdInternal_[i] = (a.simdInternal_[i] && b.simdInternal_[i]);
664 /*! \brief Logical \a or on single precision SIMD4 booleans.
666 * \param a logical vars 1
667 * \param b logical vars 2
668 * \return For each element, the result boolean is true if a or b is true.
670 * Note that this is not necessarily a bitwise operation - the storage format
671 * of booleans is implementation-dependent.
673 static inline Simd4FBool gmx_simdcall operator||(Simd4FBool a, Simd4FBool b)
677 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
679 res.simdInternal_[i] = (a.simdInternal_[i] || b.simdInternal_[i]);
684 /*! \brief Returns non-zero if any of the boolean in SIMD4 a is True, otherwise 0.
686 * \param a Logical variable.
687 * \return true if any element in a is true, otherwise false.
689 * The actual return value for truth will depend on the architecture,
690 * so any non-zero value is considered truth.
692 static inline bool gmx_simdcall anyTrue(Simd4FBool a)
696 for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
698 res = res || a.simdInternal_[i];
703 /*! \brief Select from single precision SIMD4 variable where boolean is true.
705 * \param a Floating-point variable to select from
706 * \param mask Boolean selector
707 * \return For each element, a is selected for true, 0 for false.
709 static inline Simd4Float gmx_simdcall selectByMask(Simd4Float a, Simd4FBool mask)
713 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
715 res.simdInternal_[i] = mask.simdInternal_[i] ? a.simdInternal_[i] : 0.0F;
720 /*! \brief Select from single precision SIMD4 variable where boolean is false.
722 * \param a Floating-point variable to select from
723 * \param mask Boolean selector
724 * \return For each element, a is selected for false, 0 for true (sic).
726 static inline Simd4Float gmx_simdcall selectByNotMask(Simd4Float a, Simd4FBool mask)
730 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
732 res.simdInternal_[i] = mask.simdInternal_[i] ? 0.0F : a.simdInternal_[i];
738 /*! \brief Vector-blend SIMD4 selection.
740 * \param a First source
741 * \param b Second source
742 * \param sel Boolean selector
743 * \return For each element, select b if sel is true, a otherwise.
745 static inline Simd4Float gmx_simdcall blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
749 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
751 res.simdInternal_[i] = sel.simdInternal_[i] ? b.simdInternal_[i] : a.simdInternal_[i];
757 /*! \brief Return sum of all elements in SIMD4 float variable.
759 * \param a SIMD4 variable to reduce/sum.
760 * \return The sum of all elements in the argument variable.
763 static inline float gmx_simdcall reduce(Simd4Float a)
767 for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
769 sum += a.simdInternal_[i];
781 #endif // GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H