/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD_DOUBLE_H
#define GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD_DOUBLE_H

#include <cstdint>

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vsx_definitions.h"

namespace gmx
{
class SimdDouble
{
    public:
        SimdDouble() {}
        // gcc-4.9 does not recognize that we use the parameter
        SimdDouble(double gmx_unused d) : simdInternal_(vec_splats(d)) {}
        // Internal utility constructor to simplify return statements
        SimdDouble(__vector double simd) : simdInternal_(simd) {}

        __vector double simdInternal_;
};

class SimdDInt32
{
    public:
        SimdDInt32() {}
        // gcc-4.9 does not recognize that we use the parameter
        SimdDInt32(std::int32_t gmx_unused i) : simdInternal_(vec_splats(i)) {}
        // Internal utility constructor to simplify return statements
        SimdDInt32(__vector signed int simd) : simdInternal_(simd) {}

        __vector signed int simdInternal_;
};

class SimdDBool
{
    public:
        SimdDBool() {}
        SimdDBool(bool b) : simdInternal_(reinterpret_cast<__vector vsxBool long long>(vec_splats( b ? 0xFFFFFFFFFFFFFFFFULL : 0))) {}
        // Internal utility constructor to simplify return statements
        SimdDBool(__vector vsxBool long long simd) : simdInternal_(simd) {}

        __vector vsxBool long long simdInternal_;
};

class SimdDIBool
{
    public:
        SimdDIBool() {}
        SimdDIBool(bool b) : simdInternal_(reinterpret_cast<__vector vsxBool int>(vec_splats( b ? 0xFFFFFFFF : 0))) {}
        // Internal utility constructor to simplify return statements
        SimdDIBool(__vector vsxBool int simd) : simdInternal_(simd) {}

        __vector vsxBool int simdInternal_;
};
// The VSX load & store operations are a bit of a mess. The interface is different
// for xlc version 12, xlc version 13, and gcc. Long-term, IBM recommends
// simply using pointer dereferencing both for aligned and unaligned loads.
// That's nice, but unfortunately xlc still bugs out when the pointer is
// not aligned. Sticking to vec_xl/vec_xst isn't a solution either, since
// that appears to be buggy for some _aligned_ loads :-)
//
// For now, we use pointer dereferencing for all aligned load/stores, and
// for unaligned ones with gcc. On xlc we use vec_xlw4/vec_xstw4 for
// unaligned memory operations. The latest docs recommend using the overloaded
// vec_xl/vec_xst, but that is not supported on xlc version 12. We'll
// revisit things once xlc is a bit more stable - for now you probably want
// to stick to gcc...
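// A minimal usage sketch of the load/store functions defined below (hypothetical
// caller code, not part of this header):
//
//     alignas(16) double abuf[2] = { 1.0, 2.0 };
//     SimdDouble         v       = simdLoad(abuf); // aligned load: plain pointer dereference
//     store(abuf, v);                              // aligned store
//     double ubuf[3];
//     storeU(ubuf + 1, v);                         // unaligned store: vec_xstw4 on xlc, dereference on gcc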
static inline SimdDouble gmx_simdcall
simdLoad(const double *m)
{
    return { *reinterpret_cast<const __vector double *>(m) };
}

static inline void gmx_simdcall
store(double *m, SimdDouble a)
{
    *reinterpret_cast<__vector double *>(m) = a.simdInternal_;
}
static inline SimdDouble gmx_simdcall
simdLoadU(const double *m)
{
#if defined(__ibmxl__) || defined(__xlC__)
    return { vec_xlw4(0, const_cast<double *>(m)) };
#else
    return { *reinterpret_cast<const __vector double *>(m) };
#endif
}

static inline void gmx_simdcall
storeU(double *m, SimdDouble a)
{
#if defined(__ibmxl__) || defined(__xlC__)
    vec_xstw4(a.simdInternal_, 0, m);
#else
    *reinterpret_cast<__vector double *>(m) = a.simdInternal_;
#endif
}
static inline SimdDouble gmx_simdcall
setZeroD()
{
    return { vec_splats(0.0) };
}
static inline SimdDInt32 gmx_simdcall
simdLoadDI(const std::int32_t * m)
{
    __vector signed int          t0, t1;
    const __vector unsigned char perm = { 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19, 16, 17, 18, 19 };
    t0 = vec_splats(m[0]);
    t1 = vec_splats(m[1]);
    return { vec_perm(t0, t1, perm) };
}
// gcc-4.9 does not understand that arguments to vec_extract() are used
static inline void gmx_simdcall
store(std::int32_t * m, SimdDInt32 gmx_unused x)
{
    m[0] = vec_extract(x.simdInternal_, 0);
    m[1] = vec_extract(x.simdInternal_, 2);
}
static inline SimdDInt32 gmx_simdcall
simdLoadUDI(const std::int32_t *m)
{
    return simdLoadDI(m);
}

static inline void gmx_simdcall
storeU(std::int32_t * m, SimdDInt32 a)
{
    store(m, a);
}
static inline SimdDInt32 gmx_simdcall
setZeroDI()
{
    return { vec_splats(static_cast<int>(0)) };
}
// gcc-4.9 does not detect that vec_extract() uses its argument
template <int index>
static inline std::int32_t gmx_simdcall
extract(SimdDInt32 gmx_unused a)
{
    return vec_extract(a.simdInternal_, 2*index);
}
static inline SimdDouble gmx_simdcall
operator&(SimdDouble a, SimdDouble b)
{
    return { vec_and(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall
andNot(SimdDouble a, SimdDouble b)
{
    return { vec_andc(b.simdInternal_, a.simdInternal_) };
}

static inline SimdDouble gmx_simdcall
operator|(SimdDouble a, SimdDouble b)
{
    return { vec_or(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall
operator^(SimdDouble a, SimdDouble b)
{
    return { vec_xor(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall
operator+(SimdDouble a, SimdDouble b)
{
    return { vec_add(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall
operator-(SimdDouble a, SimdDouble b)
{
    return { vec_sub(a.simdInternal_, b.simdInternal_) };
}
static inline SimdDouble gmx_simdcall
operator-(SimdDouble x)
{
    return { -x.simdInternal_ };
}
static inline SimdDouble gmx_simdcall
operator*(SimdDouble a, SimdDouble b)
{
    return { vec_mul(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall
fma(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { vec_madd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline SimdDouble gmx_simdcall
fms(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { vec_msub(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline SimdDouble gmx_simdcall
fnma(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { vec_nmsub(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline SimdDouble gmx_simdcall
fnms(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { vec_nmadd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}
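// Note on the semantics of the fused operations above and the VSX intrinsics they map to:
//   fma(a,b,c)  =  a*b + c    (vec_madd)
//   fms(a,b,c)  =  a*b - c    (vec_msub)
//   fnma(a,b,c) = -a*b + c    (vec_nmsub)
//   fnms(a,b,c) = -a*b - c    (vec_nmadd)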
static inline SimdDouble gmx_simdcall
rsqrt(SimdDouble x)
{
    return { vec_rsqrte(x.simdInternal_) };
}

static inline SimdDouble gmx_simdcall
rcp(SimdDouble x)
{
    return { vec_re(x.simdInternal_) };
}
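// Note: vec_rsqrte() and vec_re() only produce hardware estimates of 1/sqrt(x)
// and 1/x; callers (e.g. the SIMD math layer) are expected to refine the result
// with Newton-Raphson iterations when full precision is needed.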
static inline SimdDouble gmx_simdcall
maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
{
    return { vec_add(a.simdInternal_, vec_and(b.simdInternal_, reinterpret_cast<__vector double>(m.simdInternal_))) };
}

static inline SimdDouble gmx_simdcall
maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
{
    SimdDouble prod = a * b;

    return { vec_and(prod.simdInternal_, reinterpret_cast<__vector double>(m.simdInternal_)) };
}

static inline SimdDouble gmx_simdcall
maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
{
    SimdDouble prod = fma(a, b, c);

    return { vec_and(prod.simdInternal_, reinterpret_cast<__vector double>(m.simdInternal_)) };
}
static inline SimdDouble gmx_simdcall
maskzRsqrt(SimdDouble x, SimdDBool m)
{
    // Use 1.0 for masked-out entries so the estimate does not trigger FP exceptions
    x.simdInternal_ = vec_sel(vec_splats(1.0), x.simdInternal_, m.simdInternal_);

    return { vec_and(vec_rsqrte(x.simdInternal_), reinterpret_cast<__vector double>(m.simdInternal_)) };
}

static inline SimdDouble gmx_simdcall
maskzRcp(SimdDouble x, SimdDBool m)
{
    // Use 1.0 for masked-out entries so the estimate does not trigger FP exceptions
    x.simdInternal_ = vec_sel(vec_splats(1.0), x.simdInternal_, m.simdInternal_);

    return { vec_and(vec_re(x.simdInternal_), reinterpret_cast<__vector double>(m.simdInternal_)) };
}
static inline SimdDouble gmx_simdcall
abs(SimdDouble x)
{
    return { vec_abs(x.simdInternal_) };
}
static inline SimdDouble gmx_simdcall
max(SimdDouble a, SimdDouble b)
{
    return { vec_max(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall
min(SimdDouble a, SimdDouble b)
{
    return { vec_min(a.simdInternal_, b.simdInternal_) };
}
static inline SimdDouble gmx_simdcall
round(SimdDouble x)
{
#if defined(__GNUC__) && !defined(__ibmxl__) && !defined(__xlC__)
    // gcc up to at least version 4.9 does not have vec_round() in double precision - use inline asm
    __vector double res;
    __asm__ ("xvrdpi %x0,%x1" : "=wd" (res) : "wd" (x.simdInternal_));
    return { res };
#else
    return { vec_round(x.simdInternal_) };
#endif
}

static inline SimdDouble gmx_simdcall
trunc(SimdDouble x)
{
    return { vec_trunc(x.simdInternal_) };
}
static inline SimdDouble
frexp(SimdDouble value, SimdDInt32 * exponent)
{
    const __vector double     exponentMask = reinterpret_cast<__vector double>(vec_splats(0x7FF0000000000000ULL));
    const __vector signed int exponentBias = vec_splats(1022);
    const __vector double     half         = vec_splats(0.5);
    __vector signed int       iExponent;

    iExponent = reinterpret_cast<__vector signed int>(vec_and(value.simdInternal_, exponentMask));
    // The exponent is in the upper half of each double (corresponding to elements 1 and 3).
    // First shift it down by 52-32=20 bits, and then permute to swap element 0 with 1 and element 2 with 3.
    // For big endian the halves are in the opposite order, so there we simply skip the swap.
    iExponent = vec_sr(iExponent, vec_splats(20U));
#ifndef __BIG_ENDIAN__
    const __vector unsigned char perm = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
    iExponent = vec_perm(iExponent, iExponent, perm);
#endif
    iExponent               = vec_sub(iExponent, exponentBias);
    exponent->simdInternal_ = iExponent;

    return { vec_or(vec_andc(value.simdInternal_, exponentMask), half) };
}
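// Worked example: for value = 8.0 the biased exponent field is 1026, so frexp()
// stores 1026 - 1022 = 4 in *exponent and returns a fraction of 0.5 (the mantissa
// of 8.0 with its exponent field replaced by that of 0.5), and 0.5 * 2^4 == 8.0
// as required.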
static inline SimdDouble
ldexp(SimdDouble value, SimdDInt32 exponent)
{
    const __vector signed int    exponentBias = vec_splats(1023);
    __vector signed int          iExponent;
#ifdef __BIG_ENDIAN__
    const __vector unsigned char perm = {0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 16, 17, 18, 19};
#else
    const __vector unsigned char perm = {16, 17, 18, 19, 0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11};
#endif

    iExponent = vec_add(exponent.simdInternal_, exponentBias);
    // The exponent is now present in pairs of integers: 0011.
    // Elements 0/2 already correspond to the upper half of each double,
    // so we only need to shift by another 52-32=20 bits.
    // The remaining elements are set to zero.
    iExponent = vec_sl(iExponent, vec_splats(20U));
    iExponent = vec_perm(iExponent, vec_splats(0), perm);

    return { vec_mul(value.simdInternal_, reinterpret_cast<__vector double>(iExponent)) };
}
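// Worked example: ldexp(0.75, 3) adds the bias (3 + 1023 = 1026), shifts that value
// into the exponent field of a double (yielding the constant 2^3 = 8.0), and
// multiplies: 0.75 * 8.0 = 6.0. No overflow or underflow checking is performed.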
static inline double gmx_simdcall
reduce(SimdDouble x)
{
    const __vector unsigned char perm = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };

#if defined(__ibmxl__) || defined(__xlC__)
    /* old xlc version 12 does not understand vec_perm() with double arguments */
    x.simdInternal_ = vec_add(x.simdInternal_,
                              reinterpret_cast<__vector double>(vec_perm(reinterpret_cast<__vector signed int>(x.simdInternal_),
                                                                         reinterpret_cast<__vector signed int>(x.simdInternal_), perm)));
#else
    x.simdInternal_ = vec_add(x.simdInternal_, vec_perm(x.simdInternal_, x.simdInternal_, perm));
#endif
    // The permutation swaps the two 64-bit halves, so after the add both elements hold the sum
    return vec_extract(x.simdInternal_, 0);
}
static inline SimdDBool gmx_simdcall
operator==(SimdDouble a, SimdDouble b)
{
    return { vec_cmpeq(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDBool gmx_simdcall
operator!=(SimdDouble a, SimdDouble b)
{
    return { reinterpret_cast<__vector vsxBool long long>(vec_or(reinterpret_cast<__vector signed int>(vec_cmpgt(a.simdInternal_, b.simdInternal_)),
                                                                 reinterpret_cast<__vector signed int>(vec_cmplt(a.simdInternal_, b.simdInternal_)))) };
}

static inline SimdDBool gmx_simdcall
operator<(SimdDouble a, SimdDouble b)
{
    return { vec_cmplt(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDBool gmx_simdcall
operator<=(SimdDouble a, SimdDouble b)
{
    return { vec_cmple(a.simdInternal_, b.simdInternal_) };
}
520 testBits(SimdDouble a)
522 #ifdef __POWER8_VECTOR__
524 vec_cmpgt(reinterpret_cast<__vector unsigned long long>(a.simdInternal_), vec_splats(0ULL))
528 reinterpret_cast<__vector vsxBool long long>(vec_nor(reinterpret_cast<__vector signed int>(vec_cmpeq(a.simdInternal_, vec_splats(0.0))), vec_splats(0)))
static inline SimdDBool gmx_simdcall
operator&&(SimdDBool a, SimdDBool b)
{
    return { reinterpret_cast<__vector vsxBool long long>(vec_and(reinterpret_cast<__vector signed int>(a.simdInternal_), reinterpret_cast<__vector signed int>(b.simdInternal_))) };
}

static inline SimdDBool gmx_simdcall
operator||(SimdDBool a, SimdDBool b)
{
    return { reinterpret_cast<__vector vsxBool long long>(vec_or(reinterpret_cast<__vector signed int>(a.simdInternal_), reinterpret_cast<__vector signed int>(b.simdInternal_))) };
}
static inline bool gmx_simdcall
anyTrue(SimdDBool a)
{
    return vec_any_ne(reinterpret_cast<__vector vsxBool int>(a.simdInternal_), reinterpret_cast<__vector vsxBool int>(vec_splats(0)));
}
static inline SimdDouble gmx_simdcall
selectByMask(SimdDouble a, SimdDBool m)
{
    return { vec_and(a.simdInternal_, reinterpret_cast<__vector double>(m.simdInternal_)) };
}

static inline SimdDouble gmx_simdcall
selectByNotMask(SimdDouble a, SimdDBool m)
{
    return { vec_andc(a.simdInternal_, reinterpret_cast<__vector double>(m.simdInternal_)) };
}

static inline SimdDouble gmx_simdcall
blend(SimdDouble a, SimdDouble b, SimdDBool sel)
{
    return { vec_sel(a.simdInternal_, b.simdInternal_, sel.simdInternal_) };
}
static inline SimdDInt32 gmx_simdcall
operator<<(SimdDInt32 a, int n)
{
    return { vec_sl(a.simdInternal_, vec_splats(static_cast<unsigned int>(n))) };
}

static inline SimdDInt32 gmx_simdcall
operator>>(SimdDInt32 a, int n)
{
    return { vec_sr(a.simdInternal_, vec_splats(static_cast<unsigned int>(n))) };
}

static inline SimdDInt32 gmx_simdcall
operator&(SimdDInt32 a, SimdDInt32 b)
{
    return { vec_and(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall
andNot(SimdDInt32 a, SimdDInt32 b)
{
    return { vec_andc(b.simdInternal_, a.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall
operator|(SimdDInt32 a, SimdDInt32 b)
{
    return { vec_or(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall
operator^(SimdDInt32 a, SimdDInt32 b)
{
    return { vec_xor(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall
operator+(SimdDInt32 a, SimdDInt32 b)
{
    return { vec_add(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall
operator-(SimdDInt32 a, SimdDInt32 b)
{
    return { vec_sub(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall
operator*(SimdDInt32 a, SimdDInt32 b)
{
    return { a.simdInternal_ * b.simdInternal_ };
}

static inline SimdDIBool gmx_simdcall
operator==(SimdDInt32 a, SimdDInt32 b)
{
    return { vec_cmpeq(a.simdInternal_, b.simdInternal_) };
}
static inline SimdDIBool gmx_simdcall
testBits(SimdDInt32 a)
{
    return { vec_cmpgt(reinterpret_cast<__vector unsigned int>(a.simdInternal_), vec_splats(0U)) };
}

static inline SimdDIBool gmx_simdcall
operator<(SimdDInt32 a, SimdDInt32 b)
{
    return { vec_cmplt(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall
operator&&(SimdDIBool a, SimdDIBool b)
{
    return { vec_and(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall
operator||(SimdDIBool a, SimdDIBool b)
{
    return { vec_or(a.simdInternal_, b.simdInternal_) };
}
static inline bool gmx_simdcall
anyTrue(SimdDIBool a)
{
    return vec_any_ne(a.simdInternal_, reinterpret_cast<__vector vsxBool int>(vec_splats(0)));
}
static inline SimdDInt32 gmx_simdcall
selectByMask(SimdDInt32 a, SimdDIBool m)
{
    return { vec_and(a.simdInternal_, reinterpret_cast<__vector signed int>(m.simdInternal_)) };
}

static inline SimdDInt32 gmx_simdcall
selectByNotMask(SimdDInt32 a, SimdDIBool m)
{
    return { vec_andc(a.simdInternal_, reinterpret_cast<__vector signed int>(m.simdInternal_)) };
}

static inline SimdDInt32 gmx_simdcall
blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
{
    return { vec_sel(a.simdInternal_, b.simdInternal_, sel.simdInternal_) };
}
static inline SimdDInt32 gmx_simdcall
cvttR2I(SimdDouble a)
{
#if defined(__GNUC__) && !defined(__ibmxl__) && !defined(__xlC__)
    // gcc up to at least version 4.9 is missing intrinsics for converting double to/from int - use inline asm
    const __vector unsigned char perm = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
    __vector double              ix;

    __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (ix) : "wd" (a.simdInternal_));

    return { reinterpret_cast<__vector signed int>(vec_perm(ix, ix, perm)) };
#else
    return { vec_cts(a.simdInternal_, 0) };
#endif
}
static inline SimdDInt32 gmx_simdcall
cvtR2I(SimdDouble a)
{
    return cvttR2I(round(a));
}
static inline SimdDouble gmx_simdcall
cvtI2R(SimdDInt32 a)
{
#if defined(__GNUC__) && !defined(__ibmxl__) && !defined(__xlC__)
    // gcc up to at least version 4.9 is missing intrinsics for converting double to/from int - use inline asm
    const __vector unsigned char perm = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
    __vector double              x;

    a.simdInternal_ = vec_perm(a.simdInternal_, a.simdInternal_, perm);
    __asm__ ("xvcvsxwdp %x0,%x1" : "=wd" (x) : "wa" (a.simdInternal_));

    return { x };
#else
    return { vec_ctd(a.simdInternal_, 0) };
#endif
}
static inline SimdDIBool gmx_simdcall
cvtB2IB(SimdDBool a)
{
    return { reinterpret_cast<__vector vsxBool int>(a.simdInternal_) };
}

static inline SimdDBool gmx_simdcall
cvtIB2B(SimdDIBool a)
{
    return { reinterpret_cast<__vector vsxBool long long>(a.simdInternal_) };
}
static inline void gmx_simdcall
cvtF2DD(SimdFloat f, SimdDouble *d0, SimdDouble *d1)
{
    __vector float fA, fB;
    fA = vec_mergeh(f.simdInternal_, f.simdInternal_); /* 0011 */
    fB = vec_mergel(f.simdInternal_, f.simdInternal_); /* 2233 */

#if defined(__GNUC__) && !defined(__ibmxl__) && !defined(__xlC__)
    // gcc-4.9 is missing double-to-float/float-to-double conversions.
    __asm__ ("xvcvspdp %x0,%x1" : "=wd" (d0->simdInternal_) : "wf" (fA));
    __asm__ ("xvcvspdp %x0,%x1" : "=wd" (d1->simdInternal_) : "wf" (fB));
#else
    d0->simdInternal_ = vec_cvf(fA); /* 01 */
    d1->simdInternal_ = vec_cvf(fB); /* 23 */
#endif
}
static inline SimdFloat gmx_simdcall
cvtDD2F(SimdDouble d0, SimdDouble d1)
{
    __vector float fA, fB, fC, fD, fE;

#if defined(__GNUC__) && !defined(__ibmxl__) && !defined(__xlC__)
    // gcc-4.9 is missing double-to-float/float-to-double conversions.
    __asm__ ("xvcvdpsp %x0,%x1" : "=wf" (fA) : "wd" (d0.simdInternal_));
    __asm__ ("xvcvdpsp %x0,%x1" : "=wf" (fB) : "wd" (d1.simdInternal_));
#else
    fA = vec_cvf(d0.simdInternal_); /* 0x1x */
    fB = vec_cvf(d1.simdInternal_); /* 2x3x */
#endif
    fC = vec_mergeh(fA, fB); /* 02xx */
    fD = vec_mergel(fA, fB); /* 13xx */
    fE = vec_mergeh(fC, fD); /* 0123 */
    return { fE };
}
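// Data-flow sketch for the two conversions above: cvtF2DD takes one float vector
// {f0,f1,f2,f3} and widens it into two double vectors {f0,f1} and {f2,f3};
// cvtDD2F is the inverse, narrowing {d0,d1} and {d2,d3} into a single float
// vector {d0,d1,d2,d3} (with the usual loss of precision).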
static inline SimdDouble gmx_simdcall
copysign(SimdDouble x, SimdDouble y)
{
#if defined(__GNUC__) && !defined(__ibmxl__) && !defined(__xlC__)
    __vector double res;
    __asm__ ("xvcpsgndp %x0,%x1,%x2" : "=wd" (res) : "wd" (y.simdInternal_), "wd" (x.simdInternal_));
    return { res };
#else
    return { vec_cpsgn(y.simdInternal_, x.simdInternal_) };
#endif
}

}      // namespace gmx

#endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD_DOUBLE_H