2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H
37 #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H
45 #include "impl_arm_neon_asimd_simd_float.h"
// Members of the SimdDouble wrapper class (class head/footer not visible in
// this chunk). Wraps one NEON float64x2_t, i.e. two double-precision lanes.
// Broadcast constructor: both lanes set to d.
55 SimdDouble(double d) : simdInternal_(vdupq_n_f64(d)) {}
57 // Internal utility constructor to simplify return statements
58 SimdDouble(float64x2_t simd) : simdInternal_(simd) {}
60 float64x2_t simdInternal_;
// Members of SimdDInt32 (class head/footer not visible in this chunk): the
// integer type paired with SimdDouble. One NEON int32x2_t = two int32 lanes.
// Broadcast constructor: both lanes set to i.
68 SimdDInt32(std::int32_t i) : simdInternal_(vdup_n_s32(i)) {}
70 // Internal utility constructor to simplify return statements
71 SimdDInt32(int32x2_t simd) : simdInternal_(simd) {}
73 int32x2_t simdInternal_;
// Members of SimdDBool (class head/footer not visible in this chunk): boolean
// mask for SimdDouble. Each 64-bit lane is all-ones (true) or all-zero (false).
// Broadcast constructor from a scalar bool.
81 SimdDBool(bool b) : simdInternal_(vdupq_n_u64( b ? 0xFFFFFFFFFFFFFFFF : 0)) {}
83 // Internal utility constructor to simplify return statements
84 SimdDBool(uint64x2_t simd) : simdInternal_(simd) {}
86 uint64x2_t simdInternal_;
// Members of SimdDIBool (class head/footer not visible in this chunk): boolean
// mask for SimdDInt32. Each 32-bit lane is all-ones (true) or all-zero (false).
// Broadcast constructor from a scalar bool.
94 SimdDIBool(bool b) : simdInternal_(vdup_n_u32( b ? 0xFFFFFFFF : 0)) {}
96 // Internal utility constructor to simplify return statements
97 SimdDIBool(uint32x2_t simd) : simdInternal_(simd) {}
99 uint32x2_t simdInternal_;
// Aligned load of two doubles; m must be 16-byte aligned (checked in debug).
102 static inline SimdDouble gmx_simdcall
103 simdLoad(const double *m)
105 assert(std::size_t(m) % 16 == 0);
// Aligned store of both double lanes; m must be 16-byte aligned.
111 static inline void gmx_simdcall
112 store(double *m, SimdDouble a)
114 assert(std::size_t(m) % 16 == 0);
115 vst1q_f64(m, a.simdInternal_);
// Unaligned load: no alignment requirement on m.
118 static inline SimdDouble gmx_simdcall
119 simdLoadU(const double *m)
// Unaligned store: no alignment requirement on m.
126 static inline void gmx_simdcall
127 storeU(double *m, SimdDouble a)
129 vst1q_f64(m, a.simdInternal_);
// NOTE(review): the name/body lines of this function are missing from this
// chunk (original numbering jumps 132->140) -- presumably setZeroD(); verify
// against the upstream file.
132 static inline SimdDouble gmx_simdcall
// Aligned load of two int32; m must be 8-byte aligned (checked in debug).
140 static inline SimdDInt32 gmx_simdcall
141 simdLoadDI(const std::int32_t * m)
143 assert(std::size_t(m) % 8 == 0);
// Aligned store of both int32 lanes; m must be 8-byte aligned.
149 static inline void gmx_simdcall
150 store(std::int32_t * m, SimdDInt32 a)
152 assert(std::size_t(m) % 8 == 0);
153 vst1_s32(m, a.simdInternal_);
// Unaligned int32 load.
156 static inline SimdDInt32 gmx_simdcall
157 simdLoadUDI(const std::int32_t *m)
// Unaligned int32 store.
164 static inline void gmx_simdcall
165 storeU(std::int32_t * m, SimdDInt32 a)
167 vst1_s32(m, a.simdInternal_);
// NOTE(review): name/body lines missing from this chunk (numbering jumps
// 170->178) -- presumably setZeroDI(); verify against the upstream file.
170 static inline SimdDInt32 gmx_simdcall
// Extract lane 'index' (compile-time constant, 0 or 1) as a scalar int32.
178 template<int index> gmx_simdcall
179 static inline std::int32_t
180 extract(SimdDInt32 a)
182 return vget_lane_s32(a.simdInternal_, index);
// Bitwise AND on the raw double bit patterns (reinterpreted as int64 lanes,
// since NEON has no FP bitwise ops).
185 static inline SimdDouble gmx_simdcall
186 operator&(SimdDouble a, SimdDouble b)
189 float64x2_t(vandq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_)))
// andNot(a, b) computes (~a) & b: note vbicq's operand order is
// (kept-value, clear-mask), hence b first.
193 static inline SimdDouble gmx_simdcall
194 andNot(SimdDouble a, SimdDouble b)
197 float64x2_t(vbicq_s64(int64x2_t(b.simdInternal_), int64x2_t(a.simdInternal_)))
// Bitwise OR of the raw bit patterns.
201 static inline SimdDouble gmx_simdcall
202 operator|(SimdDouble a, SimdDouble b)
205 float64x2_t(vorrq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_)))
// Bitwise XOR of the raw bit patterns.
209 static inline SimdDouble gmx_simdcall
210 operator^(SimdDouble a, SimdDouble b)
213 float64x2_t(veorq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_)))
// Lane-wise addition.
217 static inline SimdDouble gmx_simdcall
218 operator+(SimdDouble a, SimdDouble b)
221 vaddq_f64(a.simdInternal_, b.simdInternal_)
// Lane-wise subtraction.
225 static inline SimdDouble gmx_simdcall
226 operator-(SimdDouble a, SimdDouble b)
229 vsubq_f64(a.simdInternal_, b.simdInternal_)
// Unary negation.
233 static inline SimdDouble gmx_simdcall
234 operator-(SimdDouble x)
237 vnegq_f64(x.simdInternal_)
// Lane-wise multiplication.
241 static inline SimdDouble gmx_simdcall
242 operator*(SimdDouble a, SimdDouble b)
245 vmulq_f64(a.simdInternal_, b.simdInternal_)
// fma(a,b,c) = a*b + c. vfmaq_f64(acc, x, y) computes acc + x*y, so the
// accumulator c goes first.
249 static inline SimdDouble gmx_simdcall
250 fma(SimdDouble a, SimdDouble b, SimdDouble c)
253 vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_)
// fms(a,b,c) = a*b - c: vfmsq_f64 yields c - a*b, negated to a*b - c.
257 static inline SimdDouble gmx_simdcall
258 fms(SimdDouble a, SimdDouble b, SimdDouble c)
261 vnegq_f64(vfmsq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_))
// fnma(a,b,c) = c - a*b (direct vfmsq_f64).
265 static inline SimdDouble gmx_simdcall
266 fnma(SimdDouble a, SimdDouble b, SimdDouble c)
269 vfmsq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_)
// fnms(a,b,c) = -a*b - c: negate of c + a*b.
273 static inline SimdDouble gmx_simdcall
274 fnms(SimdDouble a, SimdDouble b, SimdDouble c)
277 vnegq_f64(vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_))
// Low-precision 1/sqrt(x) estimate (hardware lookup table). The name line is
// missing from this chunk (numbering jump 281->285) -- presumably rsqrt(x).
281 static inline SimdDouble gmx_simdcall
285 vrsqrteq_f64(x.simdInternal_)
// One Newton-Raphson iteration refining an rsqrt estimate lu for input x:
// vrsqrtsq_f64 computes (3 - lu*lu*x)/2, multiplied back onto lu.
289 static inline SimdDouble gmx_simdcall
290 rsqrtIter(SimdDouble lu, SimdDouble x)
293 vmulq_f64(lu.simdInternal_, vrsqrtsq_f64(vmulq_f64(lu.simdInternal_, lu.simdInternal_), x.simdInternal_))
// Low-precision 1/x estimate; name line missing (numbering jump 297->301) --
// presumably rcp(x).
297 static inline SimdDouble gmx_simdcall
301 vrecpeq_f64(x.simdInternal_)
// One Newton-Raphson iteration refining a reciprocal estimate lu for input x:
// vrecpsq_f64 computes (2 - lu*x), multiplied back onto lu.
305 static inline SimdDouble gmx_simdcall
306 rcpIter(SimdDouble lu, SimdDouble x)
309 vmulq_f64(lu.simdInternal_, vrecpsq_f64(lu.simdInternal_, x.simdInternal_))
// a + (m ? b : 0): the addend is b with masked-out lanes zeroed by the
// bitwise AND with the all-ones/all-zero mask m.
313 static inline SimdDouble gmx_simdcall
314 maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
316 float64x2_t addend = float64x2_t(vandq_u64(uint64x2_t(b.simdInternal_), m.simdInternal_));
319 vaddq_f64(a.simdInternal_, addend)
// m ? a*b : 0 -- multiply first, then zero masked-out lanes.
323 static inline SimdDouble gmx_simdcall
324 maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
326 float64x2_t prod = vmulq_f64(a.simdInternal_, b.simdInternal_);
328 float64x2_t(vandq_u64(uint64x2_t(prod), m.simdInternal_))
// m ? a*b + c : 0 -- fused multiply-add, then zero masked-out lanes.
332 static inline SimdDouble gmx_simdcall
333 maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
335 float64x2_t prod = vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_);
338 float64x2_t(vandq_u64(uint64x2_t(prod), m.simdInternal_))
// m ? rsqrt-estimate(x) : 0. Masked-out lanes are first replaced by 1.0 so
// the estimate never operates on values (e.g. 0) that would raise FP
// exceptions in debug builds; the final AND zeroes those lanes anyway.
342 static inline SimdDouble gmx_simdcall
343 maskzRsqrt(SimdDouble x, SimdDBool m)
345 // The result will always be correct since we mask the result with m, but
346 // for debug builds we also want to make sure not to generate FP exceptions
// Use a double literal here: 1.0f forced an implicit float->double promotion
// for the double-lane broadcast vdupq_n_f64 (value is identical).
348 x.simdInternal_ = vbslq_f64(m.simdInternal_, x.simdInternal_, vdupq_n_f64(1.0));
351 float64x2_t(vandq_u64(uint64x2_t(vrsqrteq_f64(x.simdInternal_)), m.simdInternal_))
// m ? reciprocal-estimate(x) : 0. Masked-out lanes are first replaced by 1.0
// so the estimate never sees inputs that would raise FP exceptions in debug
// builds; the final AND zeroes those lanes anyway.
355 static inline SimdDouble gmx_simdcall
356 maskzRcp(SimdDouble x, SimdDBool m)
358 // The result will always be correct since we mask the result with m, but
359 // for debug builds we also want to make sure not to generate FP exceptions
// Use a double literal here: 1.0f forced an implicit float->double promotion
// for the double-lane broadcast vdupq_n_f64 (value is identical).
361 x.simdInternal_ = vbslq_f64(m.simdInternal_, x.simdInternal_, vdupq_n_f64(1.0));
364 float64x2_t(vandq_u64(uint64x2_t(vrecpeq_f64(x.simdInternal_)), m.simdInternal_))
// Lane-wise absolute value (name line missing from this chunk, numbering
// jump 368->372 -- presumably abs(x)).
368 static inline SimdDouble gmx_simdcall
372 vabsq_f64( x.simdInternal_ )
// Lane-wise maximum.
376 static inline SimdDouble gmx_simdcall
377 max(SimdDouble a, SimdDouble b)
380 vmaxq_f64(a.simdInternal_, b.simdInternal_)
// Lane-wise minimum.
384 static inline SimdDouble gmx_simdcall
385 min(SimdDouble a, SimdDouble b)
388 vminq_f64(a.simdInternal_, b.simdInternal_)
// Round to nearest integer value, ties to even (vrndnq); name line missing
// (numbering jump 392->396) -- presumably round(x).
392 static inline SimdDouble gmx_simdcall
396 vrndnq_f64(x.simdInternal_)
// Round toward zero (vrndq); name line missing (numbering jump 400->404) --
// presumably trunc(x).
400 static inline SimdDouble gmx_simdcall
404 vrndq_f64( x.simdInternal_ )
// frexp: decompose each lane into mantissa in [0.5,1) and integer exponent,
// so that mantissa * 2^exponent == value (same convention as std::frexp).
408 static inline SimdDouble
409 frexp(SimdDouble value, SimdDInt32 * exponent)
// Bit masks for the IEEE-754 double exponent field and the sign+mantissa bits.
411 const float64x2_t exponentMask = float64x2_t( vdupq_n_s64(0x7FF0000000000000LL) );
412 const float64x2_t mantissaMask = float64x2_t( vdupq_n_s64(0x800FFFFFFFFFFFFFLL) );
414 const int64x2_t exponentBias = vdupq_n_s64(1022); // add 1 to make our definition identical to frexp()
415 const float64x2_t half = vdupq_n_f64(0.5);
// Isolate the exponent bits, shift them down past the 52 mantissa bits, and
// remove the bias.
418 iExponent = vandq_s64( int64x2_t(value.simdInternal_), int64x2_t(exponentMask) );
419 iExponent = vsubq_s64(vshrq_n_s64(iExponent, 52), exponentBias);
// Narrow the two 64-bit exponents into the 32-bit SimdDInt32 lanes.
420 exponent->simdInternal_ = vmovn_s64(iExponent);
// Mantissa: keep sign+fraction bits and OR in the exponent pattern of 0.5.
423 float64x2_t(vorrq_s64(vandq_s64(int64x2_t(value.simdInternal_), int64x2_t(mantissaMask)), int64x2_t(half)))
// ldexp: value * 2^exponent. Builds 2^exponent directly as a double bit
// pattern: widen the int32 exponents to 64 bits, add the IEEE-754 bias
// (1023) and shift into the exponent field, then multiply.
// NOTE(review): no clamping -- exponents outside the representable range
// produce an invalid bit pattern rather than saturating; verify callers.
427 static inline SimdDouble
428 ldexp(SimdDouble value, SimdDInt32 exponent)
430 const int64x2_t exponentBias = vdupq_n_s64(1023);
433 iExponent = vmovl_s32(exponent.simdInternal_);
434 iExponent = vshlq_n_s64(vaddq_s64(iExponent, exponentBias), 52);
437 vmulq_f64(value.simdInternal_, float64x2_t(iExponent))
// Horizontal sum of the two lanes: pairwise add folds both lanes into each
// position, then lane 0 is extracted. Name line missing from this chunk
// (numbering jump 441->444) -- presumably reduce(SimdDouble a).
441 static inline double gmx_simdcall
444 float64x2_t b = vpaddq_f64(a.simdInternal_, a.simdInternal_);
445 return vgetq_lane_f64(b, 0);
// Lane-wise equality; result lanes are all-ones/all-zero masks.
448 static inline SimdDBool gmx_simdcall
449 operator==(SimdDouble a, SimdDouble b)
452 vceqq_f64(a.simdInternal_, b.simdInternal_)
// Lane-wise inequality: NOT of the equality mask. NEON has no 64-bit NOT,
// so the mask is reinterpreted as 32-bit lanes for vmvnq (safe because mask
// lanes are homogeneous all-ones/all-zero).
456 static inline SimdDBool gmx_simdcall
457 operator!=(SimdDouble a, SimdDouble b)
460 vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a.simdInternal_, b.simdInternal_))))
// Lane-wise less-than.
464 static inline SimdDBool gmx_simdcall
465 operator<(SimdDouble a, SimdDouble b)
468 vcltq_f64(a.simdInternal_, b.simdInternal_)
// Lane-wise less-or-equal.
472 static inline SimdDBool gmx_simdcall
473 operator<=(SimdDouble a, SimdDouble b)
476 vcleq_f64(a.simdInternal_, b.simdInternal_)
// True for lanes whose bit pattern is nonzero (vtst: AND with itself != 0).
480 static inline SimdDBool gmx_simdcall
481 testBits(SimdDouble a)
484 vtstq_s64( int64x2_t(a.simdInternal_), int64x2_t(a.simdInternal_) )
// Logical AND of two masks (bitwise AND is exact for all-ones/all-zero lanes).
488 static inline SimdDBool gmx_simdcall
489 operator&&(SimdDBool a, SimdDBool b)
492 vandq_u64(a.simdInternal_, b.simdInternal_)
// Logical OR of two masks.
496 static inline SimdDBool gmx_simdcall
497 operator||(SimdDBool a, SimdDBool b)
500 vorrq_u64(a.simdInternal_, b.simdInternal_)
// True if any lane of the mask is set: horizontal max over the mask viewed
// as 32-bit lanes. Name line missing from this chunk (numbering jump
// 504->507) -- presumably anyTrue(SimdDBool a).
504 static inline bool gmx_simdcall
507 return (vmaxvq_u32((uint32x4_t)(a.simdInternal_)) != 0);
// m ? a : 0 -- bitwise AND of a with the all-ones/all-zero mask.
510 static inline SimdDouble gmx_simdcall
511 selectByMask(SimdDouble a, SimdDBool m)
514 float64x2_t(vandq_u64(uint64x2_t(a.simdInternal_), m.simdInternal_))
// m ? 0 : a -- bit-clear of a with the mask.
518 static inline SimdDouble gmx_simdcall
519 selectByNotMask(SimdDouble a, SimdDBool m)
522 float64x2_t(vbicq_u64(uint64x2_t(a.simdInternal_), m.simdInternal_))
// sel ? b : a, per lane (vbsl picks from its second arg where the mask is set).
526 static inline SimdDouble gmx_simdcall
527 blend(SimdDouble a, SimdDouble b, SimdDBool sel)
530 vbslq_f64(sel.simdInternal_, b.simdInternal_, a.simdInternal_)
// Left shift by a runtime count. The count is clamped to 32 to avoid the
// undefined behavior of shifting by >= the lane width.
534 static inline SimdDInt32 gmx_simdcall
535 operator<<(SimdDInt32 a, int n)
538 vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? 32 : n))
// Right shift by a runtime count: NEON vshl with a negative count shifts
// right; count clamped to -32 for the same reason as above.
542 static inline SimdDInt32 gmx_simdcall
543 operator>>(SimdDInt32 a, int n)
546 vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? -32 : -n))
// Bitwise AND on int32 lanes.
550 static inline SimdDInt32 gmx_simdcall
551 operator&(SimdDInt32 a, SimdDInt32 b)
554 vand_s32(a.simdInternal_, b.simdInternal_)
// andNot(a, b) computes (~a) & b (vbic operand order: kept-value first).
558 static inline SimdDInt32 gmx_simdcall
559 andNot(SimdDInt32 a, SimdDInt32 b)
562 vbic_s32(b.simdInternal_, a.simdInternal_)
// Bitwise OR.
566 static inline SimdDInt32 gmx_simdcall
567 operator|(SimdDInt32 a, SimdDInt32 b)
570 vorr_s32(a.simdInternal_, b.simdInternal_)
// Bitwise XOR.
574 static inline SimdDInt32 gmx_simdcall
575 operator^(SimdDInt32 a, SimdDInt32 b)
578 veor_s32(a.simdInternal_, b.simdInternal_)
// Lane-wise addition.
582 static inline SimdDInt32 gmx_simdcall
583 operator+(SimdDInt32 a, SimdDInt32 b)
586 vadd_s32(a.simdInternal_, b.simdInternal_)
// Lane-wise subtraction.
590 static inline SimdDInt32 gmx_simdcall
591 operator-(SimdDInt32 a, SimdDInt32 b)
594 vsub_s32(a.simdInternal_, b.simdInternal_)
// Lane-wise multiplication (low 32 bits of the product).
598 static inline SimdDInt32 gmx_simdcall
599 operator*(SimdDInt32 a, SimdDInt32 b)
602 vmul_s32(a.simdInternal_, b.simdInternal_)
// Lane-wise integer equality; result lanes are all-ones/all-zero masks.
606 static inline SimdDIBool gmx_simdcall
607 operator==(SimdDInt32 a, SimdDInt32 b)
610 vceq_s32(a.simdInternal_, b.simdInternal_)
// True for lanes with any bit set (vtst: AND with itself != 0).
614 static inline SimdDIBool gmx_simdcall
615 testBits(SimdDInt32 a)
618 vtst_s32( a.simdInternal_, a.simdInternal_)
// Lane-wise signed less-than.
622 static inline SimdDIBool gmx_simdcall
623 operator<(SimdDInt32 a, SimdDInt32 b)
626 vclt_s32(a.simdInternal_, b.simdInternal_)
// Logical AND of two integer masks.
630 static inline SimdDIBool gmx_simdcall
631 operator&&(SimdDIBool a, SimdDIBool b)
634 vand_u32(a.simdInternal_, b.simdInternal_)
// Logical OR of two integer masks.
638 static inline SimdDIBool gmx_simdcall
639 operator||(SimdDIBool a, SimdDIBool b)
642 vorr_u32(a.simdInternal_, b.simdInternal_)
// True if any lane of the integer mask is set (horizontal max != 0).
646 static inline bool gmx_simdcall
647 anyTrue(SimdDIBool a)
649 return (vmaxv_u32(a.simdInternal_) != 0);
// m ? a : 0 -- AND with the mask (reinterpreted to signed lanes).
652 static inline SimdDInt32 gmx_simdcall
653 selectByMask(SimdDInt32 a, SimdDIBool m)
656 vand_s32(a.simdInternal_, vreinterpret_s32_u32(m.simdInternal_))
// m ? 0 : a -- bit-clear with the mask.
660 static inline SimdDInt32 gmx_simdcall
661 selectByNotMask(SimdDInt32 a, SimdDIBool m)
664 vbic_s32(a.simdInternal_, vreinterpret_s32_u32(m.simdInternal_))
// sel ? b : a, per lane.
668 static inline SimdDInt32 gmx_simdcall
669 blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
672 vbsl_s32(sel.simdInternal_, b.simdInternal_, a.simdInternal_)
// double -> int32, rounding to nearest (vcvtnq), then narrowing 64->32 bits.
// Name line missing from this chunk (numbering jump 676->680) -- presumably
// cvtR2I(SimdDouble a).
676 static inline SimdDInt32 gmx_simdcall
680 vmovn_s64(vcvtnq_s64_f64(a.simdInternal_))
// double -> int32, truncating toward zero (vcvtq).
684 static inline SimdDInt32 gmx_simdcall
685 cvttR2I(SimdDouble a)
688 vmovn_s64(vcvtq_s64_f64(a.simdInternal_))
// int32 -> double: widen 32->64 bits, then convert. Name line missing
// (numbering jump 692->696) -- presumably cvtI2R(SimdDInt32 a).
692 static inline SimdDouble gmx_simdcall
696 vcvtq_f64_s64(vmovl_s32(a.simdInternal_))
// 64-bit double mask -> 32-bit integer mask: saturating narrow keeps the
// all-ones/all-zero pattern. Name line missing (numbering jump 700->704) --
// presumably cvtB2IB(SimdDBool a).
700 static inline SimdDIBool gmx_simdcall
704 vqmovn_u64(a.simdInternal_)
// 32-bit integer mask -> 64-bit double mask: widen the mask and OR with a
// copy shifted into the upper half, filling each 64-bit lane completely.
708 static inline SimdDBool gmx_simdcall
709 cvtIB2B(SimdDIBool a)
712 vorrq_u64(vmovl_u32(a.simdInternal_), vshlq_n_u64(vmovl_u32(a.simdInternal_), 32))
// Convert one float4 into two double2 outputs: low half of f into d0, high
// half into d1.
716 static inline void gmx_simdcall
717 cvtF2DD(SimdFloat f, SimdDouble *d0, SimdDouble *d1)
719 d0->simdInternal_ = vcvt_f64_f32(vget_low_f32(f.simdInternal_));
720 d1->simdInternal_ = vcvt_high_f64_f32(f.simdInternal_);
// Convert two double2 inputs back into one float4: d0 becomes the low half,
// d1 the high half.
723 static inline SimdFloat gmx_simdcall
724 cvtDD2F(SimdDouble d0, SimdDouble d1)
727 vcvt_high_f32_f64(vcvt_f32_f64(d0.simdInternal_), d1.simdInternal_)
733 #endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H