2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H
36 #define GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H
53 Simd4Float(float f) : simdInternal_(vdupq_n_f32(f)) {}
55 // Internal utility constructor to simplify return statements
56 Simd4Float(float32x4_t simd) : simdInternal_(simd) {}
58 float32x4_t simdInternal_;
66 //! \brief Construct from scalar bool
67 Simd4FBool(bool b) : simdInternal_(vdupq_n_u32( b ? 0xFFFFFFFF : 0)) {}
69 // Internal utility constructor to simplify return statements
70 Simd4FBool(uint32x4_t simd) : simdInternal_(simd) {}
72 uint32x4_t simdInternal_;
75 static inline Simd4Float gmx_simdcall
78 assert(size_t(m) % 16 == 0);
84 static inline void gmx_simdcall
85 store4(float *m, Simd4Float a)
87 assert(size_t(m) % 16 == 0);
88 vst1q_f32(m, a.simdInternal_);
91 static inline Simd4Float gmx_simdcall
92 load4U(const float *m)
99 static inline void gmx_simdcall
100 store4U(float *m, Simd4Float a)
102 vst1q_f32(m, a.simdInternal_);
105 static inline Simd4Float gmx_simdcall
113 static inline Simd4Float gmx_simdcall
114 operator&(Simd4Float a, Simd4Float b)
117 vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a.simdInternal_),
118 vreinterpretq_s32_f32(b.simdInternal_)))
122 static inline Simd4Float gmx_simdcall
123 andNot(Simd4Float a, Simd4Float b)
126 vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b.simdInternal_),
127 vreinterpretq_s32_f32(a.simdInternal_)))
131 static inline Simd4Float gmx_simdcall
132 operator|(Simd4Float a, Simd4Float b)
135 vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a.simdInternal_),
136 vreinterpretq_s32_f32(b.simdInternal_)))
140 static inline Simd4Float gmx_simdcall
141 operator^(Simd4Float a, Simd4Float b)
144 vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a.simdInternal_),
145 vreinterpretq_s32_f32(b.simdInternal_)))
149 static inline Simd4Float gmx_simdcall
150 operator+(Simd4Float a, Simd4Float b)
153 vaddq_f32(a.simdInternal_, b.simdInternal_)
157 static inline Simd4Float gmx_simdcall
158 operator-(Simd4Float a, Simd4Float b)
161 vsubq_f32(a.simdInternal_, b.simdInternal_)
165 static inline Simd4Float gmx_simdcall
166 operator-(Simd4Float x)
169 vnegq_f32(x.simdInternal_)
173 static inline Simd4Float gmx_simdcall
174 operator*(Simd4Float a, Simd4Float b)
177 vmulq_f32(a.simdInternal_, b.simdInternal_)
181 // Override for Neon-Asimd
182 #if GMX_SIMD_ARM_NEON
183 static inline Simd4Float gmx_simdcall
184 fma(Simd4Float a, Simd4Float b, Simd4Float c)
187 #ifdef __ARM_FEATURE_FMA
188 vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
190 vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
195 static inline Simd4Float gmx_simdcall
196 fms(Simd4Float a, Simd4Float b, Simd4Float c)
199 #ifdef __ARM_FEATURE_FMA
200 vnegq_f32(vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
202 vnegq_f32(vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
207 static inline Simd4Float gmx_simdcall
208 fnma(Simd4Float a, Simd4Float b, Simd4Float c)
211 #ifdef __ARM_FEATURE_FMA
212 vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
214 vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
219 static inline Simd4Float gmx_simdcall
220 fnms(Simd4Float a, Simd4Float b, Simd4Float c)
223 #ifdef __ARM_FEATURE_FMA
224 vnegq_f32(vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
226 vnegq_f32(vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
232 static inline Simd4Float gmx_simdcall
236 vrsqrteq_f32(x.simdInternal_)
240 static inline Simd4Float gmx_simdcall
244 vabsq_f32( x.simdInternal_ )
248 static inline Simd4Float gmx_simdcall
249 max(Simd4Float a, Simd4Float b)
252 vmaxq_f32(a.simdInternal_, b.simdInternal_)
256 static inline Simd4Float gmx_simdcall
257 min(Simd4Float a, Simd4Float b)
260 vminq_f32(a.simdInternal_, b.simdInternal_)
264 // Override for Neon-Asimd
265 #if GMX_SIMD_ARM_NEON
266 static inline Simd4Float gmx_simdcall
269 // Convert x to nearest integer
270 float32x4_t signBitOfX = vreinterpretq_f32_u32(vandq_u32(vdupq_n_u32(0x80000000), vreinterpretq_u32_f32(x.simdInternal_)));
271 float32x4_t half = vdupq_n_f32(0.5F);
272 float32x4_t corr = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(half), vreinterpretq_u32_f32(signBitOfX)));
274 int32x4_t integerX = vcvtq_s32_f32(vaddq_f32(x.simdInternal_, corr));
276 // Convert back to float
279 vcvtq_f32_s32(integerX)
283 static inline Simd4Float gmx_simdcall
287 vcvtq_f32_s32( vcvtq_s32_f32(x.simdInternal_) )
292 static inline void gmx_simdcall
293 transpose(Simd4Float * v0, Simd4Float * v1,
294 Simd4Float * v2, Simd4Float * v3)
296 float32x4x2_t t0 = vuzpq_f32(v0->simdInternal_, v2->simdInternal_);
297 float32x4x2_t t1 = vuzpq_f32(v1->simdInternal_, v3->simdInternal_);
298 float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
299 float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
300 v0->simdInternal_ = t2.val[0];
301 v1->simdInternal_ = t3.val[0];
302 v2->simdInternal_ = t2.val[1];
303 v3->simdInternal_ = t3.val[1];
306 static inline Simd4FBool gmx_simdcall
307 operator==(Simd4Float a, Simd4Float b)
310 vceqq_f32(a.simdInternal_, b.simdInternal_)
314 static inline Simd4FBool gmx_simdcall
315 operator!=(Simd4Float a, Simd4Float b)
318 vmvnq_u32(vceqq_f32(a.simdInternal_, b.simdInternal_))
322 static inline Simd4FBool gmx_simdcall
323 operator<(Simd4Float a, Simd4Float b)
326 vcltq_f32(a.simdInternal_, b.simdInternal_)
330 static inline Simd4FBool gmx_simdcall
331 operator<=(Simd4Float a, Simd4Float b)
334 vcleq_f32(a.simdInternal_, b.simdInternal_)
338 static inline Simd4FBool gmx_simdcall
339 operator&&(Simd4FBool a, Simd4FBool b)
342 vandq_u32(a.simdInternal_, b.simdInternal_)
346 static inline Simd4FBool gmx_simdcall
347 operator||(Simd4FBool a, Simd4FBool b)
350 vorrq_u32(a.simdInternal_, b.simdInternal_)
354 // Override for Neon-Asimd
355 #if GMX_SIMD_ARM_NEON
356 static inline bool gmx_simdcall
357 anyTrue(Simd4FBool a)
359 uint32x4_t x = a.simdInternal_;
360 uint32x4_t y = vextq_u32(x, x, 2);
363 y = vextq_u32(x, x, 1);
365 return (vgetq_lane_u32(x, 0) != 0);
369 static inline Simd4Float gmx_simdcall
370 selectByMask(Simd4Float a, Simd4FBool m)
373 vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.simdInternal_),
378 static inline Simd4Float gmx_simdcall
379 selectByNotMask(Simd4Float a, Simd4FBool m)
382 vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.simdInternal_),
387 static inline Simd4Float gmx_simdcall
388 blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
391 vbslq_f32(sel.simdInternal_, b.simdInternal_, a.simdInternal_)
395 // Override for Neon-Asimd
396 #if GMX_SIMD_ARM_NEON
397 static inline float gmx_simdcall
400 float32x4_t x = a.simdInternal_;
401 float32x4_t y = vextq_f32(x, x, 2);
404 y = vextq_f32(x, x, 1);
406 return vgetq_lane_f32(x, 0);
409 static inline float gmx_simdcall
410 dotProduct(Simd4Float a, Simd4Float b)
415 /* set 4th element to 0, then add all of them */
416 c.simdInternal_ = vsetq_lane_f32(0.0F, c.simdInternal_, 3);
423 #endif // GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H