src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd4_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H
  36 #define GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H
  37
  38 #include "config.h"
  39
  40 #include <cassert>
  41 #include <cstddef>
  42
  43 #include <arm_neon.h>
  44
  45 namespace gmx
  46 {
  47
  48 class Simd4Float
  49 {
  50     public:
  51         Simd4Float() {}
  52
  53         Simd4Float(float f) : simdInternal_(vdupq_n_f32(f)) {}
  54
  55         // Internal utility constructor to simplify return statements
  56         Simd4Float(float32x4_t simd) : simdInternal_(simd) {}
  57
  58         float32x4_t  simdInternal_;
  59 };
  60
  61 class Simd4FBool
  62 {
  63     public:
  64         Simd4FBool() {}
  65
  66         //! \brief Construct from scalar bool
  67         Simd4FBool(bool b) : simdInternal_(vdupq_n_u32( b ? 0xFFFFFFFF : 0)) {}
  68
  69         // Internal utility constructor to simplify return statements
  70         Simd4FBool(uint32x4_t simd) : simdInternal_(simd) {}
  71
  72         uint32x4_t  simdInternal_;
  73 };
  74
  75 static inline Simd4Float gmx_simdcall
  76 load4(const float *m)
  77 {
  78     assert(size_t(m) % 16 == 0);
  79     return {
  80                vld1q_f32(m)
  81     };
  82 }
  83
  84 static inline void gmx_simdcall
  85 store4(float *m, Simd4Float a)
  86 {
  87     assert(size_t(m) % 16 == 0);
  88     vst1q_f32(m, a.simdInternal_);
  89 }
  90
  91 static inline Simd4Float gmx_simdcall
  92 load4U(const float *m)
  93 {
  94     return {
  95                vld1q_f32(m)
  96     };
  97 }
  98
  99 static inline void gmx_simdcall
 100 store4U(float *m, Simd4Float a)
 101 {
 102     vst1q_f32(m, a.simdInternal_);
 103 }
 104
 105 static inline Simd4Float gmx_simdcall
 106 simd4SetZeroF()
 107 {
 108     return {
 109                vdupq_n_f32(0.0F)
 110     };
 111 }
 112
 113 static inline Simd4Float gmx_simdcall
 114 operator&(Simd4Float a, Simd4Float b)
 115 {
 116     return {
 117                vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a.simdInternal_),
 118                                                vreinterpretq_s32_f32(b.simdInternal_)))
 119     };
 120 }
 121
 122 static inline Simd4Float gmx_simdcall
 123 andNot(Simd4Float a, Simd4Float b)
 124 {
 125     return {
 126                vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b.simdInternal_),
 127                                                vreinterpretq_s32_f32(a.simdInternal_)))
 128     };
 129 }
 130
 131 static inline Simd4Float gmx_simdcall
 132 operator|(Simd4Float a, Simd4Float b)
 133 {
 134     return {
 135                vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a.simdInternal_),
 136                                                vreinterpretq_s32_f32(b.simdInternal_)))
 137     };
 138 }
 139
 140 static inline Simd4Float gmx_simdcall
 141 operator^(Simd4Float a, Simd4Float b)
 142 {
 143     return {
 144                vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a.simdInternal_),
 145                                                vreinterpretq_s32_f32(b.simdInternal_)))
 146     };
 147 }
 148
 149 static inline Simd4Float gmx_simdcall
 150 operator+(Simd4Float a, Simd4Float b)
 151 {
 152     return {
 153                vaddq_f32(a.simdInternal_, b.simdInternal_)
 154     };
 155 }
 156
 157 static inline Simd4Float gmx_simdcall
 158 operator-(Simd4Float a, Simd4Float b)
 159 {
 160     return {
 161                vsubq_f32(a.simdInternal_, b.simdInternal_)
 162     };
 163 }
 164
 165 static inline Simd4Float gmx_simdcall
 166 operator-(Simd4Float x)
 167 {
 168     return {
 169                vnegq_f32(x.simdInternal_)
 170     };
 171 }
 172
 173 static inline Simd4Float gmx_simdcall
 174 operator*(Simd4Float a, Simd4Float b)
 175 {
 176     return {
 177                vmulq_f32(a.simdInternal_, b.simdInternal_)
 178     };
 179 }
 180
 181 // Override for Neon-Asimd
 182 #if GMX_SIMD_ARM_NEON
 183 static inline Simd4Float gmx_simdcall
 184 fma(Simd4Float a, Simd4Float b, Simd4Float c)
 185 {
 186     return {
 187 #ifdef __ARM_FEATURE_FMA
 188                vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
 189 #else
 190                vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
 191 #endif
 192     };
 193 }
 194
 195 static inline Simd4Float gmx_simdcall
 196 fms(Simd4Float a, Simd4Float b, Simd4Float c)
 197 {
 198     return {
 199 #ifdef __ARM_FEATURE_FMA
 200                vnegq_f32(vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
 201 #else
 202                vnegq_f32(vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
 203 #endif
 204     };
 205 }
 206
 207 static inline Simd4Float gmx_simdcall
 208 fnma(Simd4Float a, Simd4Float b, Simd4Float c)
 209 {
 210     return {
 211 #ifdef __ARM_FEATURE_FMA
 212                vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
 213 #else
 214                vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
 215 #endif
 216     };
 217 }
 218
 219 static inline Simd4Float gmx_simdcall
 220 fnms(Simd4Float a, Simd4Float b, Simd4Float c)
 221 {
 222     return {
 223 #ifdef __ARM_FEATURE_FMA
 224                vnegq_f32(vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
 225 #else
 226                vnegq_f32(vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
 227 #endif
 228     };
 229 }
 230 #endif
 231
 232 static inline Simd4Float gmx_simdcall
 233 rsqrt(Simd4Float x)
 234 {
 235     return {
 236                vrsqrteq_f32(x.simdInternal_)
 237     };
 238 }
 239
 240 static inline Simd4Float gmx_simdcall
 241 abs(Simd4Float x)
 242 {
 243     return {
 244                vabsq_f32( x.simdInternal_ )
 245     };
 246 }
 247
 248 static inline Simd4Float gmx_simdcall
 249 max(Simd4Float a, Simd4Float b)
 250 {
 251     return {
 252                vmaxq_f32(a.simdInternal_, b.simdInternal_)
 253     };
 254 }
 255
 256 static inline Simd4Float gmx_simdcall
 257 min(Simd4Float a, Simd4Float b)
 258 {
 259     return {
 260                vminq_f32(a.simdInternal_, b.simdInternal_)
 261     };
 262 }
 263
 264 // Override for Neon-Asimd
 265 #if GMX_SIMD_ARM_NEON
 266 static inline Simd4Float gmx_simdcall
 267 round(Simd4Float x)
 268 {
 269     // Convert x to nearest integer
 270     float32x4_t signBitOfX = vreinterpretq_f32_u32(vandq_u32(vdupq_n_u32(0x80000000), vreinterpretq_u32_f32(x.simdInternal_)));
 271     float32x4_t half       = vdupq_n_f32(0.5F);
 272     float32x4_t corr       = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(half), vreinterpretq_u32_f32(signBitOfX)));
 273
 274     int32x4_t   integerX   = vcvtq_s32_f32(vaddq_f32(x.simdInternal_, corr));
 275
 276     // Convert back to float
 277
 278     return {
 279                vcvtq_f32_s32(integerX)
 280     };
 281 }
 282
 283 static inline Simd4Float gmx_simdcall
 284 trunc(Simd4Float x)
 285 {
 286     return {
 287                vcvtq_f32_s32( vcvtq_s32_f32(x.simdInternal_) )
 288     };
 289 }
 290 #endif
 291
 292 static inline void gmx_simdcall
 293 transpose(Simd4Float * v0, Simd4Float * v1,
 294           Simd4Float * v2, Simd4Float * v3)
 295 {
 296     float32x4x2_t  t0 = vuzpq_f32(v0->simdInternal_, v2->simdInternal_);
 297     float32x4x2_t  t1 = vuzpq_f32(v1->simdInternal_, v3->simdInternal_);
 298     float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
 299     float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
 300     v0->simdInternal_ = t2.val[0];
 301     v1->simdInternal_ = t3.val[0];
 302     v2->simdInternal_ = t2.val[1];
 303     v3->simdInternal_ = t3.val[1];
 304 }
 305
 306 static inline Simd4FBool gmx_simdcall
 307 operator==(Simd4Float a, Simd4Float b)
 308 {
 309     return {
 310                vceqq_f32(a.simdInternal_, b.simdInternal_)
 311     };
 312 }
 313
 314 static inline Simd4FBool gmx_simdcall
 315 operator!=(Simd4Float a, Simd4Float b)
 316 {
 317     return {
 318                vmvnq_u32(vceqq_f32(a.simdInternal_, b.simdInternal_))
 319     };
 320 }
 321
 322 static inline Simd4FBool gmx_simdcall
 323 operator<(Simd4Float a, Simd4Float b)
 324 {
 325     return {
 326                vcltq_f32(a.simdInternal_, b.simdInternal_)
 327     };
 328 }
 329
 330 static inline Simd4FBool gmx_simdcall
 331 operator<=(Simd4Float a, Simd4Float b)
 332 {
 333     return {
 334                vcleq_f32(a.simdInternal_, b.simdInternal_)
 335     };
 336 }
 337
 338 static inline Simd4FBool gmx_simdcall
 339 operator&&(Simd4FBool a, Simd4FBool b)
 340 {
 341     return {
 342                vandq_u32(a.simdInternal_, b.simdInternal_)
 343     };
 344 }
 345
 346 static inline Simd4FBool gmx_simdcall
 347 operator||(Simd4FBool a, Simd4FBool b)
 348 {
 349     return {
 350                vorrq_u32(a.simdInternal_, b.simdInternal_)
 351     };
 352 }
 353
 354 // Override for Neon-Asimd
 355 #if GMX_SIMD_ARM_NEON
 356 static inline bool gmx_simdcall
 357 anyTrue(Simd4FBool a)
 358 {
 359     uint32x4_t x = a.simdInternal_;
 360     uint32x4_t y = vextq_u32(x, x, 2);
 361
 362     x = vorrq_u32(x, y);
 363     y = vextq_u32(x, x, 1);
 364     x = vorrq_u32(x, y);
 365     return (vgetq_lane_u32(x, 0) != 0);
 366 }
 367 #endif
 368
 369 static inline Simd4Float gmx_simdcall
 370 selectByMask(Simd4Float a, Simd4FBool m)
 371 {
 372     return {
 373                vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.simdInternal_),
 374                                                m.simdInternal_))
 375     };
 376 }
 377
 378 static inline Simd4Float gmx_simdcall
 379 selectByNotMask(Simd4Float a, Simd4FBool m)
 380 {
 381     return {
 382                vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.simdInternal_),
 383                                                m.simdInternal_))
 384     };
 385 }
 386
 387 static inline Simd4Float gmx_simdcall
 388 blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
 389 {
 390     return {
 391                vbslq_f32(sel.simdInternal_, b.simdInternal_, a.simdInternal_)
 392     };
 393 }
 394
 395 // Override for Neon-Asimd
 396 #if GMX_SIMD_ARM_NEON
 397 static inline float gmx_simdcall
 398 reduce(Simd4Float a)
 399 {
 400     float32x4_t x = a.simdInternal_;
 401     float32x4_t y = vextq_f32(x, x, 2);
 402
 403     x = vaddq_f32(x, y);
 404     y = vextq_f32(x, x, 1);
 405     x = vaddq_f32(x, y);
 406     return vgetq_lane_f32(x, 0);
 407 }
 408
 409 static inline float gmx_simdcall
 410 dotProduct(Simd4Float a, Simd4Float b)
 411 {
 412     Simd4Float c;
 413
 414     c = a * b;
 415     /* set 4th element to 0, then add all of them */
 416     c.simdInternal_ = vsetq_lane_f32(0.0F, c.simdInternal_, 3);
 417     return reduce(c);
 418 }
 419 #endif
 420
 421 }      // namespace gmx
 422
 423 #endif // GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H