Apply clang-format to source tree
[alexxy/gromacs.git] / src / gromacs / simd / impl_arm_neon / impl_arm_neon_util_float.h
index 81c3ff0ea2a668ac0aff474ea51d145a50c7b71a..8918e8fe9542ca3957f11113626ad8fa3f48b2d8 100644 (file)
 namespace gmx
 {
 
-template <int align>
-static inline void gmx_simdcall
-gatherLoadTranspose(const float *        base,
-                    const std::int32_t   offset[],
-                    SimdFloat *          v0,
-                    SimdFloat *          v1,
-                    SimdFloat *          v2,
-                    SimdFloat *          v3)
+template<int align>
+static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
+                                                    const std::int32_t offset[],
+                                                    SimdFloat*         v0,
+                                                    SimdFloat*         v1,
+                                                    SimdFloat*         v2,
+                                                    SimdFloat*         v3)
 {
     assert(std::size_t(offset) % 16 == 0);
     assert(std::size_t(base) % 16 == 0);
@@ -66,91 +65,83 @@ gatherLoadTranspose(const float *        base,
 
     // Unfortunately we cannot use the beautiful Neon structured load
     // instructions since the data comes from four different memory locations.
-    float32x4x2_t  t0 = vuzpq_f32(vld1q_f32( base + align * offset[0] ), vld1q_f32( base + align * offset[2] ));
-    float32x4x2_t  t1 = vuzpq_f32(vld1q_f32( base + align * offset[1] ), vld1q_f32( base + align * offset[3] ));
-    float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
-    float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
+    float32x4x2_t t0 =
+            vuzpq_f32(vld1q_f32(base + align * offset[0]), vld1q_f32(base + align * offset[2]));
+    float32x4x2_t t1 =
+            vuzpq_f32(vld1q_f32(base + align * offset[1]), vld1q_f32(base + align * offset[3]));
+    float32x4x2_t t2  = vtrnq_f32(t0.val[0], t1.val[0]);
+    float32x4x2_t t3  = vtrnq_f32(t0.val[1], t1.val[1]);
     v0->simdInternal_ = t2.val[0];
     v1->simdInternal_ = t3.val[0];
     v2->simdInternal_ = t2.val[1];
     v3->simdInternal_ = t3.val[1];
 }
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-gatherLoadTranspose(const float *        base,
-                    const std::int32_t   offset[],
-                    SimdFloat *          v0,
-                    SimdFloat *          v1)
+                   gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
 {
     assert(std::size_t(offset) % 16 == 0);
     assert(std::size_t(base) % 8 == 0);
     assert(align % 2 == 0);
 
-    v0->simdInternal_  = vcombine_f32(vld1_f32( base + align * offset[0] ),
-                                      vld1_f32( base + align * offset[2] ));
-    v1->simdInternal_  = vcombine_f32(vld1_f32( base + align * offset[1] ),
-                                      vld1_f32( base + align * offset[3] ));
+    v0->simdInternal_ =
+            vcombine_f32(vld1_f32(base + align * offset[0]), vld1_f32(base + align * offset[2]));
+    v1->simdInternal_ =
+            vcombine_f32(vld1_f32(base + align * offset[1]), vld1_f32(base + align * offset[3]));
 
-    float32x4x2_t tmp  = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);
+    float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);
 
-    v0->simdInternal_  = tmp.val[0];
-    v1->simdInternal_  = tmp.val[1];
+    v0->simdInternal_ = tmp.val[0];
+    v1->simdInternal_ = tmp.val[1];
 }
 
 static const int c_simdBestPairAlignmentFloat = 2;
 
-template <int align>
-static inline void gmx_simdcall
-gatherLoadUTranspose(const float *        base,
-                     const std::int32_t   offset[],
-                     SimdFloat *          v0,
-                     SimdFloat *          v1,
-                     SimdFloat *          v2)
+template<int align>
+static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
+                                                     const std::int32_t offset[],
+                                                     SimdFloat*         v0,
+                                                     SimdFloat*         v1,
+                                                     SimdFloat*         v2)
 {
     assert(std::size_t(offset) % 16 == 0);
 
-    float32x4x2_t  t0 = vuzpq_f32(vld1q_f32( base + align * offset[0] ), vld1q_f32( base + align * offset[2] ));
-    float32x4x2_t  t1 = vuzpq_f32(vld1q_f32( base + align * offset[1] ), vld1q_f32( base + align * offset[3] ));
-    float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
-    float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
+    float32x4x2_t t0 =
+            vuzpq_f32(vld1q_f32(base + align * offset[0]), vld1q_f32(base + align * offset[2]));
+    float32x4x2_t t1 =
+            vuzpq_f32(vld1q_f32(base + align * offset[1]), vld1q_f32(base + align * offset[3]));
+    float32x4x2_t t2  = vtrnq_f32(t0.val[0], t1.val[0]);
+    float32x4x2_t t3  = vtrnq_f32(t0.val[1], t1.val[1]);
     v0->simdInternal_ = t2.val[0];
     v1->simdInternal_ = t3.val[0];
     v2->simdInternal_ = t2.val[1];
 }
 
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-transposeScatterStoreU(float *              base,
-                       const std::int32_t   offset[],
-                       SimdFloat            v0,
-                       SimdFloat            v1,
-                       SimdFloat            v2)
+                   transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 {
     assert(std::size_t(offset) % 16 == 0);
 
     float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);
 
-    vst1_f32( base + align * offset[0], vget_low_f32(tmp.val[0]) );
-    vst1_f32( base + align * offset[1], vget_low_f32(tmp.val[1]) );
-    vst1_f32( base + align * offset[2], vget_high_f32(tmp.val[0]) );
-    vst1_f32( base + align * offset[3], vget_high_f32(tmp.val[1]) );
+    vst1_f32(base + align * offset[0], vget_low_f32(tmp.val[0]));
+    vst1_f32(base + align * offset[1], vget_low_f32(tmp.val[1]));
+    vst1_f32(base + align * offset[2], vget_high_f32(tmp.val[0]));
+    vst1_f32(base + align * offset[3], vget_high_f32(tmp.val[1]));
 
-    vst1q_lane_f32( base + align * offset[0] + 2, v2.simdInternal_, 0);
-    vst1q_lane_f32( base + align * offset[1] + 2, v2.simdInternal_, 1);
-    vst1q_lane_f32( base + align * offset[2] + 2, v2.simdInternal_, 2);
-    vst1q_lane_f32( base + align * offset[3] + 2, v2.simdInternal_, 3);
+    vst1q_lane_f32(base + align * offset[0] + 2, v2.simdInternal_, 0);
+    vst1q_lane_f32(base + align * offset[1] + 2, v2.simdInternal_, 1);
+    vst1q_lane_f32(base + align * offset[2] + 2, v2.simdInternal_, 2);
+    vst1q_lane_f32(base + align * offset[3] + 2, v2.simdInternal_, 3);
 }
 
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-transposeScatterIncrU(float *              base,
-                      const std::int32_t   offset[],
-                      SimdFloat            v0,
-                      SimdFloat            v1,
-                      SimdFloat            v2)
+                   transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 {
     assert(std::size_t(offset) % 16 == 0);
 
@@ -166,31 +157,31 @@ transposeScatterIncrU(float *              base,
 
         t0 = vadd_f32(t0, vld1_f32(base + align * offset[0]));
         vst1_f32(base + align * offset[0], t0);
-        base[ align * offset[0] + 2] += vgetq_lane_f32(v2.simdInternal_, 0);
+        base[align * offset[0] + 2] += vgetq_lane_f32(v2.simdInternal_, 0);
 
         t1 = vadd_f32(t1, vld1_f32(base + align * offset[1]));
         vst1_f32(base + align * offset[1], t1);
-        base[ align * offset[1] + 2] += vgetq_lane_f32(v2.simdInternal_, 1);
+        base[align * offset[1] + 2] += vgetq_lane_f32(v2.simdInternal_, 1);
 
         t2 = vadd_f32(t2, vld1_f32(base + align * offset[2]));
         vst1_f32(base + align * offset[2], t2);
-        base[ align * offset[2] + 2] += vgetq_lane_f32(v2.simdInternal_, 2);
+        base[align * offset[2] + 2] += vgetq_lane_f32(v2.simdInternal_, 2);
 
         t3 = vadd_f32(t3, vld1_f32(base + align * offset[3]));
         vst1_f32(base + align * offset[3], t3);
-        base[ align * offset[3] + 2] += vgetq_lane_f32(v2.simdInternal_, 3);
+        base[align * offset[3] + 2] += vgetq_lane_f32(v2.simdInternal_, 3);
     }
     else
     {
         // Extra elements means we can use full width-4 load/store operations
-        float32x4x2_t  t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
-        float32x4x2_t  t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
-        float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
-        float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
-        float32x4_t    t4 = t2.val[0];
-        float32x4_t    t5 = t3.val[0];
-        float32x4_t    t6 = t2.val[1];
-        float32x4_t    t7 = t3.val[1];
+        float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
+        float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
+        float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
+        float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
+        float32x4_t   t4 = t2.val[0];
+        float32x4_t   t5 = t3.val[0];
+        float32x4_t   t6 = t2.val[1];
+        float32x4_t   t7 = t3.val[1];
 
         vst1q_f32(base + align * offset[0], vaddq_f32(t4, vld1q_f32(base + align * offset[0])));
         vst1q_f32(base + align * offset[1], vaddq_f32(t5, vld1q_f32(base + align * offset[1])));
@@ -199,13 +190,9 @@ transposeScatterIncrU(float *              base,
     }
 }
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-transposeScatterDecrU(float *              base,
-                      const std::int32_t   offset[],
-                      SimdFloat            v0,
-                      SimdFloat            v1,
-                      SimdFloat            v2)
+                   transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 {
     assert(std::size_t(offset) % 16 == 0);
 
@@ -221,31 +208,31 @@ transposeScatterDecrU(float *              base,
 
         t0 = vsub_f32(vld1_f32(base + align * offset[0]), t0);
         vst1_f32(base + align * offset[0], t0);
-        base[ align * offset[0] + 2] -= vgetq_lane_f32(v2.simdInternal_, 0);
+        base[align * offset[0] + 2] -= vgetq_lane_f32(v2.simdInternal_, 0);
 
         t1 = vsub_f32(vld1_f32(base + align * offset[1]), t1);
         vst1_f32(base + align * offset[1], t1);
-        base[ align * offset[1] + 2] -= vgetq_lane_f32(v2.simdInternal_, 1);
+        base[align * offset[1] + 2] -= vgetq_lane_f32(v2.simdInternal_, 1);
 
         t2 = vsub_f32(vld1_f32(base + align * offset[2]), t2);
         vst1_f32(base + align * offset[2], t2);
-        base[ align * offset[2] + 2] -= vgetq_lane_f32(v2.simdInternal_, 2);
+        base[align * offset[2] + 2] -= vgetq_lane_f32(v2.simdInternal_, 2);
 
         t3 = vsub_f32(vld1_f32(base + align * offset[3]), t3);
         vst1_f32(base + align * offset[3], t3);
-        base[ align * offset[3] + 2] -= vgetq_lane_f32(v2.simdInternal_, 3);
+        base[align * offset[3] + 2] -= vgetq_lane_f32(v2.simdInternal_, 3);
     }
     else
     {
         // Extra elements means we can use full width-4 load/store operations
-        float32x4x2_t  t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
-        float32x4x2_t  t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
-        float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
-        float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
-        float32x4_t    t4 = t2.val[0];
-        float32x4_t    t5 = t3.val[0];
-        float32x4_t    t6 = t2.val[1];
-        float32x4_t    t7 = t3.val[1];
+        float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
+        float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
+        float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
+        float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
+        float32x4_t   t4 = t2.val[0];
+        float32x4_t   t5 = t3.val[0];
+        float32x4_t   t6 = t2.val[1];
+        float32x4_t   t7 = t3.val[1];
 
         vst1q_f32(base + align * offset[0], vsubq_f32(vld1q_f32(base + align * offset[0]), t4));
         vst1q_f32(base + align * offset[1], vsubq_f32(vld1q_f32(base + align * offset[1]), t5));
@@ -254,11 +241,10 @@ transposeScatterDecrU(float *              base,
     }
 }
 
-static inline void gmx_simdcall
-expandScalarsToTriplets(SimdFloat    scalar,
-                        SimdFloat *  triplets0,
-                        SimdFloat *  triplets1,
-                        SimdFloat *  triplets2)
+static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
+                                                        SimdFloat* triplets0,
+                                                        SimdFloat* triplets1,
+                                                        SimdFloat* triplets2)
 {
     float32x2_t lo, hi;
     float32x4_t t0, t1, t2, t3;
@@ -277,16 +263,15 @@ expandScalarsToTriplets(SimdFloat    scalar,
 }
 
 
-template <int align>
-static inline void gmx_simdcall
-gatherLoadBySimdIntTranspose(const float *  base,
-                             SimdFInt32     offset,
-                             SimdFloat *    v0,
-                             SimdFloat *    v1,
-                             SimdFloat *    v2,
-                             SimdFloat *    v3)
+template<int align>
+static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
+                                                             SimdFInt32   offset,
+                                                             SimdFloat*   v0,
+                                                             SimdFloat*   v1,
+                                                             SimdFloat*   v2,
+                                                             SimdFloat*   v3)
 {
-    alignas(GMX_SIMD_ALIGNMENT) std::int32_t  ioffset[GMX_SIMD_FINT32_WIDTH];
+    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 
     assert(std::size_t(base) % 16 == 0);
     assert(align % 4 == 0);
@@ -295,53 +280,41 @@ gatherLoadBySimdIntTranspose(const float *  base,
     gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
 }
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-gatherLoadBySimdIntTranspose(const float *   base,
-                             SimdFInt32      offset,
-                             SimdFloat *     v0,
-                             SimdFloat *     v1)
+                   gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 {
-    alignas(GMX_SIMD_ALIGNMENT) std::int32_t  ioffset[GMX_SIMD_FINT32_WIDTH];
+    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 
     store(ioffset, offset);
     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 }
 
 
-
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-gatherLoadUBySimdIntTranspose(const float *  base,
-                              SimdFInt32     offset,
-                              SimdFloat *    v0,
-                              SimdFloat *    v1)
+                   gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 {
-    alignas(GMX_SIMD_ALIGNMENT) std::int32_t  ioffset[GMX_SIMD_FINT32_WIDTH];
+    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 
     store(ioffset, offset);
-    v0->simdInternal_ = vcombine_f32(vld1_f32( base + align * ioffset[0] ),
-                                     vld1_f32( base + align * ioffset[2] ));
-    v1->simdInternal_ = vcombine_f32(vld1_f32( base + align * ioffset[1] ),
-                                     vld1_f32( base + align * ioffset[3] ));
-    float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_ );
+    v0->simdInternal_ =
+            vcombine_f32(vld1_f32(base + align * ioffset[0]), vld1_f32(base + align * ioffset[2]));
+    v1->simdInternal_ =
+            vcombine_f32(vld1_f32(base + align * ioffset[1]), vld1_f32(base + align * ioffset[3]));
+    float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);
     v0->simdInternal_ = tmp.val[0];
     v1->simdInternal_ = tmp.val[1];
 }
 
-static inline float gmx_simdcall
-reduceIncr4ReturnSum(float *    m,
-                     SimdFloat  v0,
-                     SimdFloat  v1,
-                     SimdFloat  v2,
-                     SimdFloat  v3)
+static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
 {
     assert(std::size_t(m) % 16 == 0);
 
-    float32x4x2_t  t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
-    float32x4x2_t  t1 = vuzpq_f32(v1.simdInternal_, v3.simdInternal_);
-    float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
-    float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
+    float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
+    float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, v3.simdInternal_);
+    float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
+    float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
     v0.simdInternal_ = t2.val[0];
     v1.simdInternal_ = t3.val[0];
     v2.simdInternal_ = t2.val[1];
@@ -356,6 +329,6 @@ reduceIncr4ReturnSum(float *    m,
     return reduce(v0);
 }
 
-}      // namespace gmx
+} // namespace gmx
 
 #endif // GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H