Apply clang-format to source tree
[alexxy/gromacs.git] / src / gromacs / simd / impl_x86_mic / impl_x86_mic_util_float.h
index b766e1b444b5eee2936b93f7542fb3daf4ed4095..4911d74c5b3f346d631c656e3a86fca7348314f3 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -53,14 +53,13 @@ namespace gmx
 // On MIC it is better to use scatter operations, so we define the load routines
 // that use a SIMD offset variable first.
 
-template <int align>
-static inline void gmx_simdcall
-gatherLoadBySimdIntTranspose(const float *  base,
-                             SimdFInt32     simdoffset,
-                             SimdFloat *    v0,
-                             SimdFloat *    v1,
-                             SimdFloat *    v2,
-                             SimdFloat *    v3)
+template<int align>
+static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
+                                                             SimdFInt32   simdoffset,
+                                                             SimdFloat*   v0,
+                                                             SimdFloat*   v1,
+                                                             SimdFloat*   v2,
+                                                             SimdFloat*   v3)
 {
     assert(std::size_t(base) % 16 == 0);
     assert(align % 4 == 0);
@@ -81,18 +80,15 @@ gatherLoadBySimdIntTranspose(const float *  base,
         simdoffset = simdoffset * SimdFInt32(align);
     }
 
-    v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
-    v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
-    v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+2, sizeof(float));
-    v3->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+3, sizeof(float));
+    v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
+    v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
+    v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 2, sizeof(float));
+    v3->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 3, sizeof(float));
 }
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-gatherLoadUBySimdIntTranspose(const float *  base,
-                              SimdFInt32     simdoffset,
-                              SimdFloat *    v0,
-                              SimdFloat *    v1)
+                   gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 simdoffset, SimdFloat* v0, SimdFloat* v1)
 {
     // All instructions might be latency ~4 on MIC, so we use shifts where we
     // only need a single instruction (since the shift parameter is an immediate),
@@ -101,8 +97,9 @@ gatherLoadUBySimdIntTranspose(const float *  base,
     // which can take constants up to 8 in total.
     if (align == 2)
     {
-        v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   align * sizeof(float));
-        v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, align * sizeof(float));
+        v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, align * sizeof(float));
+        v1->simdInternal_ =
+                _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, align * sizeof(float));
     }
     else
     {
@@ -118,54 +115,46 @@ gatherLoadUBySimdIntTranspose(const float *  base,
         {
             simdoffset = simdoffset * SimdFInt32(align);
         }
-        v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
-        v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
+        v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
+        v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
     }
 }
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-gatherLoadBySimdIntTranspose(const float *   base,
-                             SimdFInt32      simdoffset,
-                             SimdFloat *     v0,
-                             SimdFloat *     v1)
+                   gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 simdoffset, SimdFloat* v0, SimdFloat* v1)
 {
     assert(std::size_t(base) % 8 == 0);
     assert(align % 2 == 0);
     gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
 }
 
-template <int align>
-static inline void gmx_simdcall
-gatherLoadTranspose(const float *        base,
-                    const std::int32_t   offset[],
-                    SimdFloat *          v0,
-                    SimdFloat *          v1,
-                    SimdFloat *          v2,
-                    SimdFloat *          v3)
+template<int align>
+static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
+                                                    const std::int32_t offset[],
+                                                    SimdFloat*         v0,
+                                                    SimdFloat*         v1,
+                                                    SimdFloat*         v2,
+                                                    SimdFloat*         v3)
 {
     gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1, v2, v3);
 }
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-gatherLoadTranspose(const float *        base,
-                    const std::int32_t   offset[],
-                    SimdFloat *          v0,
-                    SimdFloat *          v1)
+                   gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
 {
     gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1);
 }
 
 static const int c_simdBestPairAlignmentFloat = 2;
 
-template <int align>
-static inline void gmx_simdcall
-gatherLoadUTranspose(const float *        base,
-                     const std::int32_t   offset[],
-                     SimdFloat *          v0,
-                     SimdFloat *          v1,
-                     SimdFloat *          v2)
+template<int align>
+static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
+                                                     const std::int32_t offset[],
+                                                     SimdFloat*         v0,
+                                                     SimdFloat*         v1,
+                                                     SimdFloat*         v2)
 {
     SimdFInt32 simdoffset;
 
@@ -189,19 +178,15 @@ gatherLoadUTranspose(const float *        base,
         simdoffset = simdoffset * SimdFInt32(align);
     }
 
-    v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
-    v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
-    v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+2, sizeof(float));
+    v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
+    v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
+    v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 2, sizeof(float));
 }
 
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-transposeScatterStoreU(float *              base,
-                       const std::int32_t   offset[],
-                       SimdFloat            v0,
-                       SimdFloat            v1,
-                       SimdFloat            v2)
+                   transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 {
     SimdFInt32 simdoffset;
 
@@ -225,23 +210,19 @@ transposeScatterStoreU(float *              base,
         simdoffset = simdoffset * SimdFInt32(align);
     }
 
-    _mm512_i32scatter_ps(base,   simdoffset.simdInternal_, v0.simdInternal_, sizeof(float));
-    _mm512_i32scatter_ps(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(float));
-    _mm512_i32scatter_ps(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(float));
+    _mm512_i32scatter_ps(base, simdoffset.simdInternal_, v0.simdInternal_, sizeof(float));
+    _mm512_i32scatter_ps(base + 1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(float));
+    _mm512_i32scatter_ps(base + 2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(float));
 }
 
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-transposeScatterIncrU(float *              base,
-                      const std::int32_t   offset[],
-                      SimdFloat            v0,
-                      SimdFloat            v1,
-                      SimdFloat            v2)
+                   transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 {
-    alignas(GMX_SIMD_ALIGNMENT) float  rdata0[GMX_SIMD_FLOAT_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) float  rdata1[GMX_SIMD_FLOAT_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) float  rdata2[GMX_SIMD_FLOAT_WIDTH];
+    alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
+    alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
+    alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];
 
     store(rdata0, v0);
     store(rdata1, v1);
@@ -249,23 +230,19 @@ transposeScatterIncrU(float *              base,
 
     for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
     {
-        base[ align * offset[i] + 0] += rdata0[i];
-        base[ align * offset[i] + 1] += rdata1[i];
-        base[ align * offset[i] + 2] += rdata2[i];
+        base[align * offset[i] + 0] += rdata0[i];
+        base[align * offset[i] + 1] += rdata1[i];
+        base[align * offset[i] + 2] += rdata2[i];
     }
 }
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-transposeScatterDecrU(float *              base,
-                      const std::int32_t   offset[],
-                      SimdFloat            v0,
-                      SimdFloat            v1,
-                      SimdFloat            v2)
+                   transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 {
-    alignas(GMX_SIMD_ALIGNMENT) float  rdata0[GMX_SIMD_FLOAT_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) float  rdata1[GMX_SIMD_FLOAT_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) float  rdata2[GMX_SIMD_FLOAT_WIDTH];
+    alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
+    alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
+    alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];
 
     store(rdata0, v0);
     store(rdata1, v1);
@@ -273,33 +250,30 @@ transposeScatterDecrU(float *              base,
 
     for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
     {
-        base[ align * offset[i] + 0] -= rdata0[i];
-        base[ align * offset[i] + 1] -= rdata1[i];
-        base[ align * offset[i] + 2] -= rdata2[i];
+        base[align * offset[i] + 0] -= rdata0[i];
+        base[align * offset[i] + 1] -= rdata1[i];
+        base[align * offset[i] + 2] -= rdata2[i];
     }
 }
 
-static inline void gmx_simdcall
-expandScalarsToTriplets(SimdFloat    scalar,
-                        SimdFloat *  triplets0,
-                        SimdFloat *  triplets1,
-                        SimdFloat *  triplets2)
+static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
+                                                        SimdFloat* triplets0,
+                                                        SimdFloat* triplets1,
+                                                        SimdFloat* triplets2)
 {
-    triplets0->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0),
-                                                                           _mm512_castps_si512(scalar.simdInternal_)));
-    triplets1->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5),
-                                                                           _mm512_castps_si512(scalar.simdInternal_)));
-    triplets2->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10),
-                                                                           _mm512_castps_si512(scalar.simdInternal_)));
+    triplets0->simdInternal_ = _mm512_castsi512_ps(
+            _mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0),
+                                    _mm512_castps_si512(scalar.simdInternal_)));
+    triplets1->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(
+            _mm512_set_epi32(10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5),
+            _mm512_castps_si512(scalar.simdInternal_)));
+    triplets2->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(
+            _mm512_set_epi32(15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10),
+            _mm512_castps_si512(scalar.simdInternal_)));
 }
 
 
-static inline float gmx_simdcall
-reduceIncr4ReturnSum(float *    m,
-                     SimdFloat  v0,
-                     SimdFloat  v1,
-                     SimdFloat  v2,
-                     SimdFloat  v3)
+static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
 {
     float  f;
     __m512 t0, t1, t2, t3;
@@ -307,16 +281,19 @@ reduceIncr4ReturnSum(float *    m,
     assert(std::size_t(m) % 16 == 0);
 
     t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
-    t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v2.simdInternal_, _mm512_swizzle_ps(v2.simdInternal_, _MM_SWIZ_REG_BADC));
+    t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v2.simdInternal_,
+                            _mm512_swizzle_ps(v2.simdInternal_, _MM_SWIZ_REG_BADC));
     t1 = _mm512_add_ps(v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
-    t1 = _mm512_mask_add_ps(t1, _mm512_int2mask(0xCCCC), v3.simdInternal_, _mm512_swizzle_ps(v3.simdInternal_, _MM_SWIZ_REG_BADC));
+    t1 = _mm512_mask_add_ps(t1, _mm512_int2mask(0xCCCC), v3.simdInternal_,
+                            _mm512_swizzle_ps(v3.simdInternal_, _MM_SWIZ_REG_BADC));
     t2 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
     t2 = _mm512_mask_add_ps(t2, _mm512_int2mask(0xAAAA), t1, _mm512_swizzle_ps(t1, _MM_SWIZ_REG_CDAB));
 
     t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_BADC));
     t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_CDAB));
 
-    t0 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
+    t0 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE,
+                                _MM_BROADCAST_4X16, _MM_HINT_NONE);
     t0 = _mm512_add_ps(t0, t2);
     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t0);
 
@@ -327,37 +304,35 @@ reduceIncr4ReturnSum(float *    m,
     return f;
 }
 
-static inline SimdFloat gmx_simdcall
-loadDualHsimd(const float * m0,
-              const float * m1)
+static inline SimdFloat gmx_simdcall loadDualHsimd(const float* m0, const float* m1)
 {
     assert(std::size_t(m0) % 32 == 0);
     assert(std::size_t(m1) % 32 == 0);
 
-    return _mm512_castpd_ps(_mm512_mask_extload_pd(_mm512_extload_pd(reinterpret_cast<const double *>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE),
-                                                   _mm512_int2mask(0xF0), reinterpret_cast<const double *>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+    return _mm512_castpd_ps(_mm512_mask_extload_pd(
+            _mm512_extload_pd(reinterpret_cast<const double*>(m0), _MM_UPCONV_PD_NONE,
+                              _MM_BROADCAST_4X8, _MM_HINT_NONE),
+            _mm512_int2mask(0xF0), reinterpret_cast<const double*>(m1), _MM_UPCONV_PD_NONE,
+            _MM_BROADCAST_4X8, _MM_HINT_NONE));
 }
 
-static inline SimdFloat gmx_simdcall
-loadDuplicateHsimd(const float * m)
+static inline SimdFloat gmx_simdcall loadDuplicateHsimd(const float* m)
 {
     assert(std::size_t(m) % 32 == 0);
 
-    return _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+    return _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m),
+                                              _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
 }
 
-static inline SimdFloat gmx_simdcall
-loadU1DualHsimd(const float * m)
+static inline SimdFloat gmx_simdcall loadU1DualHsimd(const float* m)
 {
-    return _mm512_mask_extload_ps(_mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE), _mm512_int2mask(0xFF00),
-                                  m+1, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
+    return _mm512_mask_extload_ps(
+            _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE),
+            _mm512_int2mask(0xFF00), m + 1, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
 }
 
 
-static inline void gmx_simdcall
-storeDualHsimd(float *     m0,
-               float *     m1,
-               SimdFloat   a)
+static inline void gmx_simdcall storeDualHsimd(float* m0, float* m1, SimdFloat a)
 {
     __m512 t0;
 
@@ -368,10 +343,7 @@ storeDualHsimd(float *     m0,
     _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), a.simdInternal_);
 }
 
-static inline void gmx_simdcall
-incrDualHsimd(float *     m0,
-              float *     m1,
-              SimdFloat   a)
+static inline void gmx_simdcall incrDualHsimd(float* m0, float* m1, SimdFloat a)
 {
     assert(std::size_t(m0) % 32 == 0);
     assert(std::size_t(m1) % 32 == 0);
@@ -379,38 +351,38 @@ incrDualHsimd(float *     m0,
     __m512 x;
 
     // Update lower half
-    x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+    x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m0), _MM_UPCONV_PD_NONE,
+                                           _MM_BROADCAST_4X8, _MM_HINT_NONE));
     x = _mm512_add_ps(x, a.simdInternal_);
     _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), x);
 
     // Update upper half
-    x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+    x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m1), _MM_UPCONV_PD_NONE,
+                                           _MM_BROADCAST_4X8, _MM_HINT_NONE));
     x = _mm512_add_ps(x, a.simdInternal_);
     _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), x);
 }
 
-static inline void gmx_simdcall
-decrHsimd(float *    m,
-          SimdFloat  a)
+static inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
 {
     __m512 t;
 
     assert(std::size_t(m) % 32 == 0);
 
-    t = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+    t = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m), _MM_UPCONV_PD_NONE,
+                                           _MM_BROADCAST_4X8, _MM_HINT_NONE));
     a = _mm512_add_ps(a.simdInternal_, _mm512_permute4f128_ps(a.simdInternal_, _MM_PERM_BADC));
     t = _mm512_sub_ps(t, a.simdInternal_);
     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0x00FF), t);
 }
 
 
-template <int align>
-static inline void gmx_simdcall
-gatherLoadTransposeHsimd(const float *        base0,
-                         const float *        base1,
-                         const std::int32_t   offset[],
-                         SimdFloat *          v0,
-                         SimdFloat *          v1)
+template<int align>
+static inline void gmx_simdcall gatherLoadTransposeHsimd(const float*       base0,
+                                                         const float*       base1,
+                                                         const std::int32_t offset[],
+                                                         SimdFloat*         v0,
+                                                         SimdFloat*         v1)
 {
     __m512i idx0, idx1, idx;
     __m512  tmp1, tmp2;
@@ -434,10 +406,7 @@ gatherLoadTransposeHsimd(const float *        base0,
     v1->simdInternal_ = _mm512_mask_permute4f128_ps(tmp2, _mm512_int2mask(0x00FF), tmp1, _MM_PERM_DCDC);
 }
 
-static inline float gmx_simdcall
-reduceIncr4ReturnSumHsimd(float *     m,
-                          SimdFloat   v0,
-                          SimdFloat   v1)
+static inline float gmx_simdcall reduceIncr4ReturnSumHsimd(float* m, SimdFloat v0, SimdFloat v1)
 {
     float  f;
     __m512 t0, t1;
@@ -445,11 +414,13 @@ reduceIncr4ReturnSumHsimd(float *     m,
     assert(std::size_t(m) % 32 == 0);
 
     t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
-    t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
+    t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v1.simdInternal_,
+                            _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
     t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
     t0 = _mm512_add_ps(t0, _mm512_castpd_ps(_mm512_swizzle_pd(_mm512_castps_pd(t0), _MM_SWIZ_REG_BADC)));
     t0 = _mm512_mask_permute4f128_ps(t0, _mm512_int2mask(0xAAAA), t0, _MM_PERM_BADC);
-    t1 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
+    t1 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE,
+                                _MM_BROADCAST_4X16, _MM_HINT_NONE);
     t1 = _mm512_add_ps(t1, t0);
     _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t1);
 
@@ -460,6 +431,6 @@ reduceIncr4ReturnSumHsimd(float *     m,
     return f;
 }
 
-}      // namespace gmx
+} // namespace gmx
 
 #endif // GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H