Apply clang-format to source tree
[alexxy/gromacs.git] src/gromacs/simd/impl_x86_sse2/impl_x86_sse2_util_float.h
index 53bcaffc8a8762e5716ab4bdec9bc6d89cddd94a..0c4de5265abc8add58096c14a5ad05bc01b6ec1b 100644
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 namespace gmx
 {
 
-template <int align>
-static inline void gmx_simdcall
-gatherLoadTranspose(const float *        base,
-                    const std::int32_t   offset[],
-                    SimdFloat *          v0,
-                    SimdFloat *          v1,
-                    SimdFloat *          v2,
-                    SimdFloat *          v3)
+template<int align>
+static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
+                                                    const std::int32_t offset[],
+                                                    SimdFloat*         v0,
+                                                    SimdFloat*         v1,
+                                                    SimdFloat*         v2,
+                                                    SimdFloat*         v3)
 {
     assert(std::size_t(base + align * offset[0]) % 16 == 0);
     assert(std::size_t(base + align * offset[1]) % 16 == 0);
     assert(std::size_t(base + align * offset[2]) % 16 == 0);
     assert(std::size_t(base + align * offset[3]) % 16 == 0);
 
-    v0->simdInternal_ = _mm_load_ps( base + align * offset[0] );
-    v1->simdInternal_ = _mm_load_ps( base + align * offset[1] );
-    v2->simdInternal_ = _mm_load_ps( base + align * offset[2] );
-    v3->simdInternal_ = _mm_load_ps( base + align * offset[3] );
+    v0->simdInternal_ = _mm_load_ps(base + align * offset[0]);
+    v1->simdInternal_ = _mm_load_ps(base + align * offset[1]);
+    v2->simdInternal_ = _mm_load_ps(base + align * offset[2]);
+    v3->simdInternal_ = _mm_load_ps(base + align * offset[3]);
 
     _MM_TRANSPOSE4_PS(v0->simdInternal_, v1->simdInternal_, v2->simdInternal_, v3->simdInternal_);
 }
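
For orientation while reading the reformatted code, the gather-and-transpose above can be summarized by a scalar sketch (illustrative only, not part of this commit; gatherLoadTransposeRef is a hypothetical name): lane i of output j is the j-th float of the 16-byte-aligned row at base + align * offset[i].

    #include <cstdint>

    template<int align>
    static void gatherLoadTransposeRef(const float* base, const std::int32_t offset[],
                                       float v0[4], float v1[4], float v2[4], float v3[4])
    {
        for (int i = 0; i < 4; i++)
        {
            // Row i starts at base + align * offset[i]; the SSE2 version above
            // asserts that this address is 16-byte aligned.
            const float* p = base + align * offset[i];
            v0[i] = p[0];
            v1[i] = p[1];
            v2[i] = p[2];
            v3[i] = p[3];
        }
    }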
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-gatherLoadTranspose(const float *        base,
-                    const std::int32_t   offset[],
-                    SimdFloat *          v0,
-                    SimdFloat *          v1)
+                   gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
 {
     __m128 t1, t2;
 
-    v0->simdInternal_ = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[0] ) ));
-    v1->simdInternal_ = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[1] ) ));
-    t1                = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[2] ) ));
-    t2                = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[3] ) ));
-    t1                = _mm_unpacklo_ps(v0->simdInternal_, t1);
-    t2                = _mm_unpacklo_ps(v1->simdInternal_, t2);
+    v0->simdInternal_ =
+            _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(base + align * offset[0])));
+    v1->simdInternal_ =
+            _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(base + align * offset[1])));
+    t1 = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(base + align * offset[2])));
+    t2 = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(base + align * offset[3])));
+    t1 = _mm_unpacklo_ps(v0->simdInternal_, t1);
+    t2 = _mm_unpacklo_ps(v1->simdInternal_, t2);
     v0->simdInternal_ = _mm_unpacklo_ps(t1, t2);
     v1->simdInternal_ = _mm_unpackhi_ps(t1, t2);
 }
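
The two-output overload above fetches each (v0, v1) pair with a single 64-bit _mm_load_sd and then deinterleaves the four pairs with unpack operations; with align equal to c_simdBestPairAlignmentFloat (2, defined just below) consecutive offsets address tightly packed pairs. Semantically it reduces to this scalar sketch (illustrative only; the name is hypothetical):

    #include <cstdint>

    template<int align>
    static void gatherLoadTransposePairRef(const float* base, const std::int32_t offset[],
                                           float v0[4], float v1[4])
    {
        for (int i = 0; i < 4; i++)
        {
            // Both floats of the pair come from one 64-bit load in the SSE2 code.
            const float* p = base + align * offset[i];
            v0[i] = p[0];
            v1[i] = p[1];
        }
    }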
 
 static const int c_simdBestPairAlignmentFloat = 2;
 
-template <int align>
-static inline void gmx_simdcall
-gatherLoadUTranspose(const float *        base,
-                     const std::int32_t   offset[],
-                     SimdFloat *          v0,
-                     SimdFloat *          v1,
-                     SimdFloat *          v2)
+template<int align>
+static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
+                                                     const std::int32_t offset[],
+                                                     SimdFloat*         v0,
+                                                     SimdFloat*         v1,
+                                                     SimdFloat*         v2)
 {
     __m128 t1, t2, t3, t4, t5, t6, t7, t8;
 
     if (align % 4 != 0)
     {
         // general case, not aligned to 4-byte boundary
-        t1                = _mm_loadu_ps( base + align * offset[0] );
-        t2                = _mm_loadu_ps( base + align * offset[1] );
-        t3                = _mm_loadu_ps( base + align * offset[2] );
-        t4                = _mm_loadu_ps( base + align * offset[3] );
+        t1 = _mm_loadu_ps(base + align * offset[0]);
+        t2 = _mm_loadu_ps(base + align * offset[1]);
+        t3 = _mm_loadu_ps(base + align * offset[2]);
+        t4 = _mm_loadu_ps(base + align * offset[3]);
     }
     else
     {
         // aligned to 4-byte boundary or more
-        t1                = _mm_load_ps( base + align * offset[0] );
-        t2                = _mm_load_ps( base + align * offset[1] );
-        t3                = _mm_load_ps( base + align * offset[2] );
-        t4                = _mm_load_ps( base + align * offset[3] );
+        t1 = _mm_load_ps(base + align * offset[0]);
+        t2 = _mm_load_ps(base + align * offset[1]);
+        t3 = _mm_load_ps(base + align * offset[2]);
+        t4 = _mm_load_ps(base + align * offset[3]);
     }
-    t5                = _mm_unpacklo_ps(t1, t2);
-    t6                = _mm_unpacklo_ps(t3, t4);
-    t7                = _mm_unpackhi_ps(t1, t2);
-    t8                = _mm_unpackhi_ps(t3, t4);
-    *v0               = _mm_movelh_ps(t5, t6);
-    *v1               = _mm_movehl_ps(t6, t5);
-    *v2               = _mm_movelh_ps(t7, t8);
+    t5  = _mm_unpacklo_ps(t1, t2);
+    t6  = _mm_unpacklo_ps(t3, t4);
+    t7  = _mm_unpackhi_ps(t1, t2);
+    t8  = _mm_unpackhi_ps(t3, t4);
+    *v0 = _mm_movelh_ps(t5, t6);
+    *v1 = _mm_movehl_ps(t6, t5);
+    *v2 = _mm_movelh_ps(t7, t8);
 }
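
gatherLoadUTranspose above gathers x/y/z triplets; the align % 4 branch only selects between aligned and unaligned full-width loads, and the fourth float of each loaded row is discarded by the transpose. A scalar sketch of the result (illustrative only; the name is hypothetical):

    #include <cstdint>

    template<int align>
    static void gatherLoadUTransposeRef(const float* base, const std::int32_t offset[],
                                        float x[4], float y[4], float z[4])
    {
        for (int i = 0; i < 4; i++)
        {
            const float* p = base + align * offset[i];
            x[i] = p[0];
            y[i] = p[1];
            z[i] = p[2]; // the SIMD version also reads p[3] but never uses it
        }
    }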
 
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-transposeScatterStoreU(float *              base,
-                       const std::int32_t   offset[],
-                       SimdFloat            v0,
-                       SimdFloat            v1,
-                       SimdFloat            v2)
+                   transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 {
     __m128 t1, t2;
 
     // general case, not aligned to 4-byte boundary
-    t1   = _mm_unpacklo_ps(v0.simdInternal_, v1.simdInternal_);
-    t2   = _mm_unpackhi_ps(v0.simdInternal_, v1.simdInternal_);
-    _mm_storel_pi( reinterpret_cast< __m64 *>( base + align * offset[0] ), t1);
+    t1 = _mm_unpacklo_ps(v0.simdInternal_, v1.simdInternal_);
+    t2 = _mm_unpackhi_ps(v0.simdInternal_, v1.simdInternal_);
+    _mm_storel_pi(reinterpret_cast<__m64*>(base + align * offset[0]), t1);
     _mm_store_ss(base + align * offset[0] + 2, v2.simdInternal_);
-    _mm_storeh_pi( reinterpret_cast< __m64 *>( base + align * offset[1] ), t1);
-    _mm_store_ss(base + align * offset[1] + 2, _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(1, 1, 1, 1)));
-    _mm_storel_pi( reinterpret_cast< __m64 *>( base + align * offset[2] ), t2);
-    _mm_store_ss(base + align * offset[2] + 2, _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(2, 2, 2, 2)));
-    _mm_storeh_pi( reinterpret_cast< __m64 *>( base + align * offset[3] ), t2);
-    _mm_store_ss(base + align * offset[3] + 2, _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(3, 3, 3, 3)));
+    _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[1]), t1);
+    _mm_store_ss(base + align * offset[1] + 2,
+                 _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(1, 1, 1, 1)));
+    _mm_storel_pi(reinterpret_cast<__m64*>(base + align * offset[2]), t2);
+    _mm_store_ss(base + align * offset[2] + 2,
+                 _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(2, 2, 2, 2)));
+    _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[3]), t2);
+    _mm_store_ss(base + align * offset[3] + 2,
+                 _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(3, 3, 3, 3)));
 }
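
transposeScatterStoreU is the inverse scatter: lane i of (v0, v1, v2) is written to base + align * offset[i]. The storel_pi/storeh_pi calls write the x/y pair as one 64-bit store and store_ss writes z, so exactly three floats are touched per offset. Scalar semantics (illustrative sketch; hypothetical name):

    #include <cstdint>

    template<int align>
    static void transposeScatterStoreURef(float* base, const std::int32_t offset[],
                                          const float x[4], const float y[4], const float z[4])
    {
        for (int i = 0; i < 4; i++)
        {
            float* p = base + align * offset[i];
            p[0] = x[i];
            p[1] = y[i];
            p[2] = z[i]; // no fourth element is written
        }
    }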
 
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-transposeScatterIncrU(float *              base,
-                      const std::int32_t   offset[],
-                      SimdFloat            v0,
-                      SimdFloat            v1,
-                      SimdFloat            v2)
+                   transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 {
     __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
 
     if (align < 4)
     {
-        t5          = _mm_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
-        t6          = _mm_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
-        t7          = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
-        t8          = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
-        t9          = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
-        t10         = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));
-
-        t1          = _mm_load_ss(base + align * offset[0]);
-        t1          = _mm_loadh_pi(t1, reinterpret_cast< __m64 *>(base + align * offset[0] + 1));
-        t1          = _mm_add_ps(t1, t7);
+        t5  = _mm_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
+        t6  = _mm_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
+        t7  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
+        t8  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
+        t9  = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
+        t10 = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));
+
+        t1 = _mm_load_ss(base + align * offset[0]);
+        t1 = _mm_loadh_pi(t1, reinterpret_cast<__m64*>(base + align * offset[0] + 1));
+        t1 = _mm_add_ps(t1, t7);
         _mm_store_ss(base + align * offset[0], t1);
-        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[0] + 1), t1);
+        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[0] + 1), t1);
 
-        t2          = _mm_load_ss(base + align * offset[1]);
-        t2          = _mm_loadh_pi(t2, reinterpret_cast< __m64 *>(base + align * offset[1] + 1));
-        t2          = _mm_add_ps(t2, t8);
+        t2 = _mm_load_ss(base + align * offset[1]);
+        t2 = _mm_loadh_pi(t2, reinterpret_cast<__m64*>(base + align * offset[1] + 1));
+        t2 = _mm_add_ps(t2, t8);
         _mm_store_ss(base + align * offset[1], t2);
-        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[1] + 1), t2);
+        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[1] + 1), t2);
 
-        t3          = _mm_load_ss(base + align * offset[2]);
-        t3          = _mm_loadh_pi(t3, reinterpret_cast< __m64 *>(base + align * offset[2] + 1));
-        t3          = _mm_add_ps(t3, t9);
+        t3 = _mm_load_ss(base + align * offset[2]);
+        t3 = _mm_loadh_pi(t3, reinterpret_cast<__m64*>(base + align * offset[2] + 1));
+        t3 = _mm_add_ps(t3, t9);
         _mm_store_ss(base + align * offset[2], t3);
-        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[2] + 1), t3);
+        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[2] + 1), t3);
 
-        t4          = _mm_load_ss(base + align * offset[3]);
-        t4          = _mm_loadh_pi(t4, reinterpret_cast< __m64 *>(base + align * offset[3] + 1));
-        t4          = _mm_add_ps(t4, t10);
+        t4 = _mm_load_ss(base + align * offset[3]);
+        t4 = _mm_loadh_pi(t4, reinterpret_cast<__m64*>(base + align * offset[3] + 1));
+        t4 = _mm_add_ps(t4, t10);
         _mm_store_ss(base + align * offset[3], t4);
-        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[3] + 1), t4);
+        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[3] + 1), t4);
     }
     else
     {
         // Extra elements means we can use full width-4 load/store operations
 
-        t1  = _mm_unpacklo_ps(v0.simdInternal_, v2.simdInternal_); // x0 z0 x1 z1
-        t2  = _mm_unpackhi_ps(v0.simdInternal_, v2.simdInternal_); // x2 z2 x3 z3
-        t3  = _mm_unpacklo_ps(v1.simdInternal_, _mm_setzero_ps()); // y0  0 y1  0
-        t4  = _mm_unpackhi_ps(v1.simdInternal_, _mm_setzero_ps()); // y2  0 y3  0
-        t5  = _mm_unpacklo_ps(t1, t3);                             // x0 y0 z0  0
-        t6  = _mm_unpackhi_ps(t1, t3);                             // x1 y1 z1  0
-        t7  = _mm_unpacklo_ps(t2, t4);                             // x2 y2 z2  0
-        t8  = _mm_unpackhi_ps(t2, t4);                             // x3 y3 z3  0
+        t1 = _mm_unpacklo_ps(v0.simdInternal_, v2.simdInternal_); // x0 z0 x1 z1
+        t2 = _mm_unpackhi_ps(v0.simdInternal_, v2.simdInternal_); // x2 z2 x3 z3
+        t3 = _mm_unpacklo_ps(v1.simdInternal_, _mm_setzero_ps()); // y0  0 y1  0
+        t4 = _mm_unpackhi_ps(v1.simdInternal_, _mm_setzero_ps()); // y2  0 y3  0
+        t5 = _mm_unpacklo_ps(t1, t3);                             // x0 y0 z0  0
+        t6 = _mm_unpackhi_ps(t1, t3);                             // x1 y1 z1  0
+        t7 = _mm_unpacklo_ps(t2, t4);                             // x2 y2 z2  0
+        t8 = _mm_unpackhi_ps(t2, t4);                             // x3 y3 z3  0
 
         if (align % 4 == 0)
         {
@@ -221,70 +213,70 @@ transposeScatterIncrU(float *              base,
         else
         {
             // alignment >=5, but not a multiple of 4
-            _mm_storeu_ps(base + align * offset[0], _mm_add_ps(_mm_loadu_ps(base + align * offset[0]), t5));
-            _mm_storeu_ps(base + align * offset[1], _mm_add_ps(_mm_loadu_ps(base + align * offset[1]), t6));
-            _mm_storeu_ps(base + align * offset[2], _mm_add_ps(_mm_loadu_ps(base + align * offset[2]), t7));
-            _mm_storeu_ps(base + align * offset[3], _mm_add_ps(_mm_loadu_ps(base + align * offset[3]), t8));
+            _mm_storeu_ps(base + align * offset[0],
+                          _mm_add_ps(_mm_loadu_ps(base + align * offset[0]), t5));
+            _mm_storeu_ps(base + align * offset[1],
+                          _mm_add_ps(_mm_loadu_ps(base + align * offset[1]), t6));
+            _mm_storeu_ps(base + align * offset[2],
+                          _mm_add_ps(_mm_loadu_ps(base + align * offset[2]), t7));
+            _mm_storeu_ps(base + align * offset[3],
+                          _mm_add_ps(_mm_loadu_ps(base + align * offset[3]), t8));
         }
     }
 }
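
transposeScatterIncrU is the read-modify-write variant used for accumulation: lane i of each vector is added to the triplet at base + align * offset[i]. For align >= 4 the code builds (x, y, z, 0) rows so that a full-width load/add/store leaves the padding element unchanged; for align < 4 it falls back to 1+2 element accesses. transposeScatterDecrU below is, as its comment notes, the same operation with subtraction. Scalar semantics (illustrative sketch; hypothetical name):

    #include <cstdint>

    template<int align>
    static void transposeScatterIncrURef(float* base, const std::int32_t offset[],
                                         const float x[4], const float y[4], const float z[4])
    {
        for (int i = 0; i < 4; i++)
        {
            float* p = base + align * offset[i];
            p[0] += x[i];
            p[1] += y[i];
            p[2] += z[i];
        }
    }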
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-transposeScatterDecrU(float *              base,
-                      const std::int32_t   offset[],
-                      SimdFloat            v0,
-                      SimdFloat            v1,
-                      SimdFloat            v2)
+                   transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 {
     // This implementation is identical to the increment version, apart from using subtraction instead
     __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
 
     if (align < 4)
     {
-        t5          = _mm_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
-        t6          = _mm_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
-        t7          = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
-        t8          = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
-        t9          = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
-        t10         = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));
-
-        t1          = _mm_load_ss(base + align * offset[0]);
-        t1          = _mm_loadh_pi(t1, reinterpret_cast< __m64 *>(base + align * offset[0] + 1));
-        t1          = _mm_sub_ps(t1, t7);
+        t5  = _mm_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
+        t6  = _mm_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
+        t7  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
+        t8  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
+        t9  = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
+        t10 = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));
+
+        t1 = _mm_load_ss(base + align * offset[0]);
+        t1 = _mm_loadh_pi(t1, reinterpret_cast<__m64*>(base + align * offset[0] + 1));
+        t1 = _mm_sub_ps(t1, t7);
         _mm_store_ss(base + align * offset[0], t1);
-        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[0] + 1), t1);
+        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[0] + 1), t1);
 
-        t2          = _mm_load_ss(base + align * offset[1]);
-        t2          = _mm_loadh_pi(t2, reinterpret_cast< __m64 *>(base + align * offset[1] + 1));
-        t2          = _mm_sub_ps(t2, t8);
+        t2 = _mm_load_ss(base + align * offset[1]);
+        t2 = _mm_loadh_pi(t2, reinterpret_cast<__m64*>(base + align * offset[1] + 1));
+        t2 = _mm_sub_ps(t2, t8);
         _mm_store_ss(base + align * offset[1], t2);
-        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[1] + 1), t2);
+        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[1] + 1), t2);
 
-        t3          = _mm_load_ss(base + align * offset[2]);
-        t3          = _mm_loadh_pi(t3, reinterpret_cast< __m64 *>(base + align * offset[2] + 1));
-        t3          = _mm_sub_ps(t3, t9);
+        t3 = _mm_load_ss(base + align * offset[2]);
+        t3 = _mm_loadh_pi(t3, reinterpret_cast<__m64*>(base + align * offset[2] + 1));
+        t3 = _mm_sub_ps(t3, t9);
         _mm_store_ss(base + align * offset[2], t3);
-        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[2] + 1), t3);
+        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[2] + 1), t3);
 
-        t4          = _mm_load_ss(base + align * offset[3]);
-        t4          = _mm_loadh_pi(t4, reinterpret_cast< __m64 *>(base + align * offset[3] + 1));
-        t4          = _mm_sub_ps(t4, t10);
+        t4 = _mm_load_ss(base + align * offset[3]);
+        t4 = _mm_loadh_pi(t4, reinterpret_cast<__m64*>(base + align * offset[3] + 1));
+        t4 = _mm_sub_ps(t4, t10);
         _mm_store_ss(base + align * offset[3], t4);
-        _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[3] + 1), t4);
+        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[3] + 1), t4);
     }
     else
     {
         // Extra elements means we can use full width-4 load/store operations
 
-        t1  = _mm_unpacklo_ps(v0.simdInternal_, v2.simdInternal_); // x0 z0 x1 z1
-        t2  = _mm_unpackhi_ps(v0.simdInternal_, v2.simdInternal_); // x2 z2 x3 z3
-        t3  = _mm_unpacklo_ps(v1.simdInternal_, _mm_setzero_ps()); // y0  0 y1  0
-        t4  = _mm_unpackhi_ps(v1.simdInternal_, _mm_setzero_ps()); // y2  0 y3  0
-        t5  = _mm_unpacklo_ps(t1, t3);                             // x0 y0 z0  0
-        t6  = _mm_unpackhi_ps(t1, t3);                             // x1 y1 z1  0
-        t7  = _mm_unpacklo_ps(t2, t4);                             // x2 y2 z2  0
-        t8  = _mm_unpackhi_ps(t2, t4);                             // x3 y3 z3  0
+        t1 = _mm_unpacklo_ps(v0.simdInternal_, v2.simdInternal_); // x0 z0 x1 z1
+        t2 = _mm_unpackhi_ps(v0.simdInternal_, v2.simdInternal_); // x2 z2 x3 z3
+        t3 = _mm_unpacklo_ps(v1.simdInternal_, _mm_setzero_ps()); // y0  0 y1  0
+        t4 = _mm_unpackhi_ps(v1.simdInternal_, _mm_setzero_ps()); // y2  0 y3  0
+        t5 = _mm_unpacklo_ps(t1, t3);                             // x0 y0 z0  0
+        t6 = _mm_unpackhi_ps(t1, t3);                             // x1 y1 z1  0
+        t7 = _mm_unpacklo_ps(t2, t4);                             // x2 y2 z2  0
+        t8 = _mm_unpackhi_ps(t2, t4);                             // x3 y3 z3  0
 
         if (align % 4 == 0)
         {
@@ -297,37 +289,42 @@ transposeScatterDecrU(float *              base,
         else
         {
             // alignment >=5, but not a multiple of 4
-            _mm_storeu_ps(base + align * offset[0], _mm_sub_ps(_mm_loadu_ps(base + align * offset[0]), t5));
-            _mm_storeu_ps(base + align * offset[1], _mm_sub_ps(_mm_loadu_ps(base + align * offset[1]), t6));
-            _mm_storeu_ps(base + align * offset[2], _mm_sub_ps(_mm_loadu_ps(base + align * offset[2]), t7));
-            _mm_storeu_ps(base + align * offset[3], _mm_sub_ps(_mm_loadu_ps(base + align * offset[3]), t8));
+            _mm_storeu_ps(base + align * offset[0],
+                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[0]), t5));
+            _mm_storeu_ps(base + align * offset[1],
+                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[1]), t6));
+            _mm_storeu_ps(base + align * offset[2],
+                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[2]), t7));
+            _mm_storeu_ps(base + align * offset[3],
+                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[3]), t8));
         }
     }
 }
 
 // Override for AVX-128-FMA and higher
 #if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
-static inline void gmx_simdcall
-expandScalarsToTriplets(SimdFloat    scalar,
-                        SimdFloat *  triplets0,
-                        SimdFloat *  triplets1,
-                        SimdFloat *  triplets2)
+static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
+                                                        SimdFloat* triplets0,
+                                                        SimdFloat* triplets1,
+                                                        SimdFloat* triplets2)
 {
-    triplets0->simdInternal_ = _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(1, 0, 0, 0));
-    triplets1->simdInternal_ = _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(2, 2, 1, 1));
-    triplets2->simdInternal_ = _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(3, 3, 3, 2));
+    triplets0->simdInternal_ =
+            _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(1, 0, 0, 0));
+    triplets1->simdInternal_ =
+            _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(2, 2, 1, 1));
+    triplets2->simdInternal_ =
+            _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(3, 3, 3, 2));
 }
 #endif
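
The three shuffles in expandScalarsToTriplets replicate each input scalar three times across the concatenated outputs: (s0 s1 s2 s3) becomes (s0 s0 s0 s1), (s1 s1 s2 s2), (s2 s3 s3 s3), e.g. so that one scalar per interaction can be combined with x/y/z components. A scalar sketch (illustrative only; hypothetical name):

    static void expandScalarsToTripletsRef(const float s[4], float t0[4], float t1[4], float t2[4])
    {
        float out[12];
        for (int k = 0; k < 12; k++)
        {
            out[k] = s[k / 3]; // each scalar repeated three times
        }
        for (int i = 0; i < 4; i++)
        {
            t0[i] = out[i];     // s0 s0 s0 s1
            t1[i] = out[4 + i]; // s1 s1 s2 s2
            t2[i] = out[8 + i]; // s2 s3 s3 s3
        }
    }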
 
 
-template <int align>
-static inline void gmx_simdcall
-gatherLoadBySimdIntTranspose(const float *  base,
-                             SimdFInt32     offset,
-                             SimdFloat *    v0,
-                             SimdFloat *    v1,
-                             SimdFloat *    v2,
-                             SimdFloat *    v3)
+template<int align>
+static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
+                                                             SimdFInt32   offset,
+                                                             SimdFloat*   v0,
+                                                             SimdFloat*   v1,
+                                                             SimdFloat*   v2,
+                                                             SimdFloat*   v3)
 {
     // For present-generation x86 CPUs it appears to be faster to simply
     // store the SIMD integer to memory and then use the normal load operations.
@@ -335,16 +332,13 @@ gatherLoadBySimdIntTranspose(const float *  base,
     // the alignment scaling can often be done as part of the load instruction
     // (which is even cheaper than doing it in SIMD registers).
     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
-    _mm_store_si128( (__m128i *)ioffset, offset.simdInternal_);
+    _mm_store_si128((__m128i*)ioffset, offset.simdInternal_);
     gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
 }
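
The comment above records the design choice: the SIMD integer offsets are spilled to a small aligned scratch array and consumed by the ordinary gather, because on current x86 a single 16-byte store plus scalar reloads is cheaper than extracting the lanes in registers. For comparison, a sketch of the in-register alternative being avoided under plain SSE2 (illustrative only; extractOffsetsInRegisters is a hypothetical name):

    #include <emmintrin.h>
    #include <cstdint>

    static inline void extractOffsetsInRegisters(__m128i offset, std::int32_t out[4])
    {
        // One shuffle plus one move per lane, instead of a single aligned store.
        out[0] = _mm_cvtsi128_si32(offset);
        out[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(offset, _MM_SHUFFLE(1, 1, 1, 1)));
        out[2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(offset, _MM_SHUFFLE(2, 2, 2, 2)));
        out[3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(offset, _MM_SHUFFLE(3, 3, 3, 3)));
    }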
 
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-gatherLoadBySimdIntTranspose(const float *   base,
-                             SimdFInt32      offset,
-                             SimdFloat *     v0,
-                             SimdFloat *     v1)
+                   gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 {
     // For present-generation x86 CPUs it appears to be faster to simply
     // store the SIMD integer to memory and then use the normal load operations.
@@ -352,18 +346,14 @@ gatherLoadBySimdIntTranspose(const float *   base,
     // the alignment scaling can often be done as part of the load instruction
     // (which is even cheaper than doing it in SIMD registers).
     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
-    _mm_store_si128( (__m128i *)ioffset, offset.simdInternal_);
+    _mm_store_si128((__m128i*)ioffset, offset.simdInternal_);
     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 }
 
 
-
-template <int align>
+template<int align>
 static inline void gmx_simdcall
-gatherLoadUBySimdIntTranspose(const float *  base,
-                              SimdFInt32     offset,
-                              SimdFloat *    v0,
-                              SimdFloat *    v1)
+                   gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 {
     // For present-generation x86 CPUs it appears to be faster to simply
     // store the SIMD integer to memory and then use the normal load operations.
@@ -371,18 +361,13 @@ gatherLoadUBySimdIntTranspose(const float *  base,
     // the alignment scaling can often be done as part of the load instruction
     // (which is even cheaper than doing it in SIMD registers).
     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
-    _mm_store_si128( (__m128i *)ioffset, offset.simdInternal_);
+    _mm_store_si128((__m128i*)ioffset, offset.simdInternal_);
     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 }
 
 // Override for AVX-128-FMA and higher
 #if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
-static inline float gmx_simdcall
-reduceIncr4ReturnSum(float *    m,
-                     SimdFloat  v0,
-                     SimdFloat  v1,
-                     SimdFloat  v2,
-                     SimdFloat  v3)
+static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
 {
     _MM_TRANSPOSE4_PS(v0.simdInternal_, v1.simdInternal_, v2.simdInternal_, v3.simdInternal_);
     v0.simdInternal_ = _mm_add_ps(v0.simdInternal_, v1.simdInternal_);
@@ -393,12 +378,13 @@ reduceIncr4ReturnSum(float *    m,
     assert(std::size_t(m) % 16 == 0);
     _mm_store_ps(m, v2.simdInternal_);
 
-    __m128 b = _mm_add_ps(v0.simdInternal_, _mm_shuffle_ps(v0.simdInternal_, v0.simdInternal_, _MM_SHUFFLE(1, 0, 3, 2)));
-    b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 3, 2, 1)));
-    return *reinterpret_cast<float *>(&b);
+    __m128 b = _mm_add_ps(v0.simdInternal_,
+                          _mm_shuffle_ps(v0.simdInternal_, v0.simdInternal_, _MM_SHUFFLE(1, 0, 3, 2)));
+    b        = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *reinterpret_cast<float*>(&b);
 }
 #endif
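
In reduceIncr4ReturnSum, after the 4x4 transpose, summing the four registers leaves the total of input vector i in lane i; those per-vector sums are added into m[0..3] (part of that read-modify-write lies between the hunks shown above), and the final horizontal add returns the grand total. Its documented scalar semantics (illustrative sketch; hypothetical name):

    static float reduceIncr4ReturnSumRef(float m[4], const float v0[4], const float v1[4],
                                         const float v2[4], const float v3[4])
    {
        const float* v[4] = { v0, v1, v2, v3 };
        float        total = 0.0F;
        for (int i = 0; i < 4; i++)
        {
            float s = 0.0F;
            for (int j = 0; j < 4; j++)
            {
                s += v[i][j];
            }
            m[i] += s;  // increment the four consecutive memory locations
            total += s; // grand total over all 16 elements
        }
        return total;
    }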
 
-}      // namespace gmx
+} // namespace gmx
 
 #endif // GMX_SIMD_IMPL_X86_SSE2_UTIL_FLOAT_H