/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017,2018,2019,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_SSE2_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_X86_SSE2_UTIL_FLOAT_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <emmintrin.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_x86_sse2_simd_float.h"

namespace gmx
{

template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
                                                    const std::int32_t offset[],
                                                    SimdFloat*         v0,
                                                    SimdFloat*         v1,
                                                    SimdFloat*         v2,
                                                    SimdFloat*         v3)
{
    assert(std::size_t(base + align * offset[0]) % 16 == 0);
    assert(std::size_t(base + align * offset[1]) % 16 == 0);
    assert(std::size_t(base + align * offset[2]) % 16 == 0);
    assert(std::size_t(base + align * offset[3]) % 16 == 0);

    v0->simdInternal_ = _mm_load_ps(base + align * offset[0]);
    v1->simdInternal_ = _mm_load_ps(base + align * offset[1]);
    v2->simdInternal_ = _mm_load_ps(base + align * offset[2]);
    v3->simdInternal_ = _mm_load_ps(base + align * offset[3]);

    // Transpose so that v0 holds element 0 from each offset, v1 element 1, and so on
    _MM_TRANSPOSE4_PS(v0->simdInternal_, v1->simdInternal_, v2->simdInternal_, v3->simdInternal_);
}

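/* Illustrative sketch (added for clarity, not from the original source): with a
 * hypothetical packed array holding four floats per particle, e.g. {x, y, z, q},
 * the template above could be used as
 *
 *     alignas(16) float  data[4 * N];            // x0 y0 z0 q0 x1 y1 z1 q1 ...
 *     const std::int32_t idx[4] = { 0, 5, 2, 7 };
 *     SimdFloat          x, y, z, q;
 *     gatherLoadTranspose<4>(data, idx, &x, &y, &z, &q);
 *
 * after which x holds {x0, x5, x2, x7}, y holds {y0, y5, y2, y7}, and so on.
 */
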
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
{
    __m128 t1, t2;

    v0->simdInternal_ =
            _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(base + align * offset[0])));
    v1->simdInternal_ =
            _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(base + align * offset[1])));
    t1 = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(base + align * offset[2])));
    t2 = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(base + align * offset[3])));
    t1 = _mm_unpacklo_ps(v0->simdInternal_, t1);
    t2 = _mm_unpacklo_ps(v1->simdInternal_, t2);
    v0->simdInternal_ = _mm_unpacklo_ps(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_ps(t1, t2);
}

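/* Note (added for clarity, not in the original): _mm_load_sd is used here purely as a
 * 64-bit load, so each adjacent pair of floats is fetched with a single instruction,
 * and _mm_castpd_ps is a reinterpretation that generates no code. The two unpack
 * stages then interleave the four pairs into the final {element 0 of each offset} and
 * {element 1 of each offset} vectors.
 */
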
static const int c_simdBestPairAlignmentFloat = 2;

template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
                                                     const std::int32_t offset[],
                                                     SimdFloat*         v0,
                                                     SimdFloat*         v1,
                                                     SimdFloat*         v2)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8;

    if (align % 4 != 0)
    {
        // General case: the gathered addresses are not guaranteed to be 16-byte aligned
        t1 = _mm_loadu_ps(base + align * offset[0]);
        t2 = _mm_loadu_ps(base + align * offset[1]);
        t3 = _mm_loadu_ps(base + align * offset[2]);
        t4 = _mm_loadu_ps(base + align * offset[3]);
    }
    else
    {
        // align is a multiple of 4 floats, so every gathered address is 16-byte aligned
        t1 = _mm_load_ps(base + align * offset[0]);
        t2 = _mm_load_ps(base + align * offset[1]);
        t3 = _mm_load_ps(base + align * offset[2]);
        t4 = _mm_load_ps(base + align * offset[3]);
    }
    t5  = _mm_unpacklo_ps(t1, t2);
    t6  = _mm_unpacklo_ps(t3, t4);
    t7  = _mm_unpackhi_ps(t1, t2);
    t8  = _mm_unpackhi_ps(t3, t4);
    *v0 = _mm_movelh_ps(t5, t6);
    *v1 = _mm_movehl_ps(t6, t5);
    *v2 = _mm_movelh_ps(t7, t8);
}

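/* Illustrative sketch (assumption, not from the original source): a typical use is
 * gathering x/y/z coordinates from a packed xyz array, i.e. three floats per particle
 * with no 16-byte alignment guarantee:
 *
 *     const float*       xyz;        // ... x_i y_i z_i x_{i+1} y_{i+1} z_{i+1} ...
 *     const std::int32_t idx[4];
 *     SimdFloat          x, y, z;
 *     gatherLoadUTranspose<3>(xyz, idx, &x, &y, &z);
 *
 * Note that the implementation loads four floats per offset, so the array must allow
 * reading one element past each triplet.
 */
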
template<int align>
static inline void gmx_simdcall
transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    __m128 t1, t2;

    // No alignment can be assumed: store each {x, y} pair as a 64-bit store and the
    // z element as a separate scalar store
    t1 = _mm_unpacklo_ps(v0.simdInternal_, v1.simdInternal_);
    t2 = _mm_unpackhi_ps(v0.simdInternal_, v1.simdInternal_);
    _mm_storel_pi(reinterpret_cast<__m64*>(base + align * offset[0]), t1);
    _mm_store_ss(base + align * offset[0] + 2, v2.simdInternal_);
    _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[1]), t1);
    _mm_store_ss(base + align * offset[1] + 2,
                 _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(1, 1, 1, 1)));
    _mm_storel_pi(reinterpret_cast<__m64*>(base + align * offset[2]), t2);
    _mm_store_ss(base + align * offset[2] + 2,
                 _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(2, 2, 2, 2)));
    _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[3]), t2);
    _mm_store_ss(base + align * offset[3] + 2,
                 _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(3, 3, 3, 3)));
}

template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;

    if (align < 4)
    {
        // Narrow stride: build {x, unused, y, z} vectors and touch only three floats per offset
        t5  = _mm_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
        t6  = _mm_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
        t7  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
        t8  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
        t9  = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
        t10 = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));

        t1 = _mm_load_ss(base + align * offset[0]);
        t1 = _mm_loadh_pi(t1, reinterpret_cast<__m64*>(base + align * offset[0] + 1));
        t1 = _mm_add_ps(t1, t7);
        _mm_store_ss(base + align * offset[0], t1);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[0] + 1), t1);

        t2 = _mm_load_ss(base + align * offset[1]);
        t2 = _mm_loadh_pi(t2, reinterpret_cast<__m64*>(base + align * offset[1] + 1));
        t2 = _mm_add_ps(t2, t8);
        _mm_store_ss(base + align * offset[1], t2);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[1] + 1), t2);

        t3 = _mm_load_ss(base + align * offset[2]);
        t3 = _mm_loadh_pi(t3, reinterpret_cast<__m64*>(base + align * offset[2] + 1));
        t3 = _mm_add_ps(t3, t9);
        _mm_store_ss(base + align * offset[2], t3);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[2] + 1), t3);

        t4 = _mm_load_ss(base + align * offset[3]);
        t4 = _mm_loadh_pi(t4, reinterpret_cast<__m64*>(base + align * offset[3] + 1));
        t4 = _mm_add_ps(t4, t10);
        _mm_store_ss(base + align * offset[3], t4);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[3] + 1), t4);
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations

        t1 = _mm_unpacklo_ps(v0.simdInternal_, v2.simdInternal_); // x0 z0 x1 z1
        t2 = _mm_unpackhi_ps(v0.simdInternal_, v2.simdInternal_); // x2 z2 x3 z3
        t3 = _mm_unpacklo_ps(v1.simdInternal_, _mm_setzero_ps()); // y0  0 y1  0
        t4 = _mm_unpackhi_ps(v1.simdInternal_, _mm_setzero_ps()); // y2  0 y3  0
        t5 = _mm_unpacklo_ps(t1, t3);                             // x0 y0 z0  0
        t6 = _mm_unpackhi_ps(t1, t3);                             // x1 y1 z1  0
        t7 = _mm_unpacklo_ps(t2, t4);                             // x2 y2 z2  0
        t8 = _mm_unpackhi_ps(t2, t4);                             // x3 y3 z3  0

        if (align % 4 == 0)
        {
            // alignment is a multiple of 4
            _mm_store_ps(base + align * offset[0], _mm_add_ps(_mm_load_ps(base + align * offset[0]), t5));
            _mm_store_ps(base + align * offset[1], _mm_add_ps(_mm_load_ps(base + align * offset[1]), t6));
            _mm_store_ps(base + align * offset[2], _mm_add_ps(_mm_load_ps(base + align * offset[2]), t7));
            _mm_store_ps(base + align * offset[3], _mm_add_ps(_mm_load_ps(base + align * offset[3]), t8));
        }
        else
        {
            // alignment >= 5, but not a multiple of 4
            _mm_storeu_ps(base + align * offset[0],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[0]), t5));
            _mm_storeu_ps(base + align * offset[1],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[1]), t6));
            _mm_storeu_ps(base + align * offset[2],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[2]), t7));
            _mm_storeu_ps(base + align * offset[3],
                          _mm_add_ps(_mm_loadu_ps(base + align * offset[3]), t8));
        }
    }
}

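/* Note (added for clarity, not in the original): in the align < 4 branch above, the
 * shuffled increment vectors t7..t10 have the layout {x, unused, y, z}, so the scalar
 * _mm_store_ss plus the 64-bit _mm_storeh_pi write exactly three floats per offset and
 * the unused lane is never stored. For example, a caller might use align = 3 to
 * accumulate forces into a packed xyz array.
 */
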
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    // This implementation is identical to the increment version, apart from using
    // subtraction instead of addition
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;

    if (align < 4)
    {
        // Narrow stride: build {x, unused, y, z} vectors and touch only three floats per offset
        t5  = _mm_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
        t6  = _mm_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
        t7  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
        t8  = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
        t9  = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
        t10 = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));

        t1 = _mm_load_ss(base + align * offset[0]);
        t1 = _mm_loadh_pi(t1, reinterpret_cast<__m64*>(base + align * offset[0] + 1));
        t1 = _mm_sub_ps(t1, t7);
        _mm_store_ss(base + align * offset[0], t1);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[0] + 1), t1);

        t2 = _mm_load_ss(base + align * offset[1]);
        t2 = _mm_loadh_pi(t2, reinterpret_cast<__m64*>(base + align * offset[1] + 1));
        t2 = _mm_sub_ps(t2, t8);
        _mm_store_ss(base + align * offset[1], t2);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[1] + 1), t2);

        t3 = _mm_load_ss(base + align * offset[2]);
        t3 = _mm_loadh_pi(t3, reinterpret_cast<__m64*>(base + align * offset[2] + 1));
        t3 = _mm_sub_ps(t3, t9);
        _mm_store_ss(base + align * offset[2], t3);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[2] + 1), t3);

        t4 = _mm_load_ss(base + align * offset[3]);
        t4 = _mm_loadh_pi(t4, reinterpret_cast<__m64*>(base + align * offset[3] + 1));
        t4 = _mm_sub_ps(t4, t10);
        _mm_store_ss(base + align * offset[3], t4);
        _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[3] + 1), t4);
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations

        t1 = _mm_unpacklo_ps(v0.simdInternal_, v2.simdInternal_); // x0 z0 x1 z1
        t2 = _mm_unpackhi_ps(v0.simdInternal_, v2.simdInternal_); // x2 z2 x3 z3
        t3 = _mm_unpacklo_ps(v1.simdInternal_, _mm_setzero_ps()); // y0  0 y1  0
        t4 = _mm_unpackhi_ps(v1.simdInternal_, _mm_setzero_ps()); // y2  0 y3  0
        t5 = _mm_unpacklo_ps(t1, t3);                             // x0 y0 z0  0
        t6 = _mm_unpackhi_ps(t1, t3);                             // x1 y1 z1  0
        t7 = _mm_unpacklo_ps(t2, t4);                             // x2 y2 z2  0
        t8 = _mm_unpackhi_ps(t2, t4);                             // x3 y3 z3  0

        if (align % 4 == 0)
        {
            // alignment is a multiple of 4
            _mm_store_ps(base + align * offset[0], _mm_sub_ps(_mm_load_ps(base + align * offset[0]), t5));
            _mm_store_ps(base + align * offset[1], _mm_sub_ps(_mm_load_ps(base + align * offset[1]), t6));
            _mm_store_ps(base + align * offset[2], _mm_sub_ps(_mm_load_ps(base + align * offset[2]), t7));
            _mm_store_ps(base + align * offset[3], _mm_sub_ps(_mm_load_ps(base + align * offset[3]), t8));
        }
        else
        {
            // alignment >= 5, but not a multiple of 4
            _mm_storeu_ps(base + align * offset[0],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[0]), t5));
            _mm_storeu_ps(base + align * offset[1],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[1]), t6));
            _mm_storeu_ps(base + align * offset[2],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[2]), t7));
            _mm_storeu_ps(base + align * offset[3],
                          _mm_sub_ps(_mm_loadu_ps(base + align * offset[3]), t8));
        }
    }
}

// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
                                                        SimdFloat* triplets0,
                                                        SimdFloat* triplets1,
                                                        SimdFloat* triplets2)
{
    triplets0->simdInternal_ =
            _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(1, 0, 0, 0));
    triplets1->simdInternal_ =
            _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(2, 2, 1, 1));
    triplets2->simdInternal_ =
            _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(3, 3, 3, 2));
}
#endif

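/* Note (added for clarity, not in the original): for an input {s0, s1, s2, s3} the
 * three shuffles above produce
 *
 *     triplets0 = {s0, s0, s0, s1}
 *     triplets1 = {s1, s1, s2, s2}
 *     triplets2 = {s2, s3, s3, s3}
 *
 * i.e. each scalar is repeated three times across the concatenated outputs, matching
 * the layout needed when multiplying a per-particle scalar onto packed x/y/z triplets.
 */
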
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
                                                             SimdFInt32   offset,
                                                             SimdFloat*   v0,
                                                             SimdFloat*   v1,
                                                             SimdFloat*   v2,
                                                             SimdFloat*   v3)
{
    // For present-generation x86 CPUs it appears to be faster to simply
    // store the SIMD integer to memory and then use the normal load operations.
    // This is likely because (a) the extract function is expensive, and (b)
    // the alignment scaling can often be done as part of the load instruction
    // (which is even cheaper than doing it in SIMD registers).
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
    _mm_store_si128((__m128i*)ioffset, offset.simdInternal_);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}

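/* For illustration only (added, not part of the original source): the lane-extract
 * alternative that the comment above argues against would, on plain SSE2, need a
 * shuffle plus a vector-to-integer move per lane before each address computation,
 * roughly:
 *
 *     std::int32_t o1 = _mm_cvtsi128_si32(
 *             _mm_shuffle_epi32(offset.simdInternal_, _MM_SHUFFLE(1, 1, 1, 1)));
 *     v1->simdInternal_ = _mm_load_ps(base + align * o1);
 *
 * The store-to-memory approach instead lets the compiler fold the align scaling into
 * the addressing mode of each load.
 */
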
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
{
    // For present-generation x86 CPUs it appears to be faster to simply
    // store the SIMD integer to memory and then use the normal load operations.
    // This is likely because (a) the extract function is expensive, and (b)
    // the alignment scaling can often be done as part of the load instruction
    // (which is even cheaper than doing it in SIMD registers).
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
    _mm_store_si128((__m128i*)ioffset, offset.simdInternal_);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
{
    // For present-generation x86 CPUs it appears to be faster to simply
    // store the SIMD integer to memory and then use the normal load operations.
    // This is likely because (a) the extract function is expensive, and (b)
    // the alignment scaling can often be done as part of the load instruction
    // (which is even cheaper than doing it in SIMD registers).
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
    _mm_store_si128((__m128i*)ioffset, offset.simdInternal_);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
{
    _MM_TRANSPOSE4_PS(v0.simdInternal_, v1.simdInternal_, v2.simdInternal_, v3.simdInternal_);
    v0.simdInternal_ = _mm_add_ps(v0.simdInternal_, v1.simdInternal_);
    v2.simdInternal_ = _mm_add_ps(v2.simdInternal_, v3.simdInternal_);
    v0.simdInternal_ = _mm_add_ps(v0.simdInternal_, v2.simdInternal_);

    assert(std::size_t(m) % 16 == 0);

    v2.simdInternal_ = _mm_add_ps(v0.simdInternal_, _mm_load_ps(m));
    _mm_store_ps(m, v2.simdInternal_);

    __m128 b = _mm_add_ps(v0.simdInternal_,
                          _mm_shuffle_ps(v0.simdInternal_, v0.simdInternal_, _MM_SHUFFLE(1, 0, 3, 2)));
    b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 3, 2, 1)));
    return _mm_cvtss_f32(b);
}
#endif

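/* Note (added for clarity, not in the original): the function above reduces each of
 * v0..v3 to a scalar sum, adds those four sums to m[0..3] (which must be 16-byte
 * aligned), and returns the total of all four sums, e.g. for accumulating four
 * per-group energy or shift-force terms while also producing their overall sum.
 */
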
} // namespace gmx

#endif // GMX_SIMD_IMPL_X86_SSE2_UTIL_FLOAT_H