src/gromacs/simd/impl_x86_sse2/impl_x86_sse2_util_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_IMPL_X86_SSE2_UTIL_FLOAT_H
  36 #define GMX_SIMD_IMPL_X86_SSE2_UTIL_FLOAT_H
  37
  38 #include "config.h"
  39
  40 #include <cassert>
  41 #include <cstddef>
  42 #include <cstdint>
  43
  44 #include <emmintrin.h>
  45
  46 #include "gromacs/utility/basedefinitions.h"
  47
  48 #include "impl_x86_sse2_simd_float.h"
  49
  50
  51 namespace gmx
  52 {
  53
  54 template <int align>
  55 static inline void gmx_simdcall
  56 gatherLoadTranspose(const float *        base,
  57                     const std::int32_t   offset[],
  58                     SimdFloat *          v0,
  59                     SimdFloat *          v1,
  60                     SimdFloat *          v2,
  61                     SimdFloat *          v3)
  62 {
  63     assert(std::size_t(base + align * offset[0]) % 16 == 0);
  64     assert(std::size_t(base + align * offset[1]) % 16 == 0);
  65     assert(std::size_t(base + align * offset[2]) % 16 == 0);
  66     assert(std::size_t(base + align * offset[3]) % 16 == 0);
  67
  68     v0->simdInternal_ = _mm_load_ps( base + align * offset[0] );
  69     v1->simdInternal_ = _mm_load_ps( base + align * offset[1] );
  70     v2->simdInternal_ = _mm_load_ps( base + align * offset[2] );
  71     v3->simdInternal_ = _mm_load_ps( base + align * offset[3] );
  72
  73     _MM_TRANSPOSE4_PS(v0->simdInternal_, v1->simdInternal_, v2->simdInternal_, v3->simdInternal_);
  74 }
  75
  76 template <int align>
  77 static inline void gmx_simdcall
  78 gatherLoadTranspose(const float *        base,
  79                     const std::int32_t   offset[],
  80                     SimdFloat *          v0,
  81                     SimdFloat *          v1)
  82 {
  83     __m128 t1, t2;
  84
  85     v0->simdInternal_ = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[0] ) ));
  86     v1->simdInternal_ = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[1] ) ));
  87     t1                = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[2] ) ));
  88     t2                = _mm_castpd_ps(_mm_load_sd( reinterpret_cast<const double *>( base + align * offset[3] ) ));
  89     t1                = _mm_unpacklo_ps(v0->simdInternal_, t1);
  90     t2                = _mm_unpacklo_ps(v1->simdInternal_, t2);
  91     v0->simdInternal_ = _mm_unpacklo_ps(t1, t2);
  92     v1->simdInternal_ = _mm_unpackhi_ps(t1, t2);
  93 }
  94
  95 static const int c_simdBestPairAlignmentFloat = 2;
  96
  97 template <int align>
  98 static inline void gmx_simdcall
  99 gatherLoadUTranspose(const float *        base,
 100                      const std::int32_t   offset[],
 101                      SimdFloat *          v0,
 102                      SimdFloat *          v1,
 103                      SimdFloat *          v2)
 104 {
 105     __m128 t1, t2, t3, t4, t5, t6, t7, t8;
 106
 107     if (align % 4 != 0)
 108     {
 109         // general case, not aligned to 4-byte boundary
 110         t1                = _mm_loadu_ps( base + align * offset[0] );
 111         t2                = _mm_loadu_ps( base + align * offset[1] );
 112         t3                = _mm_loadu_ps( base + align * offset[2] );
 113         t4                = _mm_loadu_ps( base + align * offset[3] );
 114     }
 115     else
 116     {
 117         // aligned to 4-byte boundary or more
 118         t1                = _mm_load_ps( base + align * offset[0] );
 119         t2                = _mm_load_ps( base + align * offset[1] );
 120         t3                = _mm_load_ps( base + align * offset[2] );
 121         t4                = _mm_load_ps( base + align * offset[3] );
 122     }
 123     t5                = _mm_unpacklo_ps(t1, t2);
 124     t6                = _mm_unpacklo_ps(t3, t4);
 125     t7                = _mm_unpackhi_ps(t1, t2);
 126     t8                = _mm_unpackhi_ps(t3, t4);
 127     *v0               = _mm_movelh_ps(t5, t6);
 128     *v1               = _mm_movehl_ps(t6, t5);
 129     *v2               = _mm_movelh_ps(t7, t8);
 130 }
 131
 132
 133 template <int align>
 134 static inline void gmx_simdcall
 135 transposeScatterStoreU(float *              base,
 136                        const std::int32_t   offset[],
 137                        SimdFloat            v0,
 138                        SimdFloat            v1,
 139                        SimdFloat            v2)
 140 {
 141     __m128 t1, t2;
 142
 143     // general case, not aligned to 4-byte boundary
 144     t1   = _mm_unpacklo_ps(v0.simdInternal_, v1.simdInternal_);
 145     t2   = _mm_unpackhi_ps(v0.simdInternal_, v1.simdInternal_);
 146     _mm_storel_pi( reinterpret_cast< __m64 *>( base + align * offset[0] ), t1);
 147     _mm_store_ss(base + align * offset[0] + 2, v2.simdInternal_);
 148     _mm_storeh_pi( reinterpret_cast< __m64 *>( base + align * offset[1] ), t1);
 149     _mm_store_ss(base + align * offset[1] + 2, _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(1, 1, 1, 1)));
 150     _mm_storel_pi( reinterpret_cast< __m64 *>( base + align * offset[2] ), t2);
 151     _mm_store_ss(base + align * offset[2] + 2, _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(2, 2, 2, 2)));
 152     _mm_storeh_pi( reinterpret_cast< __m64 *>( base + align * offset[3] ), t2);
 153     _mm_store_ss(base + align * offset[3] + 2, _mm_shuffle_ps(v2.simdInternal_, v2.simdInternal_, _MM_SHUFFLE(3, 3, 3, 3)));
 154 }
 155
 156
 157 template <int align>
 158 static inline void gmx_simdcall
 159 transposeScatterIncrU(float *              base,
 160                       const std::int32_t   offset[],
 161                       SimdFloat            v0,
 162                       SimdFloat            v1,
 163                       SimdFloat            v2)
 164 {
 165     __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
 166
 167     if (align < 4)
 168     {
 169         t5          = _mm_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
 170         t6          = _mm_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
 171         t7          = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
 172         t8          = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
 173         t9          = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
 174         t10         = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));
 175
 176         t1          = _mm_load_ss(base + align * offset[0]);
 177         t1          = _mm_loadh_pi(t1, reinterpret_cast< __m64 *>(base + align * offset[0] + 1));
 178         t1          = _mm_add_ps(t1, t7);
 179         _mm_store_ss(base + align * offset[0], t1);
 180         _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[0] + 1), t1);
 181
 182         t2          = _mm_load_ss(base + align * offset[1]);
 183         t2          = _mm_loadh_pi(t2, reinterpret_cast< __m64 *>(base + align * offset[1] + 1));
 184         t2          = _mm_add_ps(t2, t8);
 185         _mm_store_ss(base + align * offset[1], t2);
 186         _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[1] + 1), t2);
 187
 188         t3          = _mm_load_ss(base + align * offset[2]);
 189         t3          = _mm_loadh_pi(t3, reinterpret_cast< __m64 *>(base + align * offset[2] + 1));
 190         t3          = _mm_add_ps(t3, t9);
 191         _mm_store_ss(base + align * offset[2], t3);
 192         _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[2] + 1), t3);
 193
 194         t4          = _mm_load_ss(base + align * offset[3]);
 195         t4          = _mm_loadh_pi(t4, reinterpret_cast< __m64 *>(base + align * offset[3] + 1));
 196         t4          = _mm_add_ps(t4, t10);
 197         _mm_store_ss(base + align * offset[3], t4);
 198         _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[3] + 1), t4);
 199     }
 200     else
 201     {
 202         // Extra elements means we can use full width-4 load/store operations
 203
 204         t1  = _mm_unpacklo_ps(v0.simdInternal_, v2.simdInternal_); // x0 z0 x1 z1
 205         t2  = _mm_unpackhi_ps(v0.simdInternal_, v2.simdInternal_); // x2 z2 x3 z3
 206         t3  = _mm_unpacklo_ps(v1.simdInternal_, _mm_setzero_ps()); // y0  0 y1  0
 207         t4  = _mm_unpackhi_ps(v1.simdInternal_, _mm_setzero_ps()); // y2  0 y3  0
 208         t5  = _mm_unpacklo_ps(t1, t3);                             // x0 y0 z0  0
 209         t6  = _mm_unpackhi_ps(t1, t3);                             // x1 y1 z1  0
 210         t7  = _mm_unpacklo_ps(t2, t4);                             // x2 y2 z2  0
 211         t8  = _mm_unpackhi_ps(t2, t4);                             // x3 y3 z3  0
 212
 213         if (align % 4 == 0)
 214         {
 215             // alignment is a multiple of 4
 216             _mm_store_ps(base + align * offset[0], _mm_add_ps(_mm_load_ps(base + align * offset[0]), t5));
 217             _mm_store_ps(base + align * offset[1], _mm_add_ps(_mm_load_ps(base + align * offset[1]), t6));
 218             _mm_store_ps(base + align * offset[2], _mm_add_ps(_mm_load_ps(base + align * offset[2]), t7));
 219             _mm_store_ps(base + align * offset[3], _mm_add_ps(_mm_load_ps(base + align * offset[3]), t8));
 220         }
 221         else
 222         {
 223             // alignment >=5, but not a multiple of 4
 224             _mm_storeu_ps(base + align * offset[0], _mm_add_ps(_mm_loadu_ps(base + align * offset[0]), t5));
 225             _mm_storeu_ps(base + align * offset[1], _mm_add_ps(_mm_loadu_ps(base + align * offset[1]), t6));
 226             _mm_storeu_ps(base + align * offset[2], _mm_add_ps(_mm_loadu_ps(base + align * offset[2]), t7));
 227             _mm_storeu_ps(base + align * offset[3], _mm_add_ps(_mm_loadu_ps(base + align * offset[3]), t8));
 228         }
 229     }
 230 }
 231
 232 template <int align>
 233 static inline void gmx_simdcall
 234 transposeScatterDecrU(float *              base,
 235                       const std::int32_t   offset[],
 236                       SimdFloat            v0,
 237                       SimdFloat            v1,
 238                       SimdFloat            v2)
 239 {
 240     // This implementation is identical to the increment version, apart from using subtraction instead
 241     __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
 242
 243     if (align < 4)
 244     {
 245         t5          = _mm_unpacklo_ps(v1.simdInternal_, v2.simdInternal_);
 246         t6          = _mm_unpackhi_ps(v1.simdInternal_, v2.simdInternal_);
 247         t7          = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(1, 0, 0, 0));
 248         t8          = _mm_shuffle_ps(v0.simdInternal_, t5, _MM_SHUFFLE(3, 2, 0, 1));
 249         t9          = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(1, 0, 0, 2));
 250         t10         = _mm_shuffle_ps(v0.simdInternal_, t6, _MM_SHUFFLE(3, 2, 0, 3));
 251
 252         t1          = _mm_load_ss(base + align * offset[0]);
 253         t1          = _mm_loadh_pi(t1, reinterpret_cast< __m64 *>(base + align * offset[0] + 1));
 254         t1          = _mm_sub_ps(t1, t7);
 255         _mm_store_ss(base + align * offset[0], t1);
 256         _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[0] + 1), t1);
 257
 258         t2          = _mm_load_ss(base + align * offset[1]);
 259         t2          = _mm_loadh_pi(t2, reinterpret_cast< __m64 *>(base + align * offset[1] + 1));
 260         t2          = _mm_sub_ps(t2, t8);
 261         _mm_store_ss(base + align * offset[1], t2);
 262         _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[1] + 1), t2);
 263
 264         t3          = _mm_load_ss(base + align * offset[2]);
 265         t3          = _mm_loadh_pi(t3, reinterpret_cast< __m64 *>(base + align * offset[2] + 1));
 266         t3          = _mm_sub_ps(t3, t9);
 267         _mm_store_ss(base + align * offset[2], t3);
 268         _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[2] + 1), t3);
 269
 270         t4          = _mm_load_ss(base + align * offset[3]);
 271         t4          = _mm_loadh_pi(t4, reinterpret_cast< __m64 *>(base + align * offset[3] + 1));
 272         t4          = _mm_sub_ps(t4, t10);
 273         _mm_store_ss(base + align * offset[3], t4);
 274         _mm_storeh_pi(reinterpret_cast< __m64 *>(base + align * offset[3] + 1), t4);
 275     }
 276     else
 277     {
 278         // Extra elements means we can use full width-4 load/store operations
 279
 280         t1  = _mm_unpacklo_ps(v0.simdInternal_, v2.simdInternal_); // x0 z0 x1 z1
 281         t2  = _mm_unpackhi_ps(v0.simdInternal_, v2.simdInternal_); // x2 z2 x3 z3
 282         t3  = _mm_unpacklo_ps(v1.simdInternal_, _mm_setzero_ps()); // y0  0 y1  0
 283         t4  = _mm_unpackhi_ps(v1.simdInternal_, _mm_setzero_ps()); // y2  0 y3  0
 284         t5  = _mm_unpacklo_ps(t1, t3);                             // x0 y0 z0  0
 285         t6  = _mm_unpackhi_ps(t1, t3);                             // x1 y1 z1  0
 286         t7  = _mm_unpacklo_ps(t2, t4);                             // x2 y2 z2  0
 287         t8  = _mm_unpackhi_ps(t2, t4);                             // x3 y3 z3  0
 288
 289         if (align % 4 == 0)
 290         {
 291             // alignment is a multiple of 4
 292             _mm_store_ps(base + align * offset[0], _mm_sub_ps(_mm_load_ps(base + align * offset[0]), t5));
 293             _mm_store_ps(base + align * offset[1], _mm_sub_ps(_mm_load_ps(base + align * offset[1]), t6));
 294             _mm_store_ps(base + align * offset[2], _mm_sub_ps(_mm_load_ps(base + align * offset[2]), t7));
 295             _mm_store_ps(base + align * offset[3], _mm_sub_ps(_mm_load_ps(base + align * offset[3]), t8));
 296         }
 297         else
 298         {
 299             // alignment >=5, but not a multiple of 4
 300             _mm_storeu_ps(base + align * offset[0], _mm_sub_ps(_mm_loadu_ps(base + align * offset[0]), t5));
 301             _mm_storeu_ps(base + align * offset[1], _mm_sub_ps(_mm_loadu_ps(base + align * offset[1]), t6));
 302             _mm_storeu_ps(base + align * offset[2], _mm_sub_ps(_mm_loadu_ps(base + align * offset[2]), t7));
 303             _mm_storeu_ps(base + align * offset[3], _mm_sub_ps(_mm_loadu_ps(base + align * offset[3]), t8));
 304         }
 305     }
 306 }
 307
 308 // Override for AVX-128-FMA and higher
 309 #if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
 310 static inline void gmx_simdcall
 311 expandScalarsToTriplets(SimdFloat    scalar,
 312                         SimdFloat *  triplets0,
 313                         SimdFloat *  triplets1,
 314                         SimdFloat *  triplets2)
 315 {
 316     triplets0->simdInternal_ = _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(1, 0, 0, 0));
 317     triplets1->simdInternal_ = _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(2, 2, 1, 1));
 318     triplets2->simdInternal_ = _mm_shuffle_ps(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE(3, 3, 3, 2));
 319 }
 320 #endif
 321
 322
 323 template <int align>
 324 static inline void gmx_simdcall
 325 gatherLoadBySimdIntTranspose(const float *  base,
 326                              SimdFInt32     offset,
 327                              SimdFloat *    v0,
 328                              SimdFloat *    v1,
 329                              SimdFloat *    v2,
 330                              SimdFloat *    v3)
 331 {
 332     // For present-generation x86 CPUs it appears to be faster to simply
 333     // store the SIMD integer to memory and then use the normal load operations.
 334     // This is likely because (a) the extract function is expensive, and (b)
 335     // the alignment scaling can often be done as part of the load instruction
 336     // (which is even cheaper than doing it in SIMD registers).
 337     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 338     _mm_store_si128( (__m128i *)ioffset, offset.simdInternal_);
 339     gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
 340 }
 341
 342 template <int align>
 343 static inline void gmx_simdcall
 344 gatherLoadBySimdIntTranspose(const float *   base,
 345                              SimdFInt32      offset,
 346                              SimdFloat *     v0,
 347                              SimdFloat *     v1)
 348 {
 349     // For present-generation x86 CPUs it appears to be faster to simply
 350     // store the SIMD integer to memory and then use the normal load operations.
 351     // This is likely because (a) the extract function is expensive, and (b)
 352     // the alignment scaling can often be done as part of the load instruction
 353     // (which is even cheaper than doing it in SIMD registers).
 354     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 355     _mm_store_si128( (__m128i *)ioffset, offset.simdInternal_);
 356     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 357 }
 358
 359
 360
 361 template <int align>
 362 static inline void gmx_simdcall
 363 gatherLoadUBySimdIntTranspose(const float *  base,
 364                               SimdFInt32     offset,
 365                               SimdFloat *    v0,
 366                               SimdFloat *    v1)
 367 {
 368     // For present-generation x86 CPUs it appears to be faster to simply
 369     // store the SIMD integer to memory and then use the normal load operations.
 370     // This is likely because (a) the extract function is expensive, and (b)
 371     // the alignment scaling can often be done as part of the load instruction
 372     // (which is even cheaper than doing it in SIMD registers).
 373     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 374     _mm_store_si128( (__m128i *)ioffset, offset.simdInternal_);
 375     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 376 }
 377
 378 // Override for AVX-128-FMA and higher
 379 #if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
 380 static inline float gmx_simdcall
 381 reduceIncr4ReturnSum(float *    m,
 382                      SimdFloat  v0,
 383                      SimdFloat  v1,
 384                      SimdFloat  v2,
 385                      SimdFloat  v3)
 386 {
 387     _MM_TRANSPOSE4_PS(v0.simdInternal_, v1.simdInternal_, v2.simdInternal_, v3.simdInternal_);
 388     v0.simdInternal_ = _mm_add_ps(v0.simdInternal_, v1.simdInternal_);
 389     v2.simdInternal_ = _mm_add_ps(v2.simdInternal_, v3.simdInternal_);
 390     v0.simdInternal_ = _mm_add_ps(v0.simdInternal_, v2.simdInternal_);
 391     v2.simdInternal_ = _mm_add_ps(v0.simdInternal_, _mm_load_ps(m));
 392
 393     assert(std::size_t(m) % 16 == 0);
 394     _mm_store_ps(m, v2.simdInternal_);
 395
 396     __m128 b = _mm_add_ps(v0.simdInternal_, _mm_shuffle_ps(v0.simdInternal_, v0.simdInternal_, _MM_SHUFFLE(1, 0, 3, 2)));
 397     b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 3, 2, 1)));
 398     return *reinterpret_cast<float *>(&b);
 399 }
 400 #endif
 401
 402 }      // namespace gmx
 403
 404 #endif // GMX_SIMD_IMPL_X86_SSE2_UTIL_FLOAT_H