/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2019,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
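/*! \internal \file
 * \brief SSE2 double-precision implementation of the higher-level GROMACS
 *        SIMD utilities: transposed gather/scatter loads and stores,
 *        scalar-to-triplet expansion, and incrementing reductions.
 */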
#ifndef GMX_SIMD_IMPL_X86_SSE2_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_X86_SSE2_UTIL_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <emmintrin.h>

#include "impl_x86_sse2_simd_double.h"

namespace gmx
{
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
                                                    const std::int32_t offset[],
                                                    SimdDouble*        v0,
                                                    SimdDouble*        v1,
                                                    SimdDouble*        v2,
                                                    SimdDouble*        v3)
{
    __m128d t1, t2, t3, t4;

    assert(std::size_t(base + align * offset[0]) % 16 == 0);
    assert(std::size_t(base + align * offset[1]) % 16 == 0);

    t1                = _mm_load_pd(base + align * offset[0]);
    t2                = _mm_load_pd(base + align * offset[1]);
    t3                = _mm_load_pd(base + align * offset[0] + 2);
    t4                = _mm_load_pd(base + align * offset[1] + 2);
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
    v2->simdInternal_ = _mm_unpacklo_pd(t3, t4);
    v3->simdInternal_ = _mm_unpackhi_pd(t3, t4);
}
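/* Illustration of the unpack-based transpose above: with two packed entries
 * in memory,
 *     base + align*offset[0] -> { a0, a1, a2, a3 }
 *     base + align*offset[1] -> { b0, b1, b2, b3 }
 * _mm_unpacklo_pd/_mm_unpackhi_pd interleave the low and high lanes, so the
 * outputs become v0 = { a0, b0 }, v1 = { a1, b1 }, v2 = { a2, b2 },
 * v3 = { a3, b3 }, i.e. one SIMD register per field across both entries.
 */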
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2;

    assert(std::size_t(base + align * offset[0]) % 16 == 0);
    assert(std::size_t(base + align * offset[1]) % 16 == 0);

    t1                = _mm_load_pd(base + align * offset[0]);
    t2                = _mm_load_pd(base + align * offset[1]);
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
}
// Pair data is best packed with stride 2, matching the SIMD width of 2 doubles
static const int c_simdBestPairAlignmentDouble = 2;
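/* Usage sketch for the pair layout (illustrative only; the names `params`,
 * `offsets`, `c6` and `c12` are hypothetical, not part of the API):
 * \code
 *   // params stores {c6, c12} pairs with stride c_simdBestPairAlignmentDouble
 *   SimdDouble c6, c12;
 *   gatherLoadTranspose<c_simdBestPairAlignmentDouble>(params, offsets, &c6, &c12);
 * \endcode
 */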
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
                                                     const std::int32_t offset[],
                                                     SimdDouble*        v0,
                                                     SimdDouble*        v1,
                                                     SimdDouble*        v2)
{
    __m128d t1, t2, t3, t4;

    t1                = _mm_loadu_pd(base + align * offset[0]);
    t2                = _mm_loadu_pd(base + align * offset[1]);
    t3                = _mm_load_sd(base + align * offset[0] + 2);
    t4                = _mm_load_sd(base + align * offset[1] + 2);
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
    v2->simdInternal_ = _mm_unpacklo_pd(t3, t4);
}
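/* The third component above is loaded with a scalar _mm_load_sd rather than
 * as part of a wider load, so only the three doubles that actually belong to
 * each triplet are touched; this matters when a triplet sits at the very end
 * of an allocation.
 */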
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(double*            base,
                                                       const std::int32_t offset[],
                                                       SimdDouble         v0,
                                                       SimdDouble         v1,
                                                       SimdDouble         v2)
{
    __m128d t1, t2;

    t1 = _mm_unpacklo_pd(v0.simdInternal_, v1.simdInternal_);
    t2 = _mm_unpackhi_pd(v0.simdInternal_, v1.simdInternal_);
    _mm_storeu_pd(base + align * offset[0], t1);
    _mm_store_sd(base + align * offset[0] + 2, v2.simdInternal_);
    _mm_storeu_pd(base + align * offset[1], t2);
    _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[1] + 2), _mm_castpd_ps(v2.simdInternal_));
}
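/* Note the last store above: SSE2 has no direct "store the upper double"
 * instruction, so the register is reinterpreted as four floats and
 * _mm_storeh_pi writes its upper 64 bits, which are exactly the bits of the
 * second double in v2. This avoids an extra shuffle.
 */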
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t5 = _mm_unpacklo_pd(v0.simdInternal_, v1.simdInternal_);
    t6 = _mm_unpackhi_pd(v0.simdInternal_, v1.simdInternal_);
    t7 = _mm_unpackhi_pd(v2.simdInternal_, v2.simdInternal_);

    t1 = _mm_loadu_pd(base + align * offset[0]);
    t2 = _mm_load_sd(base + align * offset[0] + 2);
    t1 = _mm_add_pd(t1, t5);
    t2 = _mm_add_sd(t2, v2.simdInternal_);
    _mm_storeu_pd(base + align * offset[0], t1);
    _mm_store_sd(base + align * offset[0] + 2, t2);

    t3 = _mm_loadu_pd(base + align * offset[1]);
    t4 = _mm_load_sd(base + align * offset[1] + 2);
    t3 = _mm_add_pd(t3, t6);
    t4 = _mm_add_sd(t4, t7);
    _mm_storeu_pd(base + align * offset[1], t3);
    _mm_store_sd(base + align * offset[1] + 2, t4);
}
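/* Semantics of the increment scatter: for each of the two entries i,
 * base[align*offset[i] + 0..2] += { v0[i], v1[i], v2[i] }, i.e. the
 * transposed triplet is added to, rather than overwriting, memory - the
 * common pattern when accumulating per-atom forces.
 */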
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    // This implementation is identical to the increment version,
    // apart from using subtraction instead of addition.
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t5 = _mm_unpacklo_pd(v0.simdInternal_, v1.simdInternal_);
    t6 = _mm_unpackhi_pd(v0.simdInternal_, v1.simdInternal_);
    t7 = _mm_unpackhi_pd(v2.simdInternal_, v2.simdInternal_);

    t1 = _mm_loadu_pd(base + align * offset[0]);
    t2 = _mm_load_sd(base + align * offset[0] + 2);
    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, v2.simdInternal_);
    _mm_storeu_pd(base + align * offset[0], t1);
    _mm_store_sd(base + align * offset[0] + 2, t2);

    t3 = _mm_loadu_pd(base + align * offset[1]);
    t4 = _mm_load_sd(base + align * offset[1] + 2);
    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);
    _mm_storeu_pd(base + align * offset[1], t3);
    _mm_store_sd(base + align * offset[1] + 2, t4);
}
// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
                                                        SimdDouble* triplets0,
                                                        SimdDouble* triplets1,
                                                        SimdDouble* triplets2)
{
    triplets0->simdInternal_ =
            _mm_shuffle_pd(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE2(0, 0));
    triplets1->simdInternal_ =
            _mm_shuffle_pd(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE2(1, 0));
    triplets2->simdInternal_ =
            _mm_shuffle_pd(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE2(1, 1));
}
#endif
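/* Example of the expansion above: for scalar = { a, b } the three outputs are
 * triplets0 = { a, a }, triplets1 = { a, b }, triplets2 = { b, b }, i.e. the
 * concatenated sequence a,a,a,b,b,b. This matches one scalar (such as a
 * charge) against each x,y,z triplet.
 */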
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
                                                             SimdDInt32    offset,
                                                             SimdDouble*   v0,
                                                             SimdDouble*   v1,
                                                             SimdDouble*   v2,
                                                             SimdDouble*   v3)
{
    __m128d t1, t2, t3, t4;

    // Use optimized bit-shift multiply for the most common alignments
    if (align == 4)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 3);
    }
    else if (align == 12)
    {
        // multiply by 3, then by 4
        offset.simdInternal_ =
                _mm_add_epi32(offset.simdInternal_, _mm_slli_epi32(offset.simdInternal_, 1));
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 2);
    }
    else if (align == 16)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 4);
    }

    if (align == 4 || align == 8 || align == 12 || align == 16)
    {
        assert(std::size_t(base + extract<0>(offset)) % 16 == 0);
        assert(std::size_t(base + extract<1>(offset)) % 16 == 0);

        t1 = _mm_load_pd(base + extract<0>(offset));
        t2 = _mm_load_pd(base + extract<1>(offset));
        t3 = _mm_load_pd(base + extract<0>(offset) + 2);
        t4 = _mm_load_pd(base + extract<1>(offset) + 2);
    }
    else
    {
        assert(std::size_t(base + align * extract<0>(offset)) % 16 == 0);
        assert(std::size_t(base + align * extract<1>(offset)) % 16 == 0);

        t1 = _mm_load_pd(base + align * extract<0>(offset));
        t2 = _mm_load_pd(base + align * extract<1>(offset));
        t3 = _mm_load_pd(base + align * extract<0>(offset) + 2);
        t4 = _mm_load_pd(base + align * extract<1>(offset) + 2);
    }
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
    v2->simdInternal_ = _mm_unpacklo_pd(t3, t4);
    v3->simdInternal_ = _mm_unpackhi_pd(t3, t4);
}
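/* Worked example of the bit-shift multiply above: for align == 12, the index
 * scaling offset * 12 is computed as (offset + (offset << 1)) << 2, i.e.
 * offset * 3 * 4, using only adds and shifts, because SSE2 lacks a 32-bit
 * integer multiply (_mm_mullo_epi32 only arrived with SSE4.1).
 */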
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2;

    // Use optimized bit-shift multiply for the most common alignments
    if (align == 2)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 1);
    }
    else if (align == 4)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 2);
    }
    else if (align == 6)
    {
        // multiply by 3, then by 2
        offset.simdInternal_ =
                _mm_add_epi32(offset.simdInternal_, _mm_slli_epi32(offset.simdInternal_, 1));
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 1);
    }
    else if (align == 8)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 3);
    }
    else if (align == 12)
    {
        // multiply by 3, then by 4
        offset.simdInternal_ =
                _mm_add_epi32(offset.simdInternal_, _mm_slli_epi32(offset.simdInternal_, 1));
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 2);
    }
    else if (align == 16)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 4);
    }

    if (align == 2 || align == 4 || align == 6 || align == 8 || align == 12 || align == 16)
    {
        assert(std::size_t(base + extract<0>(offset)) % 16 == 0);
        assert(std::size_t(base + extract<1>(offset)) % 16 == 0);

        t1 = _mm_load_pd(base + extract<0>(offset));
        t2 = _mm_load_pd(base + extract<1>(offset));
    }
    else
    {
        assert(std::size_t(base + align * extract<0>(offset)) % 16 == 0);
        assert(std::size_t(base + align * extract<1>(offset)) % 16 == 0);

        t1 = _mm_load_pd(base + align * extract<0>(offset));
        t2 = _mm_load_pd(base + align * extract<1>(offset));
    }
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
}
template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2;

    // Use optimized bit-shift multiply for the most common alignments;
    // do nothing for align == 1.
    if (align == 2)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 1);
    }
    else if (align == 4)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 2);
    }

    if (align == 1 || align == 2 || align == 4)
    {
        t1 = _mm_loadu_pd(base + extract<0>(offset));
        t2 = _mm_loadu_pd(base + extract<1>(offset));
    }
    else
    {
        t1 = _mm_loadu_pd(base + align * extract<0>(offset));
        t2 = _mm_loadu_pd(base + align * extract<1>(offset));
    }
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
}
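/* Unlike the aligned variants above, no alignment assertions are needed here:
 * _mm_loadu_pd tolerates any address, and align == 1 (fully packed pairs)
 * requires no index scaling at all.
 */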
// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline double gmx_simdcall
reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
{
    __m128d t1, t2, t3, t4;

    t1 = _mm_unpacklo_pd(v0.simdInternal_, v1.simdInternal_);
    t2 = _mm_unpackhi_pd(v0.simdInternal_, v1.simdInternal_);
    t3 = _mm_unpacklo_pd(v2.simdInternal_, v3.simdInternal_);
    t4 = _mm_unpackhi_pd(v2.simdInternal_, v3.simdInternal_);

    t1 = _mm_add_pd(t1, t2);
    t3 = _mm_add_pd(t3, t4);

    assert(std::size_t(m) % 16 == 0);

    t2 = _mm_add_pd(t1, _mm_load_pd(m));
    t4 = _mm_add_pd(t3, _mm_load_pd(m + 2));
    _mm_store_pd(m, t2);
    _mm_store_pd(m + 2, t4);

    t1 = _mm_add_pd(t1, t3);

    t2 = _mm_add_sd(t1, _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1)));
    return *reinterpret_cast<double*>(&t2);
}
#endif
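/* Semantics of the reduction above: the horizontal sum of each of v0..v3 is
 * added to m[0..3] respectively, and the grand total over all four registers
 * is returned. A minimal usage sketch (the names `fshift`, `fx`..`fw` and
 * `energy` are hypothetical, not part of the API):
 * \code
 *   alignas(16) double fshift[4] = { 0.0, 0.0, 0.0, 0.0 };
 *   double energy = reduceIncr4ReturnSum(fshift, fx, fy, fz, fw);
 * \endcode
 */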
} // namespace gmx

#endif // GMX_SIMD_IMPL_X86_SSE2_UTIL_DOUBLE_H