src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H
  37 #define GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H
  38
  39 #include "config.h"
  40
  41 #include <cassert>
  42 #include <cstddef>
  43 #include <cstdint>
  44
  45 #include <immintrin.h>
  46
  47 #include "gromacs/utility/basedefinitions.h"
  48
  49 #include "impl_x86_avx_256_simd_double.h"
  50
  51 namespace gmx
  52 {
  53
  54 // Internal utility function: Full 4x4 transpose of __m256d
  55 static inline void gmx_simdcall
  56 avx256Transpose4By4(__m256d * v0,
  57                     __m256d * v1,
  58                     __m256d * v2,
  59                     __m256d * v3)
  60 {
  61     __m256d t1 = _mm256_unpacklo_pd(*v0, *v1);
  62     __m256d t2 = _mm256_unpackhi_pd(*v0, *v1);
  63     __m256d t3 = _mm256_unpacklo_pd(*v2, *v3);
  64     __m256d t4 = _mm256_unpackhi_pd(*v2, *v3);
  65     *v0        = _mm256_permute2f128_pd(t1, t3, 0x20);
  66     *v1        = _mm256_permute2f128_pd(t2, t4, 0x20);
  67     *v2        = _mm256_permute2f128_pd(t1, t3, 0x31);
  68     *v3        = _mm256_permute2f128_pd(t2, t4, 0x31);
  69 }
  70
  71 template <int align>
  72 static inline void gmx_simdcall
  73 gatherLoadTranspose(const double *        base,
  74                     const std::int32_t    offset[],
  75                     SimdDouble *          v0,
  76                     SimdDouble *          v1,
  77                     SimdDouble *          v2,
  78                     SimdDouble *          v3)
  79 {
  80     assert(std::size_t(offset) % 16 == 0);
  81     assert(std::size_t(base) % 32 == 0);
  82     assert(align % 4 == 0);
  83
  84     v0->simdInternal_ = _mm256_load_pd( base + align * offset[0] );
  85     v1->simdInternal_ = _mm256_load_pd( base + align * offset[1] );
  86     v2->simdInternal_ = _mm256_load_pd( base + align * offset[2] );
  87     v3->simdInternal_ = _mm256_load_pd( base + align * offset[3] );
  88     avx256Transpose4By4(&v0->simdInternal_, &v1->simdInternal_, &v2->simdInternal_, &v3->simdInternal_);
  89 }
  90
  91 template <int align>
  92 static inline void gmx_simdcall
  93 gatherLoadTranspose(const double *        base,
  94                     const std::int32_t    offset[],
  95                     SimdDouble *          v0,
  96                     SimdDouble *          v1)
  97 {
  98     __m128d t1, t2, t3, t4;
  99     __m256d tA, tB;
 100
 101     assert(std::size_t(offset) % 16 == 0);
 102     assert(std::size_t(base) % 16 == 0);
 103     assert(align % 2 == 0);
 104
 105     t1   = _mm_load_pd( base + align * offset[0] );
 106     t2   = _mm_load_pd( base + align * offset[1] );
 107     t3   = _mm_load_pd( base + align * offset[2] );
 108     t4   = _mm_load_pd( base + align * offset[3] );
 109     tA   = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
 110     tB   = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);
 111
 112     v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
 113     v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
 114 }
 115
 116 static const int c_simdBestPairAlignmentDouble = 2;
 117
 118 template <int align>
 119 static inline void gmx_simdcall
 120 gatherLoadUTranspose(const double *        base,
 121                      const std::int32_t    offset[],
 122                      SimdDouble *          v0,
 123                      SimdDouble *          v1,
 124                      SimdDouble *          v2)
 125 {
 126     assert(std::size_t(offset) % 16 == 0);
 127
 128     __m256d t1, t2, t3, t4, t5, t6, t7, t8;
 129     if (align % 4 == 0)
 130     {
 131         t1                = _mm256_load_pd(base + align * offset[0]);
 132         t2                = _mm256_load_pd(base + align * offset[1]);
 133         t3                = _mm256_load_pd(base + align * offset[2]);
 134         t4                = _mm256_load_pd(base + align * offset[3]);
 135     }
 136     else
 137     {
 138         t1                = _mm256_loadu_pd(base + align * offset[0]);
 139         t2                = _mm256_loadu_pd(base + align * offset[1]);
 140         t3                = _mm256_loadu_pd(base + align * offset[2]);
 141         t4                = _mm256_loadu_pd(base + align * offset[3]);
 142     }
 143     t5                = _mm256_unpacklo_pd(t1, t2);
 144     t6                = _mm256_unpackhi_pd(t1, t2);
 145     t7                = _mm256_unpacklo_pd(t3, t4);
 146     t8                = _mm256_unpackhi_pd(t3, t4);
 147     v0->simdInternal_ = _mm256_permute2f128_pd(t5, t7, 0x20);
 148     v1->simdInternal_ = _mm256_permute2f128_pd(t6, t8, 0x20);
 149     v2->simdInternal_ = _mm256_permute2f128_pd(t5, t7, 0x31);
 150 }
 151
 152 template <int align>
 153 static inline void gmx_simdcall
 154 transposeScatterStoreU(double *            base,
 155                        const std::int32_t  offset[],
 156                        SimdDouble          v0,
 157                        SimdDouble          v1,
 158                        SimdDouble          v2)
 159 {
 160     __m256d t0, t1, t2;
 161
 162
 163     assert(std::size_t(offset) % 16 == 0);
 164
 165     // v0: x0 x1 | x2 x3
 166     // v1: y0 y1 | y2 y3
 167     // v2: z0 z1 | z2 z3
 168
 169     t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
 170     t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
 171     t2 = _mm256_unpackhi_pd(v2.simdInternal_, v2.simdInternal_); // z1 z1 | z3 z3
 172
 173     _mm_storeu_pd(base + align * offset[0], _mm256_castpd256_pd128(t0));
 174     _mm_storeu_pd(base + align * offset[1], _mm256_castpd256_pd128(t1));
 175     _mm_storeu_pd(base + align * offset[2], _mm256_extractf128_pd(t0, 0x1));
 176     _mm_storeu_pd(base + align * offset[3], _mm256_extractf128_pd(t1, 0x1));
 177     _mm_store_sd(base + align * offset[0] + 2, _mm256_castpd256_pd128(v2.simdInternal_));
 178     _mm_store_sd(base + align * offset[1] + 2, _mm256_castpd256_pd128(t2));
 179     _mm_store_sd(base + align * offset[2] + 2, _mm256_extractf128_pd(v2.simdInternal_, 0x1));
 180     _mm_store_sd(base + align * offset[3] + 2, _mm256_extractf128_pd(t2, 0x1));
 181 }
 182
 183 template <int align>
 184 static inline void gmx_simdcall
 185 transposeScatterIncrU(double *            base,
 186                       const std::int32_t  offset[],
 187                       SimdDouble          v0,
 188                       SimdDouble          v1,
 189                       SimdDouble          v2)
 190 {
 191     __m256d t0, t1;
 192     __m128d t2, tA, tB;
 193
 194     assert(std::size_t(offset) % 16 == 0);
 195
 196     if (align % 4 == 0)
 197     {
 198         // we can use aligned load/store
 199         t0 = _mm256_setzero_pd();
 200         avx256Transpose4By4(&v0.simdInternal_, &v1.simdInternal_, &v2.simdInternal_, &t0);
 201         _mm256_store_pd(base + align * offset[0], _mm256_add_pd(_mm256_load_pd(base + align * offset[0]), v0.simdInternal_));
 202         _mm256_store_pd(base + align * offset[1], _mm256_add_pd(_mm256_load_pd(base + align * offset[1]), v1.simdInternal_));
 203         _mm256_store_pd(base + align * offset[2], _mm256_add_pd(_mm256_load_pd(base + align * offset[2]), v2.simdInternal_));
 204         _mm256_store_pd(base + align * offset[3], _mm256_add_pd(_mm256_load_pd(base + align * offset[3]), t0));
 205     }
 206     else
 207     {
 208         // v0: x0 x1 | x2 x3
 209         // v1: y0 y1 | y2 y3
 210         // v2: z0 z1 | z2 z3
 211
 212         t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
 213         t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
 214         t2 = _mm256_extractf128_pd(v2.simdInternal_, 0x1);           // z2 z3
 215
 216         tA = _mm_loadu_pd(base + align * offset[0]);
 217         tB = _mm_load_sd(base + align * offset[0] + 2);
 218         tA = _mm_add_pd(tA, _mm256_castpd256_pd128(t0));
 219         tB = _mm_add_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
 220         _mm_storeu_pd(base + align * offset[0], tA);
 221         _mm_store_sd(base + align * offset[0] + 2, tB);
 222
 223         tA = _mm_loadu_pd(base + align * offset[1]);
 224         tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[1] + 2);
 225         tA = _mm_add_pd(tA, _mm256_castpd256_pd128(t1));
 226         tB = _mm_add_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
 227         _mm_storeu_pd(base + align * offset[1], tA);
 228         _mm_storeh_pd(base + align * offset[1] + 2, tB);
 229
 230         tA = _mm_loadu_pd(base + align * offset[2]);
 231         tB = _mm_load_sd(base + align * offset[2] + 2);
 232         tA = _mm_add_pd(tA, _mm256_extractf128_pd(t0, 0x1));
 233         tB = _mm_add_pd(tB, t2);
 234         _mm_storeu_pd(base + align * offset[2], tA);
 235         _mm_store_sd(base + align * offset[2] + 2, tB);
 236
 237         tA = _mm_loadu_pd(base + align * offset[3]);
 238         tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[3] + 2);
 239         tA = _mm_add_pd(tA, _mm256_extractf128_pd(t1, 0x1));
 240         tB = _mm_add_pd(tB, t2);
 241         _mm_storeu_pd(base + align * offset[3], tA);
 242         _mm_storeh_pd(base + align * offset[3] + 2, tB);
 243     }
 244 }
 245 template <int align>
 246 static inline void gmx_simdcall
 247 transposeScatterDecrU(double *            base,
 248                       const std::int32_t  offset[],
 249                       SimdDouble          v0,
 250                       SimdDouble          v1,
 251                       SimdDouble          v2)
 252 {
 253     __m256d t0, t1;
 254     __m128d t2, tA, tB;
 255
 256     assert(std::size_t(offset) % 16 == 0);
 257
 258     if (align % 4 == 0)
 259     {
 260         // we can use aligned load/store
 261         t0 = _mm256_setzero_pd();
 262         avx256Transpose4By4(&v0.simdInternal_, &v1.simdInternal_, &v2.simdInternal_, &t0);
 263         _mm256_store_pd(base + align * offset[0], _mm256_sub_pd(_mm256_load_pd(base + align * offset[0]), v0.simdInternal_));
 264         _mm256_store_pd(base + align * offset[1], _mm256_sub_pd(_mm256_load_pd(base + align * offset[1]), v1.simdInternal_));
 265         _mm256_store_pd(base + align * offset[2], _mm256_sub_pd(_mm256_load_pd(base + align * offset[2]), v2.simdInternal_));
 266         _mm256_store_pd(base + align * offset[3], _mm256_sub_pd(_mm256_load_pd(base + align * offset[3]), t0));
 267     }
 268     else
 269     {
 270         // v0: x0 x1 | x2 x3
 271         // v1: y0 y1 | y2 y3
 272         // v2: z0 z1 | z2 z3
 273
 274         t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
 275         t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
 276         t2 = _mm256_extractf128_pd(v2.simdInternal_, 0x1);           // z2 z3
 277
 278         tA = _mm_loadu_pd(base + align * offset[0]);
 279         tB = _mm_load_sd(base + align * offset[0] + 2);
 280         tA = _mm_sub_pd(tA, _mm256_castpd256_pd128(t0));
 281         tB = _mm_sub_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
 282         _mm_storeu_pd(base + align * offset[0], tA);
 283         _mm_store_sd(base + align * offset[0] + 2, tB);
 284
 285         tA = _mm_loadu_pd(base + align * offset[1]);
 286         tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[1] + 2);
 287         tA = _mm_sub_pd(tA, _mm256_castpd256_pd128(t1));
 288         tB = _mm_sub_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
 289         _mm_storeu_pd(base + align * offset[1], tA);
 290         _mm_storeh_pd(base + align * offset[1] + 2, tB);
 291
 292         tA = _mm_loadu_pd(base + align * offset[2]);
 293         tB = _mm_load_sd(base + align * offset[2] + 2);
 294         tA = _mm_sub_pd(tA, _mm256_extractf128_pd(t0, 0x1));
 295         tB = _mm_sub_pd(tB, t2);
 296         _mm_storeu_pd(base + align * offset[2], tA);
 297         _mm_store_sd(base + align * offset[2] + 2, tB);
 298
 299         tA = _mm_loadu_pd(base + align * offset[3]);
 300         tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[3] + 2);
 301         tA = _mm_sub_pd(tA, _mm256_extractf128_pd(t1, 0x1));
 302         tB = _mm_sub_pd(tB, t2);
 303         _mm_storeu_pd(base + align * offset[3], tA);
 304         _mm_storeh_pd(base + align * offset[3] + 2, tB);
 305     }
 306 }
 307
 308 static inline void gmx_simdcall
 309 expandScalarsToTriplets(SimdDouble    scalar,
 310                         SimdDouble *  triplets0,
 311                         SimdDouble *  triplets1,
 312                         SimdDouble *  triplets2)
 313 {
 314     __m256d t0 = _mm256_permute2f128_pd(scalar.simdInternal_, scalar.simdInternal_, 0x21);
 315     __m256d t1 = _mm256_permute_pd(scalar.simdInternal_, 0b0000);
 316     __m256d t2 = _mm256_permute_pd(scalar.simdInternal_, 0b1111);
 317     triplets0->simdInternal_ = _mm256_blend_pd(t1, t0, 0b1100);
 318     triplets1->simdInternal_ = _mm256_blend_pd(t2, t1, 0b1100);
 319     triplets2->simdInternal_ = _mm256_blend_pd(t0, t2, 0b1100);
 320 }
 321
 322 template <int align>
 323 static inline void gmx_simdcall
 324 gatherLoadBySimdIntTranspose(const double *  base,
 325                              SimdDInt32      offset,
 326                              SimdDouble *    v0,
 327                              SimdDouble *    v1,
 328                              SimdDouble *    v2,
 329                              SimdDouble *    v3)
 330 {
 331     assert(std::size_t(base) % 32 == 0);
 332     assert(align % 4 == 0);
 333
 334     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
 335     _mm_store_si128( reinterpret_cast<__m128i *>(ioffset), offset.simdInternal_);
 336
 337     v0->simdInternal_ = _mm256_load_pd(base + align * ioffset[0]);
 338     v1->simdInternal_ = _mm256_load_pd(base + align * ioffset[1]);
 339     v2->simdInternal_ = _mm256_load_pd(base + align * ioffset[2]);
 340     v3->simdInternal_ = _mm256_load_pd(base + align * ioffset[3]);
 341
 342     avx256Transpose4By4(&v0->simdInternal_, &v1->simdInternal_, &v2->simdInternal_, &v3->simdInternal_);
 343 }
 344
 345 template <int align>
 346 static inline void gmx_simdcall
 347 gatherLoadBySimdIntTranspose(const double *    base,
 348                              SimdDInt32        offset,
 349                              SimdDouble *      v0,
 350                              SimdDouble *      v1)
 351 {
 352     __m128d t1, t2, t3, t4;
 353     __m256d tA, tB;
 354
 355     assert(std::size_t(base) % 16 == 0);
 356     assert(align % 2 == 0);
 357
 358     alignas(GMX_SIMD_ALIGNMENT) std::int32_t  ioffset[GMX_SIMD_DINT32_WIDTH];
 359     _mm_store_si128( reinterpret_cast<__m128i *>(ioffset), offset.simdInternal_);
 360
 361     t1  = _mm_load_pd(base + align * ioffset[0]);
 362     t2  = _mm_load_pd(base + align * ioffset[1]);
 363     t3  = _mm_load_pd(base + align * ioffset[2]);
 364     t4  = _mm_load_pd(base + align * ioffset[3]);
 365
 366     tA                = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
 367     tB                = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);
 368     v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
 369     v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
 370 }
 371
 372 template <int align>
 373 static inline void gmx_simdcall
 374 gatherLoadUBySimdIntTranspose(const double *  base,
 375                               SimdDInt32      offset,
 376                               SimdDouble *    v0,
 377                               SimdDouble *    v1)
 378 {
 379     __m128d t1, t2, t3, t4;
 380     __m256d tA, tB;
 381
 382     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
 383     _mm_store_si128( reinterpret_cast<__m128i *>(ioffset), offset.simdInternal_);
 384
 385     t1   = _mm_loadu_pd(base + align * ioffset[0]);
 386     t2   = _mm_loadu_pd(base + align * ioffset[1]);
 387     t3   = _mm_loadu_pd(base + align * ioffset[2]);
 388     t4   = _mm_loadu_pd(base + align * ioffset[3]);
 389
 390     tA  = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
 391     tB  = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);
 392
 393     v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
 394     v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
 395 }
 396
 397 static inline double gmx_simdcall
 398 reduceIncr4ReturnSum(double *    m,
 399                      SimdDouble  v0,
 400                      SimdDouble  v1,
 401                      SimdDouble  v2,
 402                      SimdDouble  v3)
 403 {
 404     __m256d t0, t1, t2;
 405     __m128d a0, a1;
 406
 407     assert(std::size_t(m) % 32 == 0);
 408
 409     t0 = _mm256_hadd_pd(v0.simdInternal_, v1.simdInternal_);
 410     t1 = _mm256_hadd_pd(v2.simdInternal_, v3.simdInternal_);
 411     t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
 412     t0 = _mm256_add_pd(t0, t2);
 413     t1 = _mm256_add_pd(t1, t2);
 414     t0 = _mm256_blend_pd(t0, t1, 0b1100);
 415
 416     t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
 417     _mm256_store_pd(m, t1);
 418
 419     t0  = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101 ));
 420     a0  = _mm256_castpd256_pd128(t0);
 421     a1  = _mm256_extractf128_pd(t0, 0x1);
 422     a0  = _mm_add_sd(a0, a1);
 423
 424     return *reinterpret_cast<double *>(&a0);
 425 }
 426
 427 }      // namespace gmx
 428
 429 #endif // GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H