/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <immintrin.h>

#include "gromacs/utility/basedefinitions.h"
#include "impl_x86_avx_256_simd_double.h"

namespace gmx
{
// Internal utility function: Full 4x4 transpose of __m256d
static inline void gmx_simdcall avx256Transpose4By4(__m256d* v0, __m256d* v1, __m256d* v2, __m256d* v3)
{
    __m256d t1 = _mm256_unpacklo_pd(*v0, *v1);
    __m256d t2 = _mm256_unpackhi_pd(*v0, *v1);
    __m256d t3 = _mm256_unpacklo_pd(*v2, *v3);
    __m256d t4 = _mm256_unpackhi_pd(*v2, *v3);
    *v0        = _mm256_permute2f128_pd(t1, t3, 0x20);
    *v1        = _mm256_permute2f128_pd(t2, t4, 0x20);
    *v2        = _mm256_permute2f128_pd(t1, t3, 0x31);
    *v3        = _mm256_permute2f128_pd(t2, t4, 0x31);
}
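
// Illustration of the transpose above (a sketch, not additional API): with
//   v0 = {a0,a1,a2,a3}, v1 = {b0,b1,b2,b3}, v2 = {c0,c1,c2,c3}, v3 = {d0,d1,d2,d3}
// the unpack steps build pairs such as {a0,b0,a2,b2} within each 128-bit lane,
// and the lane permutes recombine them so that on return
//   v0 = {a0,b0,c0,d0}, v1 = {a1,b1,c1,d1}, v2 = {a2,b2,c2,d2}, v3 = {a3,b3,c3,d3}.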
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
                                                    const std::int32_t offset[],
                                                    SimdDouble*        v0,
                                                    SimdDouble*        v1,
                                                    SimdDouble*        v2,
                                                    SimdDouble*        v3)
{
    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 32 == 0);
    assert(align % 4 == 0);

    v0->simdInternal_ = _mm256_load_pd(base + align * offset[0]);
    v1->simdInternal_ = _mm256_load_pd(base + align * offset[1]);
    v2->simdInternal_ = _mm256_load_pd(base + align * offset[2]);
    v3->simdInternal_ = _mm256_load_pd(base + align * offset[3]);

    avx256Transpose4By4(&v0->simdInternal_, &v1->simdInternal_, &v2->simdInternal_, &v3->simdInternal_);
}
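
// Usage sketch (hypothetical names; assumes particle data stored as aligned
// quadruplets, e.g. x/y/z/q with align == 4):
//   alignas(32) double xyzq[4 * N];
//   SimdDouble         x, y, z, q;
//   gatherLoadTranspose<4>(xyzq, idx, &x, &y, &z, &q);
// loads the four particles selected by idx[0..3] into one register per field.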
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2, t3, t4;
    __m256d tA, tB;

    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    t1 = _mm_load_pd(base + align * offset[0]);
    t2 = _mm_load_pd(base + align * offset[1]);
    t3 = _mm_load_pd(base + align * offset[2]);
    t4 = _mm_load_pd(base + align * offset[3]);

    tA = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
    tB = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);

    v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
}
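
// Design note: the pair version above issues only 128-bit loads, so 16-byte
// alignment (two doubles) suffices; that is why two is the best alignment to
// request for pair data, as the constant below records.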
static const int c_simdBestPairAlignmentDouble = 2;
// With the implementation below, thread-sanitizer can detect false positives.
// For loading a triplet, we load 4 doubles and ignore the last. Another thread
// might write to this element, but that will not affect the result.
// On AVX2 we can use a gather intrinsic instead.
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
                                                     const std::int32_t offset[],
                                                     SimdDouble*        v0,
                                                     SimdDouble*        v1,
                                                     SimdDouble*        v2)
{
    assert(std::size_t(offset) % 16 == 0);

    __m256d t1, t2, t3, t4, t5, t6, t7, t8;
    if (align % 4 == 0)
    {
        // we can use aligned loads since base should also be aligned in this case
        assert(std::size_t(base) % 32 == 0);
        t1 = _mm256_load_pd(base + align * offset[0]);
        t2 = _mm256_load_pd(base + align * offset[1]);
        t3 = _mm256_load_pd(base + align * offset[2]);
        t4 = _mm256_load_pd(base + align * offset[3]);
    }
    else
    {
        t1 = _mm256_loadu_pd(base + align * offset[0]);
        t2 = _mm256_loadu_pd(base + align * offset[1]);
        t3 = _mm256_loadu_pd(base + align * offset[2]);
        t4 = _mm256_loadu_pd(base + align * offset[3]);
    }
    t5                = _mm256_unpacklo_pd(t1, t2);
    t6                = _mm256_unpackhi_pd(t1, t2);
    t7                = _mm256_unpacklo_pd(t3, t4);
    t8                = _mm256_unpackhi_pd(t3, t4);
    v0->simdInternal_ = _mm256_permute2f128_pd(t5, t7, 0x20);
    v1->simdInternal_ = _mm256_permute2f128_pd(t6, t8, 0x20);
    v2->simdInternal_ = _mm256_permute2f128_pd(t5, t7, 0x31);
}
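
// Hedged sketch of the AVX2 gather mentioned above (requires an AVX2 target,
// so it cannot be used in this plain-AVX file):
//   __m128i voff = _mm_mullo_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(offset)),
//                                  _mm_set1_epi32(align));
//   __m256d x    = _mm256_i32gather_pd(base + 0, voff, 8);
// with base + 1 and base + 2 gathering the y and z components analogously.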
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(double*            base,
                                                       const std::int32_t offset[],
                                                       SimdDouble         v0,
                                                       SimdDouble         v1,
                                                       SimdDouble         v2)
{
    __m256d t0, t1, t2;

    assert(std::size_t(offset) % 16 == 0);

    t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
    t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
    t2 = _mm256_unpackhi_pd(v2.simdInternal_, v2.simdInternal_); // z1 z1 | z3 z3

    _mm_storeu_pd(base + align * offset[0], _mm256_castpd256_pd128(t0));
    _mm_storeu_pd(base + align * offset[1], _mm256_castpd256_pd128(t1));
    _mm_storeu_pd(base + align * offset[2], _mm256_extractf128_pd(t0, 0x1));
    _mm_storeu_pd(base + align * offset[3], _mm256_extractf128_pd(t1, 0x1));
    _mm_store_sd(base + align * offset[0] + 2, _mm256_castpd256_pd128(v2.simdInternal_));
    _mm_store_sd(base + align * offset[1] + 2, _mm256_castpd256_pd128(t2));
    _mm_store_sd(base + align * offset[2] + 2, _mm256_extractf128_pd(v2.simdInternal_, 0x1));
    _mm_store_sd(base + align * offset[3] + 2, _mm256_extractf128_pd(t2, 0x1));
}
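
// Semantics sketch: for i = 0..3 this writes the triplet
//   base[align * offset[i] + 0..2] = { v0[i], v1[i], v2[i] },
// i.e. it scatters the transpose of three SIMD registers as x/y/z triplets.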
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    __m256d t0, t1;
    __m128d t2, tA, tB;

    assert(std::size_t(offset) % 16 == 0);

    if (align % 4 == 0)
    {
        // we can use aligned load/store
        t0 = _mm256_setzero_pd();
        avx256Transpose4By4(&v0.simdInternal_, &v1.simdInternal_, &v2.simdInternal_, &t0);
        _mm256_store_pd(base + align * offset[0],
                        _mm256_add_pd(_mm256_load_pd(base + align * offset[0]), v0.simdInternal_));
        _mm256_store_pd(base + align * offset[1],
                        _mm256_add_pd(_mm256_load_pd(base + align * offset[1]), v1.simdInternal_));
        _mm256_store_pd(base + align * offset[2],
                        _mm256_add_pd(_mm256_load_pd(base + align * offset[2]), v2.simdInternal_));
        _mm256_store_pd(base + align * offset[3],
                        _mm256_add_pd(_mm256_load_pd(base + align * offset[3]), t0));
    }
    else
    {
        t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
        t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
        t2 = _mm256_extractf128_pd(v2.simdInternal_, 0x1);           // z2 z3

        tA = _mm_loadu_pd(base + align * offset[0]);
        tB = _mm_load_sd(base + align * offset[0] + 2);
        tA = _mm_add_pd(tA, _mm256_castpd256_pd128(t0));
        tB = _mm_add_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[0], tA);
        _mm_store_sd(base + align * offset[0] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[1]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[1] + 2);
        tA = _mm_add_pd(tA, _mm256_castpd256_pd128(t1));
        tB = _mm_add_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[1], tA);
        _mm_storeh_pd(base + align * offset[1] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[2]);
        tB = _mm_load_sd(base + align * offset[2] + 2);
        tA = _mm_add_pd(tA, _mm256_extractf128_pd(t0, 0x1));
        tB = _mm_add_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[2], tA);
        _mm_store_sd(base + align * offset[2] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[3]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[3] + 2);
        tA = _mm_add_pd(tA, _mm256_extractf128_pd(t1, 0x1));
        tB = _mm_add_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[3], tA);
        _mm_storeh_pd(base + align * offset[3] + 2, tB);
    }
}
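
// Semantics sketch: for i = 0..3,
//   base[align * offset[i] + 0] += v0[i];
//   base[align * offset[i] + 1] += v1[i];
//   base[align * offset[i] + 2] += v2[i];
// which is the typical pattern for accumulating x/y/z force contributions.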
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    __m256d t0, t1;
    __m128d t2, tA, tB;

    assert(std::size_t(offset) % 16 == 0);

    if (align % 4 == 0)
    {
        // we can use aligned load/store
        t0 = _mm256_setzero_pd();
        avx256Transpose4By4(&v0.simdInternal_, &v1.simdInternal_, &v2.simdInternal_, &t0);
        _mm256_store_pd(base + align * offset[0],
                        _mm256_sub_pd(_mm256_load_pd(base + align * offset[0]), v0.simdInternal_));
        _mm256_store_pd(base + align * offset[1],
                        _mm256_sub_pd(_mm256_load_pd(base + align * offset[1]), v1.simdInternal_));
        _mm256_store_pd(base + align * offset[2],
                        _mm256_sub_pd(_mm256_load_pd(base + align * offset[2]), v2.simdInternal_));
        _mm256_store_pd(base + align * offset[3],
                        _mm256_sub_pd(_mm256_load_pd(base + align * offset[3]), t0));
    }
    else
    {
        t0 = _mm256_unpacklo_pd(v0.simdInternal_, v1.simdInternal_); // x0 y0 | x2 y2
        t1 = _mm256_unpackhi_pd(v0.simdInternal_, v1.simdInternal_); // x1 y1 | x3 y3
        t2 = _mm256_extractf128_pd(v2.simdInternal_, 0x1);           // z2 z3

        tA = _mm_loadu_pd(base + align * offset[0]);
        tB = _mm_load_sd(base + align * offset[0] + 2);
        tA = _mm_sub_pd(tA, _mm256_castpd256_pd128(t0));
        tB = _mm_sub_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[0], tA);
        _mm_store_sd(base + align * offset[0] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[1]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[1] + 2);
        tA = _mm_sub_pd(tA, _mm256_castpd256_pd128(t1));
        tB = _mm_sub_pd(tB, _mm256_castpd256_pd128(v2.simdInternal_));
        _mm_storeu_pd(base + align * offset[1], tA);
        _mm_storeh_pd(base + align * offset[1] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[2]);
        tB = _mm_load_sd(base + align * offset[2] + 2);
        tA = _mm_sub_pd(tA, _mm256_extractf128_pd(t0, 0x1));
        tB = _mm_sub_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[2], tA);
        _mm_store_sd(base + align * offset[2] + 2, tB);

        tA = _mm_loadu_pd(base + align * offset[3]);
        tB = _mm_loadh_pd(_mm_setzero_pd(), base + align * offset[3] + 2);
        tA = _mm_sub_pd(tA, _mm256_extractf128_pd(t1, 0x1));
        tB = _mm_sub_pd(tB, t2);
        _mm_storeu_pd(base + align * offset[3], tA);
        _mm_storeh_pd(base + align * offset[3] + 2, tB);
    }
}
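
// transposeScatterDecrU mirrors transposeScatterIncrU exactly, with
// subtraction replacing addition in every accumulation step.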
static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
                                                        SimdDouble* triplets0,
                                                        SimdDouble* triplets1,
                                                        SimdDouble* triplets2)
{
    __m256d t0 = _mm256_permute2f128_pd(scalar.simdInternal_, scalar.simdInternal_, 0x21);
    __m256d t1 = _mm256_permute_pd(scalar.simdInternal_, 0b0000);
    __m256d t2 = _mm256_permute_pd(scalar.simdInternal_, 0b1111);

    triplets0->simdInternal_ = _mm256_blend_pd(t1, t0, 0b1100);
    triplets1->simdInternal_ = _mm256_blend_pd(t2, t1, 0b1100);
    triplets2->simdInternal_ = _mm256_blend_pd(t0, t2, 0b1100);
}
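
// Worked example of the blends above: scalar = {s0, s1, s2, s3} yields
//   triplets0 = {s0, s0, s0, s1}
//   triplets1 = {s1, s1, s2, s2}
//   triplets2 = {s2, s3, s3, s3}
// so each scalar is replicated three times, matching x/y/z triplet layout.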
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
                                                             SimdDInt32    offset,
                                                             SimdDouble*   v0,
                                                             SimdDouble*   v1,
                                                             SimdDouble*   v2,
                                                             SimdDouble*   v3)
{
    assert(std::size_t(base) % 32 == 0);
    assert(align % 4 == 0);

    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
    _mm_store_si128(reinterpret_cast<__m128i*>(ioffset), offset.simdInternal_);

    v0->simdInternal_ = _mm256_load_pd(base + align * ioffset[0]);
    v1->simdInternal_ = _mm256_load_pd(base + align * ioffset[1]);
    v2->simdInternal_ = _mm256_load_pd(base + align * ioffset[2]);
    v3->simdInternal_ = _mm256_load_pd(base + align * ioffset[3]);

    avx256Transpose4By4(&v0->simdInternal_, &v1->simdInternal_, &v2->simdInternal_, &v3->simdInternal_);
}
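
// Design note: plain AVX has no gather instruction, so the SIMD integer
// offsets are first spilled to an aligned scratch array and then used for
// four ordinary scalar-addressed vector loads.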
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2, t3, t4;
    __m256d tA, tB;

    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
    _mm_store_si128(reinterpret_cast<__m128i*>(ioffset), offset.simdInternal_);

    t1 = _mm_load_pd(base + align * ioffset[0]);
    t2 = _mm_load_pd(base + align * ioffset[1]);
    t3 = _mm_load_pd(base + align * ioffset[2]);
    t4 = _mm_load_pd(base + align * ioffset[3]);

    tA = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
    tB = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);

    v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
}
template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2, t3, t4;
    __m256d tA, tB;

    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
    _mm_store_si128(reinterpret_cast<__m128i*>(ioffset), offset.simdInternal_);

    t1 = _mm_loadu_pd(base + align * ioffset[0]);
    t2 = _mm_loadu_pd(base + align * ioffset[1]);
    t3 = _mm_loadu_pd(base + align * ioffset[2]);
    t4 = _mm_loadu_pd(base + align * ioffset[3]);

    tA = _mm256_insertf128_pd(_mm256_castpd128_pd256(t1), t3, 0x1);
    tB = _mm256_insertf128_pd(_mm256_castpd128_pd256(t2), t4, 0x1);

    v0->simdInternal_ = _mm256_unpacklo_pd(tA, tB);
    v1->simdInternal_ = _mm256_unpackhi_pd(tA, tB);
}
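
// Note: this variant matches the aligned version above except that it uses
// _mm_loadu_pd, and therefore places no alignment requirement on base or align.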
static inline double gmx_simdcall
reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
{
    __m256d t0, t1, t2;
    __m128d a0, a1;

    assert(std::size_t(m) % 32 == 0);

    t0 = _mm256_hadd_pd(v0.simdInternal_, v1.simdInternal_);
    t1 = _mm256_hadd_pd(v2.simdInternal_, v3.simdInternal_);
    t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
    t0 = _mm256_add_pd(t0, t2);
    t1 = _mm256_add_pd(t1, t2);
    t0 = _mm256_blend_pd(t0, t1, 0b1100);

    t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
    _mm256_store_pd(m, t1);

    t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
    a0 = _mm256_castpd256_pd128(t0);
    a1 = _mm256_extractf128_pd(t0, 0x1);
    a0 = _mm_add_sd(a0, a1);

    return *reinterpret_cast<double*>(&a0);
}
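
// Semantics sketch: the hadd/permute/blend steps leave
//   t0 = { sum(v0), sum(v1), sum(v2), sum(v3) },
// so m[i] += sum_over_lanes(vi) for i = 0..3, and the return value is the
// total sum(v0) + sum(v1) + sum(v2) + sum(v3) before the increment of m.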
} // namespace gmx

#endif // GMX_SIMD_IMPL_X86_AVX_256_UTIL_DOUBLE_H