/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstdint>

#include <immintrin.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_x86_mic_simd_double.h"

namespace gmx
{

// On MIC it is better to use gather/scatter operations, so we define the load
// routines that take a SIMD offset variable first.

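// Loads 4 consecutive doubles from each of the 8 addresses
// base + align*offset[i] and transposes them, so v0 receives element 0 from
// every address, v1 element 1, and so on. A rough scalar equivalent
// (illustrative sketch only):
//
//     for (int i = 0; i < 8; i++)
//     {
//         v0[i] = base[align * offset[i] + 0];
//         v1[i] = base[align * offset[i] + 1];
//         v2[i] = base[align * offset[i] + 2];
//         v3[i] = base[align * offset[i] + 3];
//     }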
template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double *  base,
                             SimdDInt32      simdoffset,
                             SimdDouble *    v0,
                             SimdDouble *    v1,
                             SimdDouble *    v2,
                             SimdDouble *    v3)
{
    assert(std::size_t(base) % 32 == 0);
    assert(align % 4 == 0);

    // Instructions on MIC can have a latency of ~4 cycles, so we use a shift
    // where a single instruction suffices (the shift count is an immediate),
    // and a multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+2, sizeof(double));
    v3->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+3, sizeof(double));
}

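// Pair variant of the transposing gather above. The U suffix means the base
// pointer only needs natural element alignment, not full SIMD alignment.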
template <int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double *  base,
                              SimdDInt32      simdoffset,
                              SimdDouble *    v0,
                              SimdDouble *    v1)
{
    // Instructions on MIC can have a latency of ~4 cycles, so we use a shift
    // where a single instruction suffices (the shift count is an immediate),
    // and a multiplication otherwise.
    if (align == 2)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 1);
    }
    else if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
}

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double *    base,
                             SimdDInt32        simdoffset,
                             SimdDouble *      v0,
                             SimdDouble *      v1)
{
    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);
    gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
}

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double *        base,
                    const std::int32_t    offset[],
                    SimdDouble *          v0,
                    SimdDouble *          v1,
                    SimdDouble *          v2,
                    SimdDouble *          v3)
{
    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1, v2, v3);
}

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double *        base,
                    const std::int32_t    offset[],
                    SimdDouble *          v0,
                    SimdDouble *          v1)
{
    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1);
}

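// For pair data, packing the two values contiguously (align = 2) is the
// preferred layout on this architecture.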
static const int c_simdBestPairAlignmentDouble = 2;

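// Gathers triplets (e.g. x/y/z coordinates) starting at base + align*offset[i]
// and transposes them into v0, v1, v2. Only the offset array needs to be
// SIMD-aligned here, not the base pointer.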
template <int align>
static inline void gmx_simdcall
gatherLoadUTranspose(const double *        base,
                     const std::int32_t    offset[],
                     SimdDouble *          v0,
                     SimdDouble *          v1,
                     SimdDouble *          v2)
{
    SimdDInt32 simdoffset;

    assert(std::size_t(offset) % 32 == 0);

    simdoffset = simdLoad(offset, SimdDInt32Tag());

    // Instructions on MIC can have a latency of ~4 cycles, so we use a shift
    // where a single instruction suffices (the shift count is an immediate),
    // and a multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+2, sizeof(double));
}

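// Inverse of gatherLoadUTranspose: transposes v0, v1, v2 back into triplets
// and scatters them to base + align*offset[i].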
template <int align>
static inline void gmx_simdcall
transposeScatterStoreU(double *            base,
                       const std::int32_t  offset[],
                       SimdDouble          v0,
                       SimdDouble          v1,
                       SimdDouble          v2)
{
    SimdDInt32 simdoffset;

    assert(std::size_t(offset) % 32 == 0);

    simdoffset = simdLoad(offset, SimdDInt32Tag());

    // Instructions on MIC can have a latency of ~4 cycles, so we use a shift
    // where a single instruction suffices (the shift count is an immediate),
    // and a multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    _mm512_i32loscatter_pd(base,   simdoffset.simdInternal_, v0.simdInternal_, sizeof(double));
    _mm512_i32loscatter_pd(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(double));
    _mm512_i32loscatter_pd(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(double));
}

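// transposeScatterIncrU/transposeScatterDecrU accumulate into memory rather
// than overwrite it. A plain scalar read-modify-write loop over a stored copy
// is used here; on MIC a gather + add + scatter sequence offers little benefit
// for this access pattern.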
template <int align>
static inline void gmx_simdcall
transposeScatterIncrU(double *            base,
                      const std::int32_t  offset[],
                      SimdDouble          v0,
                      SimdDouble          v1,
                      SimdDouble          v2)
{
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata0[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata1[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata2[GMX_SIMD_DOUBLE_WIDTH];

    store(rdata0, v0);
    store(rdata1, v1);
    store(rdata2, v2);

    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        base[ align * offset[i] + 0] += rdata0[i];
        base[ align * offset[i] + 1] += rdata1[i];
        base[ align * offset[i] + 2] += rdata2[i];
    }
}

template <int align>
static inline void gmx_simdcall
transposeScatterDecrU(double *            base,
                      const std::int32_t  offset[],
                      SimdDouble          v0,
                      SimdDouble          v1,
                      SimdDouble          v2)
{
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata0[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata1[GMX_SIMD_DOUBLE_WIDTH];
    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata2[GMX_SIMD_DOUBLE_WIDTH];

    store(rdata0, v0);
    store(rdata1, v1);
    store(rdata2, v2);

    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        base[ align * offset[i] + 0] -= rdata0[i];
        base[ align * offset[i] + 1] -= rdata1[i];
        base[ align * offset[i] + 2] -= rdata2[i];
    }
}

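// Expands each scalar into three consecutive copies spread over three output
// vectors, i.e. for input {s0, s1, ..., s7}:
//   triplets0 = {s0, s0, s0, s1, s1, s1, s2, s2}
//   triplets1 = {s2, s3, s3, s3, s4, s4, s4, s5}
//   triplets2 = {s5, s5, s6, s6, s6, s7, s7, s7}
// The permute indices are 32-bit lane numbers; each double occupies a pair of
// 32-bit lanes, which is why the indices appear in pairs below.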
static inline void gmx_simdcall
expandScalarsToTriplets(SimdDouble    scalar,
                        SimdDouble *  triplets0,
                        SimdDouble *  triplets1,
                        SimdDouble *  triplets2)
{
    triplets0->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0),
                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
    triplets1->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(11, 10, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 5, 4),
                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
    triplets2->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 11, 10, 11, 10),
                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
}

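// Sums the elements of each of v0..v3, adds the four results to m[0..3], and
// returns the grand total of all four sums.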
static inline double gmx_simdcall
reduceIncr4ReturnSum(double *    m,
                     SimdDouble  v0,
                     SimdDouble  v1,
                     SimdDouble  v2,
                     SimdDouble  v3)
{
    double  d;
    __m512d t0, t1, t2, t3;

    assert(std::size_t(m) % 32 == 0);

    t0 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v0.simdInternal_, v2.simdInternal_), _MM_SWIZ_REG_BADC);
    t2 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v2.simdInternal_, v0.simdInternal_);
    t1 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v1.simdInternal_, v3.simdInternal_), _MM_SWIZ_REG_BADC);
    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v3.simdInternal_, v1.simdInternal_);
    t0 = _mm512_add_pd(t0, t2);
    t1 = _mm512_add_pd(t1, t3);

    t2 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x55), t0, t1), _MM_SWIZ_REG_CDAB);
    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0x55), t1, t0);
    t2 = _mm512_add_pd(t2, t3);

    t2 = _mm512_add_pd(t2, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(t2), _MM_PERM_BADC)));

    t0 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    t0 = _mm512_add_pd(t0, t2);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t0);

    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_BADC));
    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_CDAB));

    _mm512_mask_packstorelo_pd(&d, _mm512_int2mask(0x01), t2);
    return d;
}

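// The Hsimd ("half-SIMD") routines below treat a 512-bit register as two
// independent 4-element halves. loadDualHsimd fills the low half from m0 and
// the high half from m1; loadDuplicateHsimd broadcasts the same 4 elements
// into both halves; loadU1DualHsimd broadcasts m[0] into the low half and
// m[1] into the high half.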
static inline SimdDouble gmx_simdcall
loadDualHsimd(const double * m0,
              const double * m1)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    return _mm512_mask_extload_pd(_mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE), _mm512_int2mask(0xF0),
                                  m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
}

static inline SimdDouble gmx_simdcall
loadDuplicateHsimd(const double * m)
{
    assert(std::size_t(m) % 32 == 0);

    return _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
}

static inline SimdDouble gmx_simdcall
loadU1DualHsimd(const double * m)
{
    return _mm512_mask_extload_pd(_mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE), _mm512_int2mask(0xF0),
                                  m+1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
}

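// storeDualHsimd writes the low half of a to m0 and the high half to m1.
// incrDualHsimd instead adds the two halves to memory, and decrHsimd sums
// both halves of a and subtracts the 4-element result from m.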
static inline void gmx_simdcall
storeDualHsimd(double *     m0,
               double *     m1,
               SimdDouble   a)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), a.simdInternal_);
    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), a.simdInternal_);
}

static inline void gmx_simdcall
incrDualHsimd(double *     m0,
              double *     m1,
              SimdDouble   a)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    __m512d x;

    // Update lower half
    x = _mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    x = _mm512_add_pd(x, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), x);

    // Update upper half
    x = _mm512_extload_pd(m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    x = _mm512_add_pd(x, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), x);
}

static inline void gmx_simdcall
decrHsimd(double *    m,
          SimdDouble  a)
{
    __m512d t;

    assert(std::size_t(m) % 32 == 0);

    t               = _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    a.simdInternal_ = _mm512_add_pd(a.simdInternal_, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(a.simdInternal_), _MM_PERM_BADC)));
    t               = _mm512_sub_pd(t, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0x0F), t);
}

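// Loads pairs from two separate base pointers, sharing one offset array:
// v0 receives element 0 from base0 (low half) and from base1 (high half),
// and v1 the corresponding element-1 values.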
template <int align>
static inline void gmx_simdcall
gatherLoadTransposeHsimd(const double *       base0,
                         const double *       base1,
                         const std::int32_t   offset[],
                         SimdDouble *         v0,
                         SimdDouble *         v1)
{
    __m512i  idx0, idx1, idx;
    __m512d  tmp1, tmp2;

    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base0) % 16 == 0);
    assert(std::size_t(base1) % 16 == 0);
    assert(align % 2 == 0);

    idx0 = _mm512_extload_epi32(offset, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);

    idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
    idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));

    idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0x00F0), idx1, _MM_PERM_AAAA);

    tmp1 = _mm512_i32logather_pd(idx, base0, sizeof(double));
    tmp2 = _mm512_i32logather_pd(idx, base1, sizeof(double));

    v0->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(tmp1), _mm512_int2mask(0xFF00), _mm512_castpd_ps(tmp2), _MM_PERM_BABA));
    v1->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(tmp2), _mm512_int2mask(0x00FF), _mm512_castpd_ps(tmp1), _MM_PERM_DCDC));
}

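// Sums each 4-element half of v0 and v1, adds the four half-sums to m[0..3]
// (low/high half of v0 to m[0]/m[1], low/high half of v1 to m[2]/m[3]), and
// returns the total.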
static inline double gmx_simdcall
reduceIncr4ReturnSumHsimd(double *     m,
                          SimdDouble   v0,
                          SimdDouble   v1)
{
    double   d;
    __m512d  t0, t1;

    assert(std::size_t(m) % 32 == 0);

    t0 = _mm512_add_pd(v0.simdInternal_, _mm512_swizzle_pd(v0.simdInternal_, _MM_SWIZ_REG_BADC));
    t0 = _mm512_mask_add_pd(t0, _mm512_int2mask(0xCC), v1.simdInternal_, _mm512_swizzle_pd(v1.simdInternal_, _MM_SWIZ_REG_BADC));
    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
    t0 = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(t0), _mm512_int2mask(0xCCCC),
                                                      _mm512_castpd_ps(t0), _MM_PERM_DCDC));

    t1 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    t1 = _mm512_add_pd(t1, t0);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t1);

    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_BADC));
    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));

    _mm512_mask_packstorelo_pd(&d, _mm512_int2mask(0x01), t0);
    return d;
}

}      // namespace gmx

#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H