src/gromacs/simd/impl_ibm_vsx/impl_ibm_vsx_util_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
  37 #define GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
  38
  39 #include "config.h"
  40
  41 #include "gromacs/utility/basedefinitions.h"
  42
  43 #include "impl_ibm_vsx_definitions.h"
  44 #include "impl_ibm_vsx_simd_float.h"
  45
  46 namespace gmx
  47 {
  48
  49 template<int align>
  50 static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
  51                                                     const std::int32_t offset[],
  52                                                     SimdFloat*         v0,
  53                                                     SimdFloat*         v1,
  54                                                     SimdFloat*         v2,
  55                                                     SimdFloat*         v3)
  56 {
  57     __vector float l0, l1, l2, l3;
  58
  59     l0 = simdLoad(base + align * offset[0]).simdInternal_;
  60     l1 = simdLoad(base + align * offset[1]).simdInternal_;
  61     l2 = simdLoad(base + align * offset[2]).simdInternal_;
  62     l3 = simdLoad(base + align * offset[3]).simdInternal_;
  63
  64     __vector float t0 = vec_mergeh(l0, l2);
  65     __vector float t1 = vec_mergel(l0, l2);
  66     __vector float t2 = vec_mergeh(l1, l3);
  67     __vector float t3 = vec_mergel(l1, l3);
  68     v0->simdInternal_ = vec_mergeh(t0, t2);
  69     v1->simdInternal_ = vec_mergel(t0, t2);
  70     v2->simdInternal_ = vec_mergeh(t1, t3);
  71     v3->simdInternal_ = vec_mergel(t1, t3);
  72 }
  73
  74 template<int align>
  75 static inline void gmx_simdcall
  76                    gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
  77 {
  78     __vector float t0, t1, t2, t3;
  79
  80     t0 = reinterpret_cast<__vector float>(
  81             vec_splats(*reinterpret_cast<const double*>(base + align * offset[0])));
  82     t1 = reinterpret_cast<__vector float>(
  83             vec_splats(*reinterpret_cast<const double*>(base + align * offset[1])));
  84     t2 = reinterpret_cast<__vector float>(
  85             vec_splats(*reinterpret_cast<const double*>(base + align * offset[2])));
  86     t3 = reinterpret_cast<__vector float>(
  87             vec_splats(*reinterpret_cast<const double*>(base + align * offset[3])));
  88     t0                = vec_mergeh(t0, t2);
  89     t1                = vec_mergeh(t1, t3);
  90     v0->simdInternal_ = vec_mergeh(t0, t1);
  91     v1->simdInternal_ = vec_mergel(t0, t1);
  92 }
  93
  94 static const int c_simdBestPairAlignmentFloat = 2;
  95
  96 template<int align>
  97 static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
  98                                                      const std::int32_t offset[],
  99                                                      SimdFloat*         v0,
 100                                                      SimdFloat*         v1,
 101                                                      SimdFloat*         v2)
 102 {
 103
 104     if (align % 4 == 0)
 105     {
 106         SimdFloat t3;
 107         gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
 108     }
 109     else
 110     {
 111         __vector float               t1, t2, t3, t4, t5, t6, t7, t8;
 112         const __vector unsigned char perm_lo2hi = { 0,  1,  2,  3,  4,  5,  6,  7,
 113                                                     16, 17, 18, 19, 20, 21, 22, 23 };
 114         const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
 115                                                     8,  9,  10, 11, 12, 13, 14, 15 };
 116
 117         t1 = reinterpret_cast<__vector float>(
 118                 vec_splats(*reinterpret_cast<const double*>(base + align * offset[0])));
 119         t2 = reinterpret_cast<__vector float>(
 120                 vec_splats(*reinterpret_cast<const double*>(base + align * offset[1])));
 121         t3 = reinterpret_cast<__vector float>(
 122                 vec_splats(*reinterpret_cast<const double*>(base + align * offset[2])));
 123         t4 = reinterpret_cast<__vector float>(
 124                 vec_splats(*reinterpret_cast<const double*>(base + align * offset[3])));
 125         t5 = vec_splats(*(base + align * offset[0] + 2));
 126         t6 = vec_splats(*(base + align * offset[1] + 2));
 127         t7 = vec_splats(*(base + align * offset[2] + 2));
 128         t8 = vec_splats(*(base + align * offset[3] + 2));
 129
 130         t1                = vec_mergeh(t1, t2);
 131         t3                = vec_mergeh(t3, t4);
 132         v0->simdInternal_ = vec_perm(t1, t3, perm_lo2hi);
 133         v1->simdInternal_ = vec_perm(t3, t1, perm_hi2lo);
 134         t5                = vec_mergeh(t5, t6);
 135         t7                = vec_mergeh(t7, t8);
 136         v2->simdInternal_ = vec_perm(t5, t7, perm_lo2hi);
 137     }
 138 }
 139
 140
 141 // gcc-4.9 does not recognize that the argument to vec_extract() is used
 142 template<int align>
 143 static inline void gmx_simdcall transposeScatterStoreU(float*             base,
 144                                                        const std::int32_t offset[],
 145                                                        SimdFloat          v0,
 146                                                        SimdFloat          v1,
 147                                                        SimdFloat gmx_unused v2)
 148 {
 149     __vector float t1, t2;
 150
 151     t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
 152     t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
 153     *reinterpret_cast<double*>(base + align * offset[0]) =
 154             vec_extract(reinterpret_cast<__vector double>(t1), 0);
 155     base[align * offset[0] + 2] = vec_extract(v2.simdInternal_, 0);
 156     *reinterpret_cast<double*>(base + align * offset[1]) =
 157             vec_extract(reinterpret_cast<__vector double>(t1), 1);
 158     base[align * offset[1] + 2] = vec_extract(v2.simdInternal_, 1);
 159     *reinterpret_cast<double*>(base + align * offset[2]) =
 160             vec_extract(reinterpret_cast<__vector double>(t2), 0);
 161     base[align * offset[2] + 2] = vec_extract(v2.simdInternal_, 2);
 162     *reinterpret_cast<double*>(base + align * offset[3]) =
 163             vec_extract(reinterpret_cast<__vector double>(t2), 1);
 164     base[align * offset[3] + 2] = vec_extract(v2.simdInternal_, 3);
 165 }
 166
 167 template<int align>
 168 static inline void gmx_simdcall
 169                    transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 170 {
 171     if (align < 4)
 172     {
 173         const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
 174                                                     8,  9,  10, 11, 12, 13, 14, 15 };
 175         __vector float               t0, t1, t2, t3, t4, t5, t6, t7;
 176
 177         t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
 178         t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
 179         t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
 180         t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3
 181
 182         t4 = reinterpret_cast<__vector float>(
 183                 vec_splats(*reinterpret_cast<double*>(base + align * offset[0])));
 184         t4 = vec_add(t4, t0);
 185         *reinterpret_cast<double*>(base + align * offset[0]) =
 186                 vec_extract(reinterpret_cast<__vector double>(t4), 0);
 187         {
 188             float extracted = vec_extract(v2.simdInternal_, 0);
 189             base[align * offset[0] + 2] += extracted;
 190         }
 191
 192         t5 = reinterpret_cast<__vector float>(
 193                 vec_splats(*reinterpret_cast<double*>(base + align * offset[1])));
 194         t5 = vec_add(t5, t1);
 195         *reinterpret_cast<double*>(base + align * offset[1]) =
 196                 vec_extract(reinterpret_cast<__vector double>(t5), 0);
 197         {
 198             float extracted = vec_extract(v2.simdInternal_, 1);
 199             base[align * offset[1] + 2] += extracted;
 200         }
 201
 202         t6 = reinterpret_cast<__vector float>(
 203                 vec_splats(*reinterpret_cast<double*>(base + align * offset[2])));
 204         t6 = vec_add(t6, t2);
 205         *reinterpret_cast<double*>(base + align * offset[2]) =
 206                 vec_extract(reinterpret_cast<__vector double>(t6), 0);
 207         {
 208             float extracted = vec_extract(v2.simdInternal_, 2);
 209             base[align * offset[2] + 2] += extracted;
 210         }
 211
 212         t7 = reinterpret_cast<__vector float>(
 213                 vec_splats(*reinterpret_cast<double*>(base + align * offset[3])));
 214         t7 = vec_add(t7, t3);
 215         *reinterpret_cast<double*>(base + align * offset[3]) =
 216                 vec_extract(reinterpret_cast<__vector double>(t7), 0);
 217         {
 218             float extracted = vec_extract(v2.simdInternal_, 3);
 219             base[align * offset[3] + 2] += extracted;
 220         }
 221     }
 222     else
 223     {
 224         // Extra elements means we can use full width-4 load/store operations
 225         SimdFloat      v3;
 226         __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
 227         __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
 228         __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0F));
 229         __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0F));
 230         v0.simdInternal_  = vec_mergeh(t0, t2);
 231         v1.simdInternal_  = vec_mergel(t0, t2);
 232         v2.simdInternal_  = vec_mergeh(t1, t3);
 233         v3.simdInternal_  = vec_mergel(t1, t3);
 234
 235         store(base + align * offset[0], simdLoad(base + align * offset[0]) + v0);
 236         store(base + align * offset[1], simdLoad(base + align * offset[1]) + v1);
 237         store(base + align * offset[2], simdLoad(base + align * offset[2]) + v2);
 238         store(base + align * offset[3], simdLoad(base + align * offset[3]) + v3);
 239     }
 240 }
 241
 242 template<int align>
 243 static inline void gmx_simdcall
 244                    transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 245 {
 246     if (align < 4)
 247     {
 248         const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
 249                                                     8,  9,  10, 11, 12, 13, 14, 15 };
 250         __vector float               t0, t1, t2, t3, t4, t5, t6, t7;
 251
 252         t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
 253         t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
 254         t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
 255         t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3
 256
 257         t4 = reinterpret_cast<__vector float>(
 258                 vec_splats(*reinterpret_cast<double*>(base + align * offset[0])));
 259         t4 = vec_sub(t4, t0);
 260         *reinterpret_cast<double*>(base + align * offset[0]) =
 261                 vec_extract(reinterpret_cast<__vector double>(t4), 0);
 262         {
 263             float extracted = vec_extract(v2.simdInternal_, 0);
 264             base[align * offset[0] + 2] -= extracted;
 265         }
 266
 267         t5 = reinterpret_cast<__vector float>(
 268                 vec_splats(*reinterpret_cast<double*>(base + align * offset[1])));
 269         t5 = vec_sub(t5, t1);
 270         *reinterpret_cast<double*>(base + align * offset[1]) =
 271                 vec_extract(reinterpret_cast<__vector double>(t5), 0);
 272         {
 273             float extracted = vec_extract(v2.simdInternal_, 1);
 274             base[align * offset[1] + 2] -= extracted;
 275         }
 276
 277         t6 = reinterpret_cast<__vector float>(
 278                 vec_splats(*reinterpret_cast<double*>(base + align * offset[2])));
 279         t6 = vec_sub(t6, t2);
 280         *reinterpret_cast<double*>(base + align * offset[2]) =
 281                 vec_extract(reinterpret_cast<__vector double>(t6), 0);
 282         {
 283             float extracted = vec_extract(v2.simdInternal_, 2);
 284             base[align * offset[2] + 2] -= extracted;
 285         }
 286
 287         t7 = reinterpret_cast<__vector float>(
 288                 vec_splats(*reinterpret_cast<double*>(base + align * offset[3])));
 289         t7 = vec_sub(t7, t3);
 290         *reinterpret_cast<double*>(base + align * offset[3]) =
 291                 vec_extract(reinterpret_cast<__vector double>(t7), 0);
 292         {
 293             float extracted = vec_extract(v2.simdInternal_, 3);
 294             base[align * offset[3] + 2] -= extracted;
 295         }
 296     }
 297     else
 298     {
 299         // Extra elements means we can use full width-4 load/store operations
 300         SimdFloat      v3;
 301         __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
 302         __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
 303         __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0F));
 304         __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0F));
 305         v0.simdInternal_  = vec_mergeh(t0, t2);
 306         v1.simdInternal_  = vec_mergel(t0, t2);
 307         v2.simdInternal_  = vec_mergeh(t1, t3);
 308         v3.simdInternal_  = vec_mergel(t1, t3);
 309
 310         store(base + align * offset[0], simdLoad(base + align * offset[0]) - v0);
 311         store(base + align * offset[1], simdLoad(base + align * offset[1]) - v1);
 312         store(base + align * offset[2], simdLoad(base + align * offset[2]) - v2);
 313         store(base + align * offset[3], simdLoad(base + align * offset[3]) - v3);
 314     }
 315 }
 316
 317 static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
 318                                                         SimdFloat* triplets0,
 319                                                         SimdFloat* triplets1,
 320                                                         SimdFloat* triplets2)
 321 {
 322     // These permutes will be translated to immediate permutes (xxpermdi)
 323     // since they operate on doublewords, which will be faster than loading
 324     // the constants required for fully flexible permutes.
 325     // (although the real reason was that the latter was buggy on xlc-13.1).
 326     __vector unsigned char perm0 = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
 327     __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 };
 328     __vector unsigned char perm2 = { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 };
 329     __vector float         t0, t1;
 330
 331     t0                       = vec_mergeh(scalar.simdInternal_, scalar.simdInternal_);
 332     t1                       = vec_mergel(scalar.simdInternal_, scalar.simdInternal_);
 333     triplets0->simdInternal_ = vec_perm(t0, scalar.simdInternal_, perm0);
 334     triplets1->simdInternal_ = vec_perm(t0, t1, perm1);
 335     triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, t1, perm2);
 336 }
 337
 338 /* TODO In debug mode, xlc 13.1.5 seems to overwrite v0 on the stack,
 339    leading to segfaults. Possibly the calling convention doesn't
 340    implement __vector int correctly. Release mode is OK. gcc is OK. */
 341 template<int align>
 342 static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
 343                                                              SimdFInt32   offset,
 344                                                              SimdFloat*   v0,
 345                                                              SimdFloat*   v1,
 346                                                              SimdFloat*   v2,
 347                                                              SimdFloat*   v3)
 348 {
 349     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 350
 351     store(ioffset, offset);
 352     gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
 353 }
 354
 355 template<int align>
 356 static inline void gmx_simdcall
 357                    gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 358 {
 359     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 360
 361     store(ioffset, offset);
 362     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 363 }
 364
 365 template<int align>
 366 static inline void gmx_simdcall
 367                    gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 368 {
 369     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 370
 371     store(ioffset, offset);
 372     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 373 }
 374
 375 static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
 376 {
 377     __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
 378     __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
 379     __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
 380     __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
 381     v0.simdInternal_  = vec_mergeh(t0, t2);
 382     v1.simdInternal_  = vec_mergel(t0, t2);
 383     v2.simdInternal_  = vec_mergeh(t1, t3);
 384     v3.simdInternal_  = vec_mergel(t1, t3);
 385
 386     v0 = v0 + v1;
 387     v2 = v2 + v3;
 388     v0 = v0 + v2;
 389     v2 = v0 + simdLoad(m);
 390     store(m, v2);
 391
 392     return reduce(v0);
 393 }
 394
 395 } // namespace gmx
 396
 397 #endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H