src/gromacs/simd/impl_ibm_vsx/impl_ibm_vsx_util_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
   5  * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 #ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
  38 #define GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
  39
  40 #include "config.h"
  41
  42 #include "gromacs/utility/basedefinitions.h"
  43
  44 #include "impl_ibm_vsx_definitions.h"
  45 #include "impl_ibm_vsx_simd_float.h"
  46
  47 namespace gmx
  48 {
  49
  50 template<int align>
  51 static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
  52                                                     const std::int32_t offset[],
  53                                                     SimdFloat*         v0,
  54                                                     SimdFloat*         v1,
  55                                                     SimdFloat*         v2,
  56                                                     SimdFloat*         v3)
  57 {
  58     __vector float l0, l1, l2, l3;
  59
  60     l0 = simdLoad(base + align * offset[0]).simdInternal_;
  61     l1 = simdLoad(base + align * offset[1]).simdInternal_;
  62     l2 = simdLoad(base + align * offset[2]).simdInternal_;
  63     l3 = simdLoad(base + align * offset[3]).simdInternal_;
  64
  65     __vector float t0 = vec_mergeh(l0, l2);
  66     __vector float t1 = vec_mergel(l0, l2);
  67     __vector float t2 = vec_mergeh(l1, l3);
  68     __vector float t3 = vec_mergel(l1, l3);
  69     v0->simdInternal_ = vec_mergeh(t0, t2);
  70     v1->simdInternal_ = vec_mergel(t0, t2);
  71     v2->simdInternal_ = vec_mergeh(t1, t3);
  72     v3->simdInternal_ = vec_mergel(t1, t3);
  73 }
  74
  75 template<int align>
  76 static inline void gmx_simdcall
  77 gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
  78 {
  79     __vector float t0, t1, t2, t3;
  80
  81     t0 = reinterpret_cast<__vector float>(
  82             vec_splats(*reinterpret_cast<const double*>(base + align * offset[0])));
  83     t1 = reinterpret_cast<__vector float>(
  84             vec_splats(*reinterpret_cast<const double*>(base + align * offset[1])));
  85     t2 = reinterpret_cast<__vector float>(
  86             vec_splats(*reinterpret_cast<const double*>(base + align * offset[2])));
  87     t3 = reinterpret_cast<__vector float>(
  88             vec_splats(*reinterpret_cast<const double*>(base + align * offset[3])));
  89     t0                = vec_mergeh(t0, t2);
  90     t1                = vec_mergeh(t1, t3);
  91     v0->simdInternal_ = vec_mergeh(t0, t1);
  92     v1->simdInternal_ = vec_mergel(t0, t1);
  93 }
  94
  95 static const int c_simdBestPairAlignmentFloat = 2;
  96
  97 template<int align>
  98 static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
  99                                                      const std::int32_t offset[],
 100                                                      SimdFloat*         v0,
 101                                                      SimdFloat*         v1,
 102                                                      SimdFloat*         v2)
 103 {
 104
 105     if (align % 4 == 0)
 106     {
 107         SimdFloat t3;
 108         gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
 109     }
 110     else
 111     {
 112         __vector float               t1, t2, t3, t4, t5, t6, t7, t8;
 113         const __vector unsigned char perm_lo2hi = { 0,  1,  2,  3,  4,  5,  6,  7,
 114                                                     16, 17, 18, 19, 20, 21, 22, 23 };
 115         const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
 116                                                     8,  9,  10, 11, 12, 13, 14, 15 };
 117
 118         t1 = reinterpret_cast<__vector float>(
 119                 vec_splats(*reinterpret_cast<const double*>(base + align * offset[0])));
 120         t2 = reinterpret_cast<__vector float>(
 121                 vec_splats(*reinterpret_cast<const double*>(base + align * offset[1])));
 122         t3 = reinterpret_cast<__vector float>(
 123                 vec_splats(*reinterpret_cast<const double*>(base + align * offset[2])));
 124         t4 = reinterpret_cast<__vector float>(
 125                 vec_splats(*reinterpret_cast<const double*>(base + align * offset[3])));
 126         t5 = vec_splats(*(base + align * offset[0] + 2));
 127         t6 = vec_splats(*(base + align * offset[1] + 2));
 128         t7 = vec_splats(*(base + align * offset[2] + 2));
 129         t8 = vec_splats(*(base + align * offset[3] + 2));
 130
 131         t1                = vec_mergeh(t1, t2);
 132         t3                = vec_mergeh(t3, t4);
 133         v0->simdInternal_ = vec_perm(t1, t3, perm_lo2hi);
 134         v1->simdInternal_ = vec_perm(t3, t1, perm_hi2lo);
 135         t5                = vec_mergeh(t5, t6);
 136         t7                = vec_mergeh(t7, t8);
 137         v2->simdInternal_ = vec_perm(t5, t7, perm_lo2hi);
 138     }
 139 }
 140
 141
 142 // gcc-4.9 does not recognize that the argument to vec_extract() is used
 143 template<int align>
 144 static inline void gmx_simdcall transposeScatterStoreU(float*               base,
 145                                                        const std::int32_t   offset[],
 146                                                        SimdFloat            v0,
 147                                                        SimdFloat            v1,
 148                                                        SimdFloat gmx_unused v2)
 149 {
 150     __vector float t1, t2;
 151
 152     t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
 153     t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
 154     *reinterpret_cast<double*>(base + align * offset[0]) =
 155             vec_extract(reinterpret_cast<__vector double>(t1), 0);
 156     base[align * offset[0] + 2] = vec_extract(v2.simdInternal_, 0);
 157     *reinterpret_cast<double*>(base + align * offset[1]) =
 158             vec_extract(reinterpret_cast<__vector double>(t1), 1);
 159     base[align * offset[1] + 2] = vec_extract(v2.simdInternal_, 1);
 160     *reinterpret_cast<double*>(base + align * offset[2]) =
 161             vec_extract(reinterpret_cast<__vector double>(t2), 0);
 162     base[align * offset[2] + 2] = vec_extract(v2.simdInternal_, 2);
 163     *reinterpret_cast<double*>(base + align * offset[3]) =
 164             vec_extract(reinterpret_cast<__vector double>(t2), 1);
 165     base[align * offset[3] + 2] = vec_extract(v2.simdInternal_, 3);
 166 }
 167
 168 template<int align>
 169 static inline void gmx_simdcall
 170 transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 171 {
 172     if (align < 4)
 173     {
 174         const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
 175                                                     8,  9,  10, 11, 12, 13, 14, 15 };
 176         __vector float               t0, t1, t2, t3, t4, t5, t6, t7;
 177
 178         t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
 179         t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
 180         t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
 181         t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3
 182
 183         t4 = reinterpret_cast<__vector float>(
 184                 vec_splats(*reinterpret_cast<double*>(base + align * offset[0])));
 185         t4 = vec_add(t4, t0);
 186         *reinterpret_cast<double*>(base + align * offset[0]) =
 187                 vec_extract(reinterpret_cast<__vector double>(t4), 0);
 188         {
 189             float extracted = vec_extract(v2.simdInternal_, 0);
 190             base[align * offset[0] + 2] += extracted;
 191         }
 192
 193         t5 = reinterpret_cast<__vector float>(
 194                 vec_splats(*reinterpret_cast<double*>(base + align * offset[1])));
 195         t5 = vec_add(t5, t1);
 196         *reinterpret_cast<double*>(base + align * offset[1]) =
 197                 vec_extract(reinterpret_cast<__vector double>(t5), 0);
 198         {
 199             float extracted = vec_extract(v2.simdInternal_, 1);
 200             base[align * offset[1] + 2] += extracted;
 201         }
 202
 203         t6 = reinterpret_cast<__vector float>(
 204                 vec_splats(*reinterpret_cast<double*>(base + align * offset[2])));
 205         t6 = vec_add(t6, t2);
 206         *reinterpret_cast<double*>(base + align * offset[2]) =
 207                 vec_extract(reinterpret_cast<__vector double>(t6), 0);
 208         {
 209             float extracted = vec_extract(v2.simdInternal_, 2);
 210             base[align * offset[2] + 2] += extracted;
 211         }
 212
 213         t7 = reinterpret_cast<__vector float>(
 214                 vec_splats(*reinterpret_cast<double*>(base + align * offset[3])));
 215         t7 = vec_add(t7, t3);
 216         *reinterpret_cast<double*>(base + align * offset[3]) =
 217                 vec_extract(reinterpret_cast<__vector double>(t7), 0);
 218         {
 219             float extracted = vec_extract(v2.simdInternal_, 3);
 220             base[align * offset[3] + 2] += extracted;
 221         }
 222     }
 223     else
 224     {
 225         // Extra elements means we can use full width-4 load/store operations
 226         SimdFloat      v3;
 227         __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
 228         __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
 229         __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0F));
 230         __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0F));
 231         v0.simdInternal_  = vec_mergeh(t0, t2);
 232         v1.simdInternal_  = vec_mergel(t0, t2);
 233         v2.simdInternal_  = vec_mergeh(t1, t3);
 234         v3.simdInternal_  = vec_mergel(t1, t3);
 235
 236         store(base + align * offset[0], simdLoad(base + align * offset[0]) + v0);
 237         store(base + align * offset[1], simdLoad(base + align * offset[1]) + v1);
 238         store(base + align * offset[2], simdLoad(base + align * offset[2]) + v2);
 239         store(base + align * offset[3], simdLoad(base + align * offset[3]) + v3);
 240     }
 241 }
 242
 243 template<int align>
 244 static inline void gmx_simdcall
 245 transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 246 {
 247     if (align < 4)
 248     {
 249         const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
 250                                                     8,  9,  10, 11, 12, 13, 14, 15 };
 251         __vector float               t0, t1, t2, t3, t4, t5, t6, t7;
 252
 253         t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
 254         t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
 255         t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
 256         t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3
 257
 258         t4 = reinterpret_cast<__vector float>(
 259                 vec_splats(*reinterpret_cast<double*>(base + align * offset[0])));
 260         t4 = vec_sub(t4, t0);
 261         *reinterpret_cast<double*>(base + align * offset[0]) =
 262                 vec_extract(reinterpret_cast<__vector double>(t4), 0);
 263         {
 264             float extracted = vec_extract(v2.simdInternal_, 0);
 265             base[align * offset[0] + 2] -= extracted;
 266         }
 267
 268         t5 = reinterpret_cast<__vector float>(
 269                 vec_splats(*reinterpret_cast<double*>(base + align * offset[1])));
 270         t5 = vec_sub(t5, t1);
 271         *reinterpret_cast<double*>(base + align * offset[1]) =
 272                 vec_extract(reinterpret_cast<__vector double>(t5), 0);
 273         {
 274             float extracted = vec_extract(v2.simdInternal_, 1);
 275             base[align * offset[1] + 2] -= extracted;
 276         }
 277
 278         t6 = reinterpret_cast<__vector float>(
 279                 vec_splats(*reinterpret_cast<double*>(base + align * offset[2])));
 280         t6 = vec_sub(t6, t2);
 281         *reinterpret_cast<double*>(base + align * offset[2]) =
 282                 vec_extract(reinterpret_cast<__vector double>(t6), 0);
 283         {
 284             float extracted = vec_extract(v2.simdInternal_, 2);
 285             base[align * offset[2] + 2] -= extracted;
 286         }
 287
 288         t7 = reinterpret_cast<__vector float>(
 289                 vec_splats(*reinterpret_cast<double*>(base + align * offset[3])));
 290         t7 = vec_sub(t7, t3);
 291         *reinterpret_cast<double*>(base + align * offset[3]) =
 292                 vec_extract(reinterpret_cast<__vector double>(t7), 0);
 293         {
 294             float extracted = vec_extract(v2.simdInternal_, 3);
 295             base[align * offset[3] + 2] -= extracted;
 296         }
 297     }
 298     else
 299     {
 300         // Extra elements means we can use full width-4 load/store operations
 301         SimdFloat      v3;
 302         __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
 303         __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
 304         __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0F));
 305         __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0F));
 306         v0.simdInternal_  = vec_mergeh(t0, t2);
 307         v1.simdInternal_  = vec_mergel(t0, t2);
 308         v2.simdInternal_  = vec_mergeh(t1, t3);
 309         v3.simdInternal_  = vec_mergel(t1, t3);
 310
 311         store(base + align * offset[0], simdLoad(base + align * offset[0]) - v0);
 312         store(base + align * offset[1], simdLoad(base + align * offset[1]) - v1);
 313         store(base + align * offset[2], simdLoad(base + align * offset[2]) - v2);
 314         store(base + align * offset[3], simdLoad(base + align * offset[3]) - v3);
 315     }
 316 }
 317
 318 static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
 319                                                         SimdFloat* triplets0,
 320                                                         SimdFloat* triplets1,
 321                                                         SimdFloat* triplets2)
 322 {
 323     // These permutes will be translated to immediate permutes (xxpermdi)
 324     // since they operate on doublewords, which will be faster than loading
 325     // the constants required for fully flexible permutes.
 326     // (although the real reason was that the latter was buggy on xlc-13.1).
 327     __vector unsigned char perm0 = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
 328     __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 };
 329     __vector unsigned char perm2 = { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 };
 330     __vector float         t0, t1;
 331
 332     t0                       = vec_mergeh(scalar.simdInternal_, scalar.simdInternal_);
 333     t1                       = vec_mergel(scalar.simdInternal_, scalar.simdInternal_);
 334     triplets0->simdInternal_ = vec_perm(t0, scalar.simdInternal_, perm0);
 335     triplets1->simdInternal_ = vec_perm(t0, t1, perm1);
 336     triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, t1, perm2);
 337 }
 338
 339 /* TODO In debug mode, xlc 13.1.5 seems to overwrite v0 on the stack,
 340    leading to segfaults. Possibly the calling convention doesn't
 341    implement __vector int correctly. Release mode is OK. gcc is OK. */
 342 template<int align>
 343 static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
 344                                                              SimdFInt32   offset,
 345                                                              SimdFloat*   v0,
 346                                                              SimdFloat*   v1,
 347                                                              SimdFloat*   v2,
 348                                                              SimdFloat*   v3)
 349 {
 350     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 351
 352     store(ioffset, offset);
 353     gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
 354 }
 355
 356 template<int align>
 357 static inline void gmx_simdcall
 358 gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 359 {
 360     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 361
 362     store(ioffset, offset);
 363     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 364 }
 365
 366 template<int align>
 367 static inline void gmx_simdcall
 368 gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 369 {
 370     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 371
 372     store(ioffset, offset);
 373     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 374 }
 375
 376 static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
 377 {
 378     __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
 379     __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
 380     __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
 381     __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
 382     v0.simdInternal_  = vec_mergeh(t0, t2);
 383     v1.simdInternal_  = vec_mergel(t0, t2);
 384     v2.simdInternal_  = vec_mergeh(t1, t3);
 385     v3.simdInternal_  = vec_mergel(t1, t3);
 386
 387     v0 = v0 + v1;
 388     v2 = v2 + v3;
 389     v0 = v0 + v2;
 390     v2 = v0 + simdLoad(m);
 391     store(m, v2);
 392
 393     return reduce(v0);
 394 }
 395
 396 } // namespace gmx
 397
 398 #endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H