src/gromacs/simd/impl_ibm_vsx/impl_ibm_vsx_util_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
  37 #define GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
  38
  39 #include "config.h"
  40
  41 #include "gromacs/utility/basedefinitions.h"
  42
  43 #include "impl_ibm_vsx_definitions.h"
  44 #include "impl_ibm_vsx_simd_float.h"
  45
  46 namespace gmx
  47 {
  48
  49 template <int align>
  50 static inline void gmx_simdcall
  51 gatherLoadTranspose(const float *        base,
  52                     const std::int32_t   offset[],
  53                     SimdFloat *          v0,
  54                     SimdFloat *          v1,
  55                     SimdFloat *          v2,
  56                     SimdFloat *          v3)
  57 {
  58     __vector float l0, l1, l2, l3;
  59
  60     l0 = simdLoad( base + align * offset[0] ).simdInternal_;
  61     l1 = simdLoad( base + align * offset[1] ).simdInternal_;
  62     l2 = simdLoad( base + align * offset[2] ).simdInternal_;
  63     l3 = simdLoad( base + align * offset[3] ).simdInternal_;
  64
  65     __vector float t0 = vec_mergeh(l0, l2);
  66     __vector float t1 = vec_mergel(l0, l2);
  67     __vector float t2 = vec_mergeh(l1, l3);
  68     __vector float t3 = vec_mergel(l1, l3);
  69     v0->simdInternal_ = vec_mergeh(t0, t2);
  70     v1->simdInternal_ = vec_mergel(t0, t2);
  71     v2->simdInternal_ = vec_mergeh(t1, t3);
  72     v3->simdInternal_ = vec_mergel(t1, t3);
  73 }
  74
  75 template <int align>
  76 static inline void gmx_simdcall
  77 gatherLoadTranspose(const float *        base,
  78                     const std::int32_t   offset[],
  79                     SimdFloat *          v0,
  80                     SimdFloat *          v1)
  81 {
  82     __vector float t0, t1, t2, t3;
  83
  84     t0                = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[0])));
  85     t1                = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[1])));
  86     t2                = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[2])));
  87     t3                = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[3])));
  88     t0                = vec_mergeh(t0, t2);
  89     t1                = vec_mergeh(t1, t3);
  90     v0->simdInternal_ = vec_mergeh(t0, t1);
  91     v1->simdInternal_ = vec_mergel(t0, t1);
  92 }
  93
  94 static const int c_simdBestPairAlignmentFloat = 2;
  95
  96 template <int align>
  97 static inline void gmx_simdcall
  98 gatherLoadUTranspose(const float *        base,
  99                      const std::int32_t   offset[],
 100                      SimdFloat *          v0,
 101                      SimdFloat *          v1,
 102                      SimdFloat *          v2)
 103 {
 104
 105     if (align % 4 == 0)
 106     {
 107         SimdFloat t3;
 108         gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
 109     }
 110     else
 111     {
 112         __vector float               t1, t2, t3, t4, t5, t6, t7, t8;
 113         const __vector unsigned char perm_lo2hi = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
 114         const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15 };
 115
 116         t1  = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[0])));
 117         t2  = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[1])));
 118         t3  = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[2])));
 119         t4  = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[3])));
 120         t5  = vec_splats( *(base + align * offset[0] + 2) );
 121         t6  = vec_splats( *(base + align * offset[1] + 2) );
 122         t7  = vec_splats( *(base + align * offset[2] + 2) );
 123         t8  = vec_splats( *(base + align * offset[3] + 2) );
 124
 125         t1                = vec_mergeh(t1, t2);
 126         t3                = vec_mergeh(t3, t4);
 127         v0->simdInternal_ = vec_perm(t1, t3, perm_lo2hi);
 128         v1->simdInternal_ = vec_perm(t3, t1, perm_hi2lo);
 129         t5                = vec_mergeh(t5, t6);
 130         t7                = vec_mergeh(t7, t8);
 131         v2->simdInternal_ = vec_perm(t5, t7, perm_lo2hi);
 132     }
 133 }
 134
 135
 136 // gcc-4.9 does not recognize that the argument to vec_extract() is used
 137 template <int align>
 138 static inline void gmx_simdcall
 139 transposeScatterStoreU(float *              base,
 140                        const std::int32_t   offset[],
 141                        SimdFloat            v0,
 142                        SimdFloat            v1,
 143                        SimdFloat gmx_unused v2)
 144 {
 145     __vector float t1, t2;
 146
 147     t1   = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
 148     t2   = vec_mergel(v0.simdInternal_, v1.simdInternal_);
 149     *reinterpret_cast<double *>( base + align * offset[0] ) = vec_extract(reinterpret_cast<__vector double>(t1), 0);
 150     base[align*offset[0] + 2]                               = vec_extract(v2.simdInternal_, 0);
 151     *reinterpret_cast<double *>( base + align * offset[1] ) = vec_extract(reinterpret_cast<__vector double>(t1), 1);
 152     base[align*offset[1] + 2]                               = vec_extract(v2.simdInternal_, 1);
 153     *reinterpret_cast<double *>( base + align * offset[2] ) = vec_extract(reinterpret_cast<__vector double>(t2), 0);
 154     base[align*offset[2] + 2]                               = vec_extract(v2.simdInternal_, 2);
 155     *reinterpret_cast<double *>( base + align * offset[3] ) = vec_extract(reinterpret_cast<__vector double>(t2), 1);
 156     base[align*offset[3] + 2]                               = vec_extract(v2.simdInternal_, 3);
 157 }
 158
 159 template <int align>
 160 static inline void gmx_simdcall
 161 transposeScatterIncrU(float *              base,
 162                       const std::int32_t   offset[],
 163                       SimdFloat            v0,
 164                       SimdFloat            v1,
 165                       SimdFloat            v2)
 166 {
 167     if (align < 4)
 168     {
 169         const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15 };
 170         __vector float               t0, t1, t2, t3, t4, t5, t6, t7;
 171
 172         t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
 173         t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
 174         t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
 175         t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3
 176
 177         t4 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[0])));
 178         t4 = vec_add(t4, t0);
 179         *reinterpret_cast<double *>( base + align * offset[0] ) = vec_extract(reinterpret_cast<__vector double>(t4), 0);
 180         {
 181             float extracted = vec_extract(v2.simdInternal_, 0);
 182             base[align*offset[0] + 2] += extracted;
 183         }
 184
 185         t5 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[1])));
 186         t5 = vec_add(t5, t1);
 187         *reinterpret_cast<double *>( base + align * offset[1] ) = vec_extract(reinterpret_cast<__vector double>(t5), 0);
 188         {
 189             float extracted = vec_extract(v2.simdInternal_, 1);
 190             base[align*offset[1] + 2] += extracted;
 191         }
 192
 193         t6 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[2])));
 194         t6 = vec_add(t6, t2);
 195         *reinterpret_cast<double *>( base + align * offset[2] ) = vec_extract(reinterpret_cast<__vector double>(t6), 0);
 196         {
 197             float extracted = vec_extract(v2.simdInternal_, 2);
 198             base[align*offset[2] + 2] += extracted;
 199         }
 200
 201         t7 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[3])));
 202         t7 = vec_add(t7, t3);
 203         *reinterpret_cast<double *>( base + align * offset[3] ) = vec_extract(reinterpret_cast<__vector double>(t7), 0);
 204         {
 205             float extracted = vec_extract(v2.simdInternal_, 3);
 206             base[align*offset[3] + 2] += extracted;
 207         }
 208     }
 209     else
 210     {
 211         // Extra elements means we can use full width-4 load/store operations
 212         SimdFloat      v3;
 213         __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
 214         __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
 215         __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0f));
 216         __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0f));
 217         v0.simdInternal_ = vec_mergeh(t0, t2);
 218         v1.simdInternal_ = vec_mergel(t0, t2);
 219         v2.simdInternal_ = vec_mergeh(t1, t3);
 220         v3.simdInternal_ = vec_mergel(t1, t3);
 221
 222         store(base + align * offset[0], simdLoad(base + align * offset[0]) + v0 );
 223         store(base + align * offset[1], simdLoad(base + align * offset[1]) + v1 );
 224         store(base + align * offset[2], simdLoad(base + align * offset[2]) + v2 );
 225         store(base + align * offset[3], simdLoad(base + align * offset[3]) + v3 );
 226     }
 227 }
 228
 229 template <int align>
 230 static inline void gmx_simdcall
 231 transposeScatterDecrU(float *              base,
 232                       const std::int32_t   offset[],
 233                       SimdFloat            v0,
 234                       SimdFloat            v1,
 235                       SimdFloat            v2)
 236 {
 237     if (align < 4)
 238     {
 239         const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15 };
 240         __vector float               t0, t1, t2, t3, t4, t5, t6, t7;
 241
 242         t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
 243         t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
 244         t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
 245         t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3
 246
 247         t4 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[0])));
 248         t4 = vec_sub(t4, t0);
 249         *reinterpret_cast<double *>( base + align * offset[0] ) = vec_extract(reinterpret_cast<__vector double>(t4), 0);
 250         {
 251             float extracted = vec_extract(v2.simdInternal_, 0);
 252             base[align*offset[0] + 2] -= extracted;
 253         }
 254
 255         t5 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[1])));
 256         t5 = vec_sub(t5, t1);
 257         *reinterpret_cast<double *>( base + align * offset[1] ) = vec_extract(reinterpret_cast<__vector double>(t5), 0);
 258         {
 259             float extracted = vec_extract(v2.simdInternal_, 1);
 260             base[align*offset[1] + 2] -= extracted;
 261         }
 262
 263         t6 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[2])));
 264         t6 = vec_sub(t6, t2);
 265         *reinterpret_cast<double *>( base + align * offset[2] ) = vec_extract(reinterpret_cast<__vector double>(t6), 0);
 266         {
 267             float extracted = vec_extract(v2.simdInternal_, 2);
 268             base[align*offset[2] + 2] -= extracted;
 269         }
 270
 271         t7 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[3])));
 272         t7 = vec_sub(t7, t3);
 273         *reinterpret_cast<double *>( base + align * offset[3] ) = vec_extract(reinterpret_cast<__vector double>(t7), 0);
 274         {
 275             float extracted = vec_extract(v2.simdInternal_, 3);
 276             base[align*offset[3] + 2] -= extracted;
 277         }
 278     }
 279     else
 280     {
 281         // Extra elements means we can use full width-4 load/store operations
 282         SimdFloat      v3;
 283         __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
 284         __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
 285         __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0f));
 286         __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0f));
 287         v0.simdInternal_ = vec_mergeh(t0, t2);
 288         v1.simdInternal_ = vec_mergel(t0, t2);
 289         v2.simdInternal_ = vec_mergeh(t1, t3);
 290         v3.simdInternal_ = vec_mergel(t1, t3);
 291
 292         store(base + align * offset[0], simdLoad(base + align * offset[0]) - v0 );
 293         store(base + align * offset[1], simdLoad(base + align * offset[1]) - v1 );
 294         store(base + align * offset[2], simdLoad(base + align * offset[2]) - v2 );
 295         store(base + align * offset[3], simdLoad(base + align * offset[3]) - v3 );
 296     }
 297 }
 298
 299 static inline void gmx_simdcall
 300 expandScalarsToTriplets(SimdFloat    scalar,
 301                         SimdFloat *  triplets0,
 302                         SimdFloat *  triplets1,
 303                         SimdFloat *  triplets2)
 304 {
 305     // These permutes will be translated to immediate permutes (xxpermdi)
 306     // since they operate on doublewords, which will be faster than loading
 307     // the constants required for fully flexible permutes.
 308     // (although the real reason was that the latter was buggy on xlc-13.1).
 309     __vector unsigned char perm0 = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
 310     __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 };
 311     __vector unsigned char perm2 = { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 };
 312     __vector float         t0, t1;
 313
 314     t0                       = vec_mergeh(scalar.simdInternal_, scalar.simdInternal_);
 315     t1                       = vec_mergel(scalar.simdInternal_, scalar.simdInternal_);
 316     triplets0->simdInternal_ = vec_perm(t0, scalar.simdInternal_, perm0);
 317     triplets1->simdInternal_ = vec_perm(t0, t1, perm1);
 318     triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, t1, perm2);
 319 }
 320
 321 /* TODO In debug mode, xlc 13.1.5 seems to overwrite v0 on the stack,
 322    leading to segfaults. Possibly the calling convention doesn't
 323    implement __vector int correctly. Release mode is OK. gcc is OK. */
 324 template <int align>
 325 static inline void gmx_simdcall
 326 gatherLoadBySimdIntTranspose(const float *  base,
 327                              SimdFInt32     offset,
 328                              SimdFloat *    v0,
 329                              SimdFloat *    v1,
 330                              SimdFloat *    v2,
 331                              SimdFloat *    v3)
 332 {
 333     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 334
 335     store(ioffset, offset );
 336     gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
 337 }
 338
 339 template <int align>
 340 static inline void gmx_simdcall
 341 gatherLoadBySimdIntTranspose(const float *   base,
 342                              SimdFInt32      offset,
 343                              SimdFloat *     v0,
 344                              SimdFloat *     v1)
 345 {
 346     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 347
 348     store(ioffset, offset );
 349     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 350 }
 351
 352 template <int align>
 353 static inline void gmx_simdcall
 354 gatherLoadUBySimdIntTranspose(const float *  base,
 355                               SimdFInt32     offset,
 356                               SimdFloat *    v0,
 357                               SimdFloat *    v1)
 358 {
 359     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 360
 361     store(ioffset, offset );
 362     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 363 }
 364
 365 static inline float gmx_simdcall
 366 reduceIncr4ReturnSum(float *    m,
 367                      SimdFloat  v0,
 368                      SimdFloat  v1,
 369                      SimdFloat  v2,
 370                      SimdFloat  v3)
 371 {
 372     __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
 373     __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
 374     __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
 375     __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
 376     v0.simdInternal_ = vec_mergeh(t0, t2);
 377     v1.simdInternal_ = vec_mergel(t0, t2);
 378     v2.simdInternal_ = vec_mergeh(t1, t3);
 379     v3.simdInternal_ = vec_mergel(t1, t3);
 380
 381     v0 = v0 + v1;
 382     v2 = v2 + v3;
 383     v0 = v0 + v2;
 384     v2 = v0 + simdLoad(m);
 385     store(m, v2);
 386
 387     return reduce(v0);
 388 }
 389
 390 }      // namespace gmx
 391
 392 #endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H