/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H

#include "config.h"

#include <cstddef>
#include <cstdint>

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vmx_definitions.h"
#include "impl_ibm_vmx_simd_float.h"

namespace gmx
{

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *        base,
                    const std::int32_t   offset[],
                    SimdFloat *          v0,
                    SimdFloat *          v1,
                    SimdFloat *          v2,
                    SimdFloat *          v3)
{
    *v0 = simdLoad( base + align * offset[0] );
    *v1 = simdLoad( base + align * offset[1] );
    *v2 = simdLoad( base + align * offset[2] );
    *v3 = simdLoad( base + align * offset[3] );
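
    // 4x4 in-register transpose: vec_mergeh/vec_mergel interleave the high and
    // low halves of two rows, so two merge passes turn the four gathered rows
    // into per-component vectors.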
    __vector float t0 = vec_mergeh(v0->simdInternal_, v2->simdInternal_);
    __vector float t1 = vec_mergel(v0->simdInternal_, v2->simdInternal_);
    __vector float t2 = vec_mergeh(v1->simdInternal_, v3->simdInternal_);
    __vector float t3 = vec_mergel(v1->simdInternal_, v3->simdInternal_);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}
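
// Usage sketch (hypothetical data layout, not part of this file): gather four
// aligned xyzw quadruplets, one per index, into per-component vectors:
//
//     alignas(GMX_SIMD_ALIGNMENT) std::int32_t idx[4] = { 0, 1, 2, 3 };
//     SimdFloat x, y, z, w;
//     gatherLoadTranspose<4>(xyzwData, idx, &x, &y, &z, &w);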

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *        base,
                    const std::int32_t   offset[],
                    SimdFloat *          v0,
                    SimdFloat *          v1)
{
    if (align % 4 == 0)
    {
        SimdFloat t2, t3;

        gatherLoadTranspose<align>(base, offset, v0, v1, &t2, &t3);
    }
    else
    {
        __vector float         t0, t1, t2, t3, t4, t5, t6, t7;
        __vector unsigned char p0, p1, p2, p3;

        // This is REALLY slow, since we have no choice but to load individual
        // elements when we cannot guarantee that we can access beyond the end of
        // the memory. Fortunately, 99% of the usage should be the aligned-to-4
        // case above instead.
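        // The idiom: vec_lde loads one float into the lane implied by its
        // address, vec_perm with the vec_lvsl control rotates that float down
        // to lane 0, and two vec_mergeh passes then pack four lane-0 values
        // into a single vector.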
        t0 = vec_lde(0, base + align * offset[0]);
        t1 = vec_lde(0, base + align * offset[1]);
        t2 = vec_lde(0, base + align * offset[2]);
        t3 = vec_lde(0, base + align * offset[3]);
        p0 = vec_lvsl(0, base + align * offset[0]);
        p1 = vec_lvsl(0, base + align * offset[1]);
        p2 = vec_lvsl(0, base + align * offset[2]);
        p3 = vec_lvsl(0, base + align * offset[3]);
        t0 = vec_perm(t0, t0, p0);
        t1 = vec_perm(t1, t1, p1);
        t2 = vec_perm(t2, t2, p2);
        t3 = vec_perm(t3, t3, p3);
        t0 = vec_mergeh(t0, t2);
        t1 = vec_mergeh(t1, t3);
        v0->simdInternal_ = vec_mergeh(t0, t1);

        t4 = vec_lde(0, base + align * offset[0] + 1);
        t5 = vec_lde(0, base + align * offset[1] + 1);
        t6 = vec_lde(0, base + align * offset[2] + 1);
        t7 = vec_lde(0, base + align * offset[3] + 1);
        p0 = vec_lvsl(0, base + align * offset[0] + 1);
        p1 = vec_lvsl(0, base + align * offset[1] + 1);
        p2 = vec_lvsl(0, base + align * offset[2] + 1);
        p3 = vec_lvsl(0, base + align * offset[3] + 1);
        t4 = vec_perm(t4, t4, p0);
        t5 = vec_perm(t5, t5, p1);
        t6 = vec_perm(t6, t6, p2);
        t7 = vec_perm(t7, t7, p3);
        t4 = vec_mergeh(t4, t6);
        t5 = vec_mergeh(t5, t7);
        v1->simdInternal_ = vec_mergeh(t4, t5);
    }
}
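
// Best alignment to use for aligned pairs of float data.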
static const int c_simdBestPairAlignmentFloat = 2;

template <int align>
static inline void gmx_simdcall
gatherLoadUTranspose(const float *        base,
                     const std::int32_t   offset[],
                     SimdFloat *          v0,
                     SimdFloat *          v1,
                     SimdFloat *          v2)
{
    if (align % 4 == 0)
    {
        SimdFloat t3;
        gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
    }
    else
    {
        __vector float         t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
        __vector unsigned char p0, p1, p2, p3;

        // This is REALLY slow, since we have no choice but to load individual
        // elements when we cannot guarantee that we can access beyond the end of
        // the memory. Unfortunately this is likely the most common case.
        t0 = vec_lde(0, base + align * offset[0]);
        t1 = vec_lde(0, base + align * offset[1]);
        t2 = vec_lde(0, base + align * offset[2]);
        t3 = vec_lde(0, base + align * offset[3]);
        p0 = vec_lvsl(0, base + align * offset[0]);
        p1 = vec_lvsl(0, base + align * offset[1]);
        p2 = vec_lvsl(0, base + align * offset[2]);
        p3 = vec_lvsl(0, base + align * offset[3]);
        t0 = vec_perm(t0, t0, p0);
        t1 = vec_perm(t1, t1, p1);
        t2 = vec_perm(t2, t2, p2);
        t3 = vec_perm(t3, t3, p3);
        t0 = vec_mergeh(t0, t2);
        t1 = vec_mergeh(t1, t3);
        v0->simdInternal_ = vec_mergeh(t0, t1);
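
        // Second (y) element of each triplet, same idiom as above.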
        t4 = vec_lde(0, base + align * offset[0] + 1);
        t5 = vec_lde(0, base + align * offset[1] + 1);
        t6 = vec_lde(0, base + align * offset[2] + 1);
        t7 = vec_lde(0, base + align * offset[3] + 1);
        p0 = vec_lvsl(0, base + align * offset[0] + 1);
        p1 = vec_lvsl(0, base + align * offset[1] + 1);
        p2 = vec_lvsl(0, base + align * offset[2] + 1);
        p3 = vec_lvsl(0, base + align * offset[3] + 1);
        t4 = vec_perm(t4, t4, p0);
        t5 = vec_perm(t5, t5, p1);
        t6 = vec_perm(t6, t6, p2);
        t7 = vec_perm(t7, t7, p3);
        t4 = vec_mergeh(t4, t6);
        t5 = vec_mergeh(t5, t7);
        v1->simdInternal_ = vec_mergeh(t4, t5);
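
        // Third (z) element of each triplet.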
        t8  = vec_lde(0, base + align * offset[0] + 2);
        t9  = vec_lde(0, base + align * offset[1] + 2);
        t10 = vec_lde(0, base + align * offset[2] + 2);
        t11 = vec_lde(0, base + align * offset[3] + 2);
        p0  = vec_lvsl(0, base + align * offset[0] + 2);
        p1  = vec_lvsl(0, base + align * offset[1] + 2);
        p2  = vec_lvsl(0, base + align * offset[2] + 2);
        p3  = vec_lvsl(0, base + align * offset[3] + 2);
        t8  = vec_perm(t8, t8, p0);
        t9  = vec_perm(t9, t9, p1);
        t10 = vec_perm(t10, t10, p2);
        t11 = vec_perm(t11, t11, p3);
        t8  = vec_mergeh(t8, t10);
        t9  = vec_mergeh(t9, t11);
        v2->simdInternal_ = vec_mergeh(t8, t9);
    }
}

template <int align>
static inline void gmx_simdcall
transposeScatterStoreU(float *              base,
                       const std::int32_t   offset[],
                       SimdFloat            v0,
                       SimdFloat            v1,
                       SimdFloat            v2)
{
    __vector unsigned char p0, p1, p2, p3;

    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v2.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v2.simdInternal_);
    __vector float t4 = vec_mergeh(t0, t2);
    __vector float t5 = vec_mergel(t0, t2);
    __vector float t6 = vec_mergeh(t1, t3);
    __vector float t7 = vec_mergel(t1, t3);
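
    // After the transpose, row i of t4..t7 holds {v0[i], v1[i], v2[i], pad};
    // v2 merely fills the unused fourth lane. vec_lvsr builds a permute
    // control that rotates each row to match its (possibly unaligned) target
    // address, and vec_ste then stores the three useful elements one by one.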
    p0 = vec_lvsr(0, base + align * offset[0]);
    p1 = vec_lvsr(0, base + align * offset[1]);
    p2 = vec_lvsr(0, base + align * offset[2]);
    p3 = vec_lvsr(0, base + align * offset[3]);

    t4 = vec_perm(t4, t4, p0);
    t5 = vec_perm(t5, t5, p1);
    t6 = vec_perm(t6, t6, p2);
    t7 = vec_perm(t7, t7, p3);

    vec_ste(t4, 0, base + align * offset[0]);
    vec_ste(t4, 4, base + align * offset[0]);
    vec_ste(t4, 8, base + align * offset[0]);
    vec_ste(t5, 0, base + align * offset[1]);
    vec_ste(t5, 4, base + align * offset[1]);
    vec_ste(t5, 8, base + align * offset[1]);
    vec_ste(t6, 0, base + align * offset[2]);
    vec_ste(t6, 4, base + align * offset[2]);
    vec_ste(t6, 8, base + align * offset[2]);
    vec_ste(t7, 0, base + align * offset[3]);
    vec_ste(t7, 4, base + align * offset[3]);
    vec_ste(t7, 8, base + align * offset[3]);
}

template <int align>
static inline void gmx_simdcall
transposeScatterIncrU(float *              base,
                      const std::int32_t   offset[],
                      SimdFloat            v0,
                      SimdFloat            v1,
                      SimdFloat            v2)
{
    if (align % 4 == 0)
    {
        __vector float zero = reinterpret_cast<__vector float>(vec_splat_u32(0));
        __vector float t0   = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1   = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2   = vec_mergeh(v1.simdInternal_, zero);
        __vector float t3   = vec_mergel(v1.simdInternal_, zero);
        __vector float t4   = vec_mergeh(t0, t2);
        __vector float t5   = vec_mergel(t0, t2);
        __vector float t6   = vec_mergeh(t1, t3);
        __vector float t7   = vec_mergel(t1, t3);
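
        // Lane 3 of each transposed row is zero, so the full-vector
        // read-modify-write below leaves the float after each triplet intact.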
        vec_st( vec_add( vec_ld(0, base + align * offset[0]), t4), 0, base + align * offset[0]);
        vec_st( vec_add( vec_ld(0, base + align * offset[1]), t5), 0, base + align * offset[1]);
        vec_st( vec_add( vec_ld(0, base + align * offset[2]), t6), 0, base + align * offset[2]);
        vec_st( vec_add( vec_ld(0, base + align * offset[3]), t7), 0, base + align * offset[3]);
    }
    else
    {
        alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];

        vec_st(v0.simdInternal_, 0, rdata0);
        vec_st(v1.simdInternal_, 0, rdata1);
        vec_st(v2.simdInternal_, 0, rdata2);
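
        // With unknown alignment we must not read or write whole vectors at
        // the target addresses, so update the three floats per slot in scalar
        // code from the spilled data.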
        base[align*offset[0] + 0] += rdata0[0];
        base[align*offset[0] + 1] += rdata1[0];
        base[align*offset[0] + 2] += rdata2[0];
        base[align*offset[1] + 0] += rdata0[1];
        base[align*offset[1] + 1] += rdata1[1];
        base[align*offset[1] + 2] += rdata2[1];
        base[align*offset[2] + 0] += rdata0[2];
        base[align*offset[2] + 1] += rdata1[2];
        base[align*offset[2] + 2] += rdata2[2];
        base[align*offset[3] + 0] += rdata0[3];
        base[align*offset[3] + 1] += rdata1[3];
        base[align*offset[3] + 2] += rdata2[3];
    }
}

template <int align>
static inline void gmx_simdcall
transposeScatterDecrU(float *              base,
                      const std::int32_t   offset[],
                      SimdFloat            v0,
                      SimdFloat            v1,
                      SimdFloat            v2)
{
    if (align % 4 == 0)
    {
        __vector float zero = reinterpret_cast<__vector float>(vec_splat_u32(0));
        __vector float t0   = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1   = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2   = vec_mergeh(v1.simdInternal_, zero);
        __vector float t3   = vec_mergel(v1.simdInternal_, zero);
        __vector float t4   = vec_mergeh(t0, t2);
        __vector float t5   = vec_mergel(t0, t2);
        __vector float t6   = vec_mergeh(t1, t3);
        __vector float t7   = vec_mergel(t1, t3);
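
        // Same zero-padding trick as in transposeScatterIncrU(); the subtract
        // cannot disturb the fourth float in each target slot.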
        vec_st( vec_sub( vec_ld(0, base + align * offset[0]), t4), 0, base + align * offset[0]);
        vec_st( vec_sub( vec_ld(0, base + align * offset[1]), t5), 0, base + align * offset[1]);
        vec_st( vec_sub( vec_ld(0, base + align * offset[2]), t6), 0, base + align * offset[2]);
        vec_st( vec_sub( vec_ld(0, base + align * offset[3]), t7), 0, base + align * offset[3]);
    }
    else
    {
        alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];

        vec_st(v0.simdInternal_, 0, rdata0);
        vec_st(v1.simdInternal_, 0, rdata1);
        vec_st(v2.simdInternal_, 0, rdata2);
        base[align*offset[0] + 0] -= rdata0[0];
        base[align*offset[0] + 1] -= rdata1[0];
        base[align*offset[0] + 2] -= rdata2[0];
        base[align*offset[1] + 0] -= rdata0[1];
        base[align*offset[1] + 1] -= rdata1[1];
        base[align*offset[1] + 2] -= rdata2[1];
        base[align*offset[2] + 0] -= rdata0[2];
        base[align*offset[2] + 1] -= rdata1[2];
        base[align*offset[2] + 2] -= rdata2[2];
        base[align*offset[3] + 0] -= rdata0[3];
        base[align*offset[3] + 1] -= rdata1[3];
        base[align*offset[3] + 2] -= rdata2[3];
    }
}

static inline void gmx_simdcall
expandScalarsToTriplets(SimdFloat    scalar,
                        SimdFloat *  triplets0,
                        SimdFloat *  triplets1,
                        SimdFloat *  triplets2)
{
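    // Byte permutes that expand {s0, s1, s2, s3} into the triplet streams
    // {s0, s0, s0, s1}, {s1, s1, s2, s2} and {s2, s3, s3, s3}.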
    const __vector unsigned char perm0 = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm1 = { 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11 };
    const __vector unsigned char perm2 = { 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 };

    triplets0->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm0);
    triplets1->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm1);
    triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm2);
}

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float *  base,
                             SimdFInt32     offset,
                             SimdFloat *    v0,
                             SimdFloat *    v1,
                             SimdFloat *    v2,
                             SimdFloat *    v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
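
    // VMX has no true gather; store the SIMD offsets to a scratch array and
    // reuse the array-offset gather/transpose above.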
    vec_st( offset.simdInternal_, 0, ioffset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float *  base,
                             SimdFInt32     offset,
                             SimdFloat *    v0,
                             SimdFloat *    v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    vec_st( offset.simdInternal_, 0, ioffset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

static inline float gmx_simdcall
reduceIncr4ReturnSum(float *    m,
                     SimdFloat  v0,
                     SimdFloat  v1,
                     SimdFloat  v2,
                     SimdFloat  v3)
{
    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
    v0.simdInternal_ = vec_mergeh(t0, t2);
    v1.simdInternal_ = vec_mergel(t0, t2);
    v2.simdInternal_ = vec_mergeh(t1, t3);
    v3.simdInternal_ = vec_mergel(t1, t3);
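
    // After the transpose, summing the four vectors lane-wise leaves the
    // horizontal sum of each original input in the corresponding lane of v0.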
    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}

}      // namespace gmx

#endif // GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H