2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2016, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPLEMENTATION_IBM_QPX_UTIL_DOUBLE_H
37 #define GMX_SIMD_IMPLEMENTATION_IBM_QPX_UTIL_DOUBLE_H
41 // Assert is buggy on xlc with high optimization, so we skip it for QPX
49 #include "gromacs/utility/basedefinitions.h"
51 #include "impl_ibm_qpx_simd_double.h"
57 static inline void gmx_simdcall
58 gatherLoadTranspose(const double * base,
59 const std::int32_t offset[],
65 v0->simdInternal_ = vec_ld(0, const_cast<double *>(base + align * offset[0]) );
66 v1->simdInternal_ = vec_ld(0, const_cast<double *>(base + align * offset[1]) );
67 v2->simdInternal_ = vec_ld(0, const_cast<double *>(base + align * offset[2]) );
68 v3->simdInternal_ = vec_ld(0, const_cast<double *>(base + align * offset[3]) );
70 vector4double t0 = vec_perm(v0->simdInternal_, v2->simdInternal_, vec_gpci(00415));
71 vector4double t1 = vec_perm(v0->simdInternal_, v2->simdInternal_, vec_gpci(02637));
72 vector4double t2 = vec_perm(v1->simdInternal_, v3->simdInternal_, vec_gpci(00415));
73 vector4double t3 = vec_perm(v1->simdInternal_, v3->simdInternal_, vec_gpci(02637));
74 v0->simdInternal_ = vec_perm(t0, t2, vec_gpci(00415));
75 v1->simdInternal_ = vec_perm(t0, t2, vec_gpci(02637));
76 v2->simdInternal_ = vec_perm(t1, t3, vec_gpci(00415));
77 v3->simdInternal_ = vec_perm(t1, t3, vec_gpci(02637));
81 static inline void gmx_simdcall
82 gatherLoadTranspose(const double * base,
83 const std::int32_t offset[],
87 vector4double t0, t1, t2, t3;
89 t0 = vec_ld2(0, const_cast<double *>(base + align * offset[0]) );
90 t1 = vec_ld2(0, const_cast<double *>(base + align * offset[1]) );
91 t2 = vec_ld2(0, const_cast<double *>(base + align * offset[2]) );
92 t3 = vec_ld2(0, const_cast<double *>(base + align * offset[3]) );
93 t0 = vec_perm(t0, t2, vec_gpci(00415));
94 t1 = vec_perm(t1, t3, vec_gpci(00415));
95 v0->simdInternal_ = vec_perm(t0, t1, vec_gpci(00415));
96 v1->simdInternal_ = vec_perm(t0, t1, vec_gpci(02637));
/*! \brief Best alignment to use for aligned pairs of double data.
 *
 * QPX has a native pair-load instruction (vec_ld2), so pair data is best
 * stored with alignment 2 rather than padded to the full vector width.
 */
static const int c_simdBestPairAlignmentDouble = 2;
102 static inline void gmx_simdcall
103 gatherLoadUTranspose(const double * base,
104 const std::int32_t offset[],
109 vector4double t1, t2, t3, t4, t5, t6, t7, t8;
114 gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
118 t1 = vec_perm(vec_splats(base[align * offset[0]]), vec_splats(base[align * offset[0] + 1]), vec_gpci(00415));
119 t2 = vec_perm(vec_splats(base[align * offset[1]]), vec_splats(base[align * offset[1] + 1]), vec_gpci(00415));
120 t3 = vec_perm(vec_splats(base[align * offset[2]]), vec_splats(base[align * offset[2] + 1]), vec_gpci(00415));
121 t4 = vec_perm(vec_splats(base[align * offset[3]]), vec_splats(base[align * offset[3] + 1]), vec_gpci(00415));
123 t5 = vec_splats( *(base + align * offset[0] + 2) );
124 t6 = vec_splats( *(base + align * offset[1] + 2) );
125 t7 = vec_splats( *(base + align * offset[2] + 2) );
126 t8 = vec_splats( *(base + align * offset[3] + 2) );
128 t1 = vec_perm(t1, t2, vec_gpci(00415));
129 t3 = vec_perm(t3, t4, vec_gpci(00415));
130 v0->simdInternal_ = vec_perm(t1, t3, vec_gpci(00145));
131 v1->simdInternal_ = vec_perm(t3, t1, vec_gpci(06723));
132 t5 = vec_perm(t5, t6, vec_gpci(00415));
133 t7 = vec_perm(t7, t8, vec_gpci(00415));
134 v2->simdInternal_ = vec_perm(t5, t7, vec_gpci(00145));
139 static inline void gmx_simdcall
140 transposeScatterStoreU(double * base,
141 const std::int32_t offset[],
146 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m0[GMX_SIMD_DOUBLE_WIDTH];
147 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m1[GMX_SIMD_DOUBLE_WIDTH];
148 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m2[GMX_SIMD_DOUBLE_WIDTH];
154 base[align * offset[0] ] = m0[0];
155 base[align * offset[0] + 1] = m1[0];
156 base[align * offset[0] + 2] = m2[0];
157 base[align * offset[1] ] = m0[1];
158 base[align * offset[1] + 1] = m1[1];
159 base[align * offset[1] + 2] = m2[1];
160 base[align * offset[2] ] = m0[2];
161 base[align * offset[2] + 1] = m1[2];
162 base[align * offset[2] + 2] = m2[2];
163 base[align * offset[3] ] = m0[3];
164 base[align * offset[3] + 1] = m1[3];
165 base[align * offset[3] + 2] = m2[3];
169 static inline void gmx_simdcall
170 transposeScatterIncrU(double * base,
171 const std::int32_t offset[],
180 vector4double t0 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(00415));
181 vector4double t1 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(02637));
182 vector4double t2 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(00415));
183 vector4double t3 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(02637));
184 v0.simdInternal_ = vec_perm(t0, t2, vec_gpci(00415));
185 v1.simdInternal_ = vec_perm(t0, t2, vec_gpci(02637));
186 v2.simdInternal_ = vec_perm(t1, t3, vec_gpci(00415));
187 v3.simdInternal_ = vec_perm(t1, t3, vec_gpci(02637));
189 store(base + align*offset[0], simdLoad(base + align*offset[0]) + v0);
190 store(base + align*offset[1], simdLoad(base + align*offset[1]) + v1);
191 store(base + align*offset[2], simdLoad(base + align*offset[2]) + v2);
192 store(base + align*offset[3], simdLoad(base + align*offset[3]) + v3);
196 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m0[GMX_SIMD_DOUBLE_WIDTH];
197 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m1[GMX_SIMD_DOUBLE_WIDTH];
198 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m2[GMX_SIMD_DOUBLE_WIDTH];
204 base[align * offset[0] ] += m0[0];
205 base[align * offset[0] + 1] += m1[0];
206 base[align * offset[0] + 2] += m2[0];
207 base[align * offset[1] ] += m0[1];
208 base[align * offset[1] + 1] += m1[1];
209 base[align * offset[1] + 2] += m2[1];
210 base[align * offset[2] ] += m0[2];
211 base[align * offset[2] + 1] += m1[2];
212 base[align * offset[2] + 2] += m2[2];
213 base[align * offset[3] ] += m0[3];
214 base[align * offset[3] + 1] += m1[3];
215 base[align * offset[3] + 2] += m2[3];
220 static inline void gmx_simdcall
221 transposeScatterDecrU(double * base,
222 const std::int32_t offset[],
231 vector4double t0 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(00415));
232 vector4double t1 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(02637));
233 vector4double t2 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(00415));
234 vector4double t3 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(02637));
235 v0.simdInternal_ = vec_perm(t0, t2, vec_gpci(00415));
236 v1.simdInternal_ = vec_perm(t0, t2, vec_gpci(02637));
237 v2.simdInternal_ = vec_perm(t1, t3, vec_gpci(00415));
238 v3.simdInternal_ = vec_perm(t1, t3, vec_gpci(02637));
240 store(base + align*offset[0], simdLoad(base + align*offset[0]) - v0);
241 store(base + align*offset[1], simdLoad(base + align*offset[1]) - v1);
242 store(base + align*offset[2], simdLoad(base + align*offset[2]) - v2);
243 store(base + align*offset[3], simdLoad(base + align*offset[3]) - v3);
247 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m0[GMX_SIMD_DOUBLE_WIDTH];
248 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m1[GMX_SIMD_DOUBLE_WIDTH];
249 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m2[GMX_SIMD_DOUBLE_WIDTH];
255 base[align * offset[0] ] -= m0[0];
256 base[align * offset[0] + 1] -= m1[0];
257 base[align * offset[0] + 2] -= m2[0];
258 base[align * offset[1] ] -= m0[1];
259 base[align * offset[1] + 1] -= m1[1];
260 base[align * offset[1] + 2] -= m2[1];
261 base[align * offset[2] ] -= m0[2];
262 base[align * offset[2] + 1] -= m1[2];
263 base[align * offset[2] + 2] -= m2[2];
264 base[align * offset[3] ] -= m0[3];
265 base[align * offset[3] + 1] -= m1[3];
266 base[align * offset[3] + 2] -= m2[3];
270 static inline void gmx_simdcall
271 expandScalarsToTriplets(SimdDouble scalar,
272 SimdDouble * triplets0,
273 SimdDouble * triplets1,
274 SimdDouble * triplets2)
276 triplets0->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, vec_gpci(00001));
277 triplets1->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, vec_gpci(01122));
278 triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, vec_gpci(02333));
282 static inline void gmx_simdcall
283 gatherLoadBySimdIntTranspose(const double * base,
284 SimdDInt32 simdoffset,
290 GMX_ALIGNED(int, GMX_SIMD_DOUBLE_WIDTH) ioffset[GMX_SIMD_DOUBLE_WIDTH];
292 store(ioffset, simdoffset);
293 gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
297 static inline void gmx_simdcall
298 gatherLoadBySimdIntTranspose(const double * base,
299 SimdDInt32 simdoffset,
303 GMX_ALIGNED(int, GMX_SIMD_DOUBLE_WIDTH) ioffset[GMX_SIMD_DOUBLE_WIDTH];
305 store(ioffset, simdoffset);
306 gatherLoadTranspose<align>(base, ioffset, v0, v1);
309 static inline double gmx_simdcall
310 reduceIncr4ReturnSum(double * m,
316 vector4double t0 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(00415));
317 vector4double t1 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(02637));
318 vector4double t2 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(00415));
319 vector4double t3 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(02637));
320 v0.simdInternal_ = vec_perm(t0, t2, vec_gpci(00415));
321 v1.simdInternal_ = vec_perm(t0, t2, vec_gpci(02637));
322 v2.simdInternal_ = vec_perm(t1, t3, vec_gpci(00415));
323 v3.simdInternal_ = vec_perm(t1, t3, vec_gpci(02637));
328 v2 = v0 + simdLoad(m);
336 #endif // GMX_SIMD_IMPLEMENTATION_IBM_QPX_UTIL_DOUBLE_H