src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H
  37 #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H
  38
  39 #include "config.h"
  40
  41 #include <cassert>
  42 #include <cstddef>
  43 #include <cstdint>
  44
  45 #include <arm_neon.h>
  46
  47 #include "gromacs/utility/basedefinitions.h"
  48
  49 #include "impl_arm_neon_asimd_simd_double.h"
  50
  51 namespace gmx
  52 {
  53
  54 template <int align>
  55 static gmx_inline void gmx_simdcall
  56 gatherLoadTranspose(const double *        base,
  57                     const std::int32_t    offset[],
  58                     SimdDouble *          v0,
  59                     SimdDouble *          v1,
  60                     SimdDouble *          v2,
  61                     SimdDouble *          v3)
  62 {
  63     float64x2_t t1, t2, t3, t4;
  64
  65     assert(std::size_t(offset) % 8 == 0);
  66     assert(std::size_t(base) % 16 == 0);
  67     assert(align % 2 == 0);
  68
  69     t1                = vld1q_f64(base + align * offset[0]);
  70     t2                = vld1q_f64(base + align * offset[1]);
  71     t3                = vld1q_f64(base + align * offset[0] + 2);
  72     t4                = vld1q_f64(base + align * offset[1] + 2);
  73     v0->simdInternal_ = vuzp1q_f64(t1, t2);
  74     v1->simdInternal_ = vuzp2q_f64(t1, t2);
  75     v2->simdInternal_ = vuzp1q_f64(t3, t4);
  76     v3->simdInternal_ = vuzp2q_f64(t3, t4);
  77 }
  78
  79 template <int align>
  80 static gmx_inline void gmx_simdcall
  81 gatherLoadTranspose(const double *        base,
  82                     const std::int32_t    offset[],
  83                     SimdDouble *          v0,
  84                     SimdDouble *          v1)
  85 {
  86     float64x2_t t1, t2;
  87
  88     assert(std::size_t(offset) % 8 == 0);
  89     assert(std::size_t(base) % 16 == 0);
  90     assert(align % 2 == 0);
  91
  92     t1                = vld1q_f64(base + align * offset[0]);
  93     t2                = vld1q_f64(base + align * offset[1]);
  94     v0->simdInternal_ = vuzp1q_f64(t1, t2);
  95     v1->simdInternal_ = vuzp2q_f64(t1, t2);
  96 }
  97
  98 static const int c_simdBestPairAlignmentDouble = 2;
  99
 100 template <int align>
 101 static gmx_inline void gmx_simdcall
 102 gatherLoadUTranspose(const double *        base,
 103                      const std::int32_t    offset[],
 104                      SimdDouble *          v0,
 105                      SimdDouble *          v1,
 106                      SimdDouble *          v2)
 107 {
 108     float64x2_t t1, t2;
 109     float64x1_t t3, t4;
 110
 111     assert(std::size_t(offset) % 8 == 0);
 112
 113     t1                = vld1q_f64(base + align * offset[0]);
 114     t2                = vld1q_f64(base + align * offset[1]);
 115     t3                = vld1_f64(base + align * offset[0] + 2);
 116     t4                = vld1_f64(base + align * offset[1] + 2);
 117     v0->simdInternal_ = vuzp1q_f64(t1, t2);
 118     v1->simdInternal_ = vuzp2q_f64(t1, t2);
 119     v2->simdInternal_ = vcombine_f64(t3, t4);
 120 }
 121
 122 template <int align>
 123 static gmx_inline void gmx_simdcall
 124 transposeScatterStoreU(double *             base,
 125                        const std::int32_t   offset[],
 126                        SimdDouble           v0,
 127                        SimdDouble           v1,
 128                        SimdDouble           v2)
 129 {
 130     float64x2_t t0, t1;
 131
 132     assert(std::size_t(offset) % 8 == 0);
 133
 134     t0  = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_);
 135     t1  = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_);
 136     vst1q_f64(base + align * offset[0], t0);
 137     vst1q_f64(base + align * offset[1], t1);
 138     vst1_f64(base + align * offset[0] + 2, vget_low_f64(v2.simdInternal_));
 139     vst1_f64(base + align * offset[1] + 2, vget_high_f64(v2.simdInternal_));
 140 }
 141
 142 template <int align>
 143 static gmx_inline void gmx_simdcall
 144 transposeScatterIncrU(double *             base,
 145                       const std::int32_t   offset[],
 146                       SimdDouble           v0,
 147                       SimdDouble           v1,
 148                       SimdDouble           v2)
 149 {
 150     float64x2_t t0, t1, t2;
 151     float64x1_t t3;
 152
 153     assert(std::size_t(offset) % 8 == 0);
 154
 155     t0  = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0
 156     t1  = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1
 157
 158     t2 = vld1q_f64(base + align * offset[0]);
 159     t2 = vaddq_f64(t2, t0);
 160     vst1q_f64(base + align * offset[0], t2);
 161
 162     t3 = vld1_f64(base + align * offset[0] + 2);
 163     t3 = vadd_f64(t3, vget_low_f64(v2.simdInternal_));
 164     vst1_f64(base + align * offset[0] + 2, t3);
 165
 166     t2 = vld1q_f64(base + align * offset[1]);
 167     t2 = vaddq_f64(t2, t1);
 168     vst1q_f64(base + align * offset[1], t2);
 169
 170     t3 = vld1_f64(base + align * offset[1] + 2);
 171     t3 = vadd_f64(t3, vget_high_f64(v2.simdInternal_));
 172     vst1_f64(base + align * offset[1] + 2, t3);
 173 }
 174
 175 template <int align>
 176 static gmx_inline void gmx_simdcall
 177 transposeScatterDecrU(double *             base,
 178                       const std::int32_t   offset[],
 179                       SimdDouble           v0,
 180                       SimdDouble           v1,
 181                       SimdDouble           v2)
 182 {
 183     float64x2_t t0, t1, t2;
 184     float64x1_t t3;
 185
 186     assert(std::size_t(offset) % 8 == 0);
 187
 188     t0  = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0
 189     t1  = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1
 190
 191     t2 = vld1q_f64(base + align * offset[0]);
 192     t2 = vsubq_f64(t2, t0);
 193     vst1q_f64(base + align * offset[0], t2);
 194
 195     t3 = vld1_f64(base + align * offset[0] + 2);
 196     t3 = vsub_f64(t3, vget_low_f64(v2.simdInternal_));
 197     vst1_f64(base + align * offset[0] + 2, t3);
 198
 199     t2 = vld1q_f64(base + align * offset[1]);
 200     t2 = vsubq_f64(t2, t1);
 201     vst1q_f64(base + align * offset[1], t2);
 202
 203     t3 = vld1_f64(base + align * offset[1] + 2);
 204     t3 = vsub_f64(t3, vget_high_f64(v2.simdInternal_));
 205     vst1_f64(base + align * offset[1] + 2, t3);
 206 }
 207
 208 static gmx_inline void gmx_simdcall
 209 expandScalarsToTriplets(SimdDouble    scalar,
 210                         SimdDouble *  triplets0,
 211                         SimdDouble *  triplets1,
 212                         SimdDouble *  triplets2)
 213 {
 214     triplets0->simdInternal_ = vuzp1q_f64(scalar.simdInternal_, scalar.simdInternal_);
 215     triplets1->simdInternal_ = scalar.simdInternal_;
 216     triplets2->simdInternal_ = vuzp2q_f64(scalar.simdInternal_, scalar.simdInternal_);
 217 }
 218
 219 template <int align>
 220 static gmx_inline void gmx_simdcall
 221 gatherLoadBySimdIntTranspose(const double *  base,
 222                              SimdDInt32      offset,
 223                              SimdDouble *    v0,
 224                              SimdDouble *    v1,
 225                              SimdDouble *    v2,
 226                              SimdDouble *    v3)
 227 {
 228     GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH)  ioffset[GMX_SIMD_DINT32_WIDTH];
 229
 230     assert(std::size_t(base) % 16 == 0);
 231     assert(align % 2 == 0);
 232
 233     vst1_s32(ioffset, offset.simdInternal_);
 234     gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
 235 }
 236
 237
 238 template <int align>
 239 static gmx_inline void gmx_simdcall
 240 gatherLoadBySimdIntTranspose(const double *  base,
 241                              SimdDInt32      offset,
 242                              SimdDouble *    v0,
 243                              SimdDouble *    v1)
 244 {
 245     GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH)  ioffset[GMX_SIMD_DINT32_WIDTH];
 246
 247     assert(std::size_t(base) % 16 == 0);
 248     assert(align % 2 == 0);
 249
 250     vst1_s32(ioffset, offset.simdInternal_);
 251     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 252 }
 253
 254 template <int align>
 255 static gmx_inline void gmx_simdcall
 256 gatherLoadUBySimdIntTranspose(const double *  base,
 257                               SimdDInt32      offset,
 258                               SimdDouble *    v0,
 259                               SimdDouble *    v1)
 260 {
 261     GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH)  ioffset[GMX_SIMD_DINT32_WIDTH];
 262
 263     vst1_s32(ioffset, offset.simdInternal_);
 264
 265     float64x2_t t1, t2;
 266
 267     t1                = vld1q_f64(base + align * ioffset[0]);
 268     t2                = vld1q_f64(base + align * ioffset[1]);
 269     v0->simdInternal_ = vuzp1q_f64(t1, t2);
 270     v1->simdInternal_ = vuzp2q_f64(t1, t2);
 271 }
 272
 273
 274 static gmx_inline double gmx_simdcall
 275 reduceIncr4ReturnSum(double *    m,
 276                      SimdDouble  v0,
 277                      SimdDouble  v1,
 278                      SimdDouble  v2,
 279                      SimdDouble  v3)
 280 {
 281     float64x2_t t1, t2, t3, t4;
 282
 283     assert(std::size_t(m) % 8 == 0);
 284
 285     t1 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_);
 286     t2 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_);
 287     t3 = vuzp1q_f64(v2.simdInternal_, v3.simdInternal_);
 288     t4 = vuzp2q_f64(v2.simdInternal_, v3.simdInternal_);
 289
 290     t1 = vaddq_f64(t1, t2);
 291     t3 = vaddq_f64(t3, t4);
 292
 293     t2 = vaddq_f64(t1, vld1q_f64(m));
 294     t4 = vaddq_f64(t3, vld1q_f64(m + 2));
 295     vst1q_f64(m, t2);
 296     vst1q_f64(m + 2, t4);
 297
 298     t1 = vaddq_f64(t1, t3);
 299     t2 = vpaddq_f64(t1, t1);
 300
 301     return vgetq_lane_f64(t2, 0);
 302 }
 303
 304 }      // namespace gmx
 305
 306 #endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H