/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <arm_neon.h>

#include "impl_arm_neon_asimd_simd_double.h"

namespace gmx
{
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
                                                    const std::int32_t offset[],
                                                    SimdDouble*        v0,
                                                    SimdDouble*        v1,
                                                    SimdDouble*        v2,
                                                    SimdDouble*        v3)
{
    float64x2_t t1, t2, t3, t4;

    assert(std::size_t(offset) % 8 == 0);
    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    t1                = vld1q_f64(base + align * offset[0]);
    t2                = vld1q_f64(base + align * offset[1]);
    t3                = vld1q_f64(base + align * offset[0] + 2);
    t4                = vld1q_f64(base + align * offset[1] + 2);
    v0->simdInternal_ = vuzp1q_f64(t1, t2);
    v1->simdInternal_ = vuzp2q_f64(t1, t2);
    v2->simdInternal_ = vuzp1q_f64(t3, t4);
    v3->simdInternal_ = vuzp2q_f64(t3, t4);
}
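
/* A minimal usage sketch for the four-vector gather (illustrative only; the
 * array name xyzw and the data layout are assumptions, not part of this file).
 * With align = 4, four consecutive doubles per index are gathered for the two
 * offsets and transposed so each output holds one component from both rows:
 *
 *     alignas(16) double            xyzw[4 * 100]; // hypothetical packed data
 *     alignas(8) const std::int32_t idx[2] = { 3, 7 };
 *     SimdDouble                    x, y, z, w;
 *     gatherLoadTranspose<4>(xyzw, idx, &x, &y, &z, &w);
 *     // x now holds { xyzw[12], xyzw[28] }, i.e. element 0 of rows 3 and 7.
 */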
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
{
    float64x2_t t1, t2;

    assert(std::size_t(offset) % 8 == 0);
    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    t1                = vld1q_f64(base + align * offset[0]);
    t2                = vld1q_f64(base + align * offset[1]);
    v0->simdInternal_ = vuzp1q_f64(t1, t2);
    v1->simdInternal_ = vuzp2q_f64(t1, t2);
}
static const int c_simdBestPairAlignmentDouble = 2;
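
/* Sketch of how the pair variant and c_simdBestPairAlignmentDouble work
 * together (illustrative; the table name and indices are assumptions). Pairs
 * stored with this alignment are fetched with a single 128-bit load per row
 * before the transpose:
 *
 *     alignas(16) double            c6c12[2 * 64]; // hypothetical parameter pairs
 *     alignas(8) const std::int32_t idx[2] = { 0, 5 };
 *     SimdDouble                    c6, c12;
 *     gatherLoadTranspose<c_simdBestPairAlignmentDouble>(c6c12, idx, &c6, &c12);
 */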
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
                                                     const std::int32_t offset[],
                                                     SimdDouble*        v0,
                                                     SimdDouble*        v1,
                                                     SimdDouble*        v2)
{
    float64x2_t t1, t2;
    float64x1_t t3, t4;

    assert(std::size_t(offset) % 8 == 0);

    t1                = vld1q_f64(base + align * offset[0]);
    t2                = vld1q_f64(base + align * offset[1]);
    t3                = vld1_f64(base + align * offset[0] + 2);
    t4                = vld1_f64(base + align * offset[1] + 2);
    v0->simdInternal_ = vuzp1q_f64(t1, t2);
    v1->simdInternal_ = vuzp2q_f64(t1, t2);
    v2->simdInternal_ = vcombine_f64(t3, t4);
}
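
/* Unaligned-gather sketch (illustrative; the rvec-style layout is an
 * assumption). Only three doubles are read per index: the third components
 * are loaded as 64-bit scalars and combined, so base needs no particular
 * alignment:
 *
 *     double                        xyz[3 * 100]; // hypothetical x,y,z triplets
 *     alignas(8) const std::int32_t idx[2] = { 1, 4 };
 *     SimdDouble                    x, y, z;
 *     gatherLoadUTranspose<3>(xyz, idx, &x, &y, &z);
 */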
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(double*            base,
                                                       const std::int32_t offset[],
                                                       SimdDouble         v0,
                                                       SimdDouble         v1,
                                                       SimdDouble         v2)
{
    float64x2_t t0, t1;

    assert(std::size_t(offset) % 8 == 0);

    t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_);
    t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_);
    vst1q_f64(base + align * offset[0], t0);
    vst1q_f64(base + align * offset[1], t1);
    vst1_f64(base + align * offset[0] + 2, vget_low_f64(v2.simdInternal_));
    vst1_f64(base + align * offset[1] + 2, vget_high_f64(v2.simdInternal_));
}
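
/* Scatter-store sketch, the inverse of the unaligned gather above
 * (illustrative only; fx/fy/fz and the array name are assumptions):
 *
 *     transposeScatterStoreU<3>(forces, idx, fx, fy, fz);
 *
 * The third component is written as two 64-bit stores, so exactly three
 * doubles per index are touched and neighbouring data is left intact.
 */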
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    float64x2_t t0, t1, t2;
    float64x1_t t3;

    assert(std::size_t(offset) % 8 == 0);

    t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0
    t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1

    t2 = vld1q_f64(base + align * offset[0]);
    t2 = vaddq_f64(t2, t0);
    vst1q_f64(base + align * offset[0], t2);

    t3 = vld1_f64(base + align * offset[0] + 2);
    t3 = vadd_f64(t3, vget_low_f64(v2.simdInternal_));
    vst1_f64(base + align * offset[0] + 2, t3);

    t2 = vld1q_f64(base + align * offset[1]);
    t2 = vaddq_f64(t2, t1);
    vst1q_f64(base + align * offset[1], t2);

    t3 = vld1_f64(base + align * offset[1] + 2);
    t3 = vadd_f64(t3, vget_high_f64(v2.simdInternal_));
    vst1_f64(base + align * offset[1] + 2, t3);
}
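
/* Increment-scatter sketch (illustrative; the accumulation pattern mirrors
 * how kernels typically add partial results back to memory, but the array
 * and index names are assumptions):
 *
 *     transposeScatterIncrU<3>(forces, idx, fx, fy, fz);
 *     // forces[3 * idx[i] + d] += lane i of { fx, fy, fz }[d], for i = 0, 1.
 */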
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    float64x2_t t0, t1, t2;
    float64x1_t t3;

    assert(std::size_t(offset) % 8 == 0);

    t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0
    t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1

    t2 = vld1q_f64(base + align * offset[0]);
    t2 = vsubq_f64(t2, t0);
    vst1q_f64(base + align * offset[0], t2);

    t3 = vld1_f64(base + align * offset[0] + 2);
    t3 = vsub_f64(t3, vget_low_f64(v2.simdInternal_));
    vst1_f64(base + align * offset[0] + 2, t3);

    t2 = vld1q_f64(base + align * offset[1]);
    t2 = vsubq_f64(t2, t1);
    vst1q_f64(base + align * offset[1], t2);

    t3 = vld1_f64(base + align * offset[1] + 2);
    t3 = vsub_f64(t3, vget_high_f64(v2.simdInternal_));
    vst1_f64(base + align * offset[1] + 2, t3);
}
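
/* The decrement variant is the exact mirror of transposeScatterIncrU: the
 * same read-modify-write sequence with vsubq_f64/vsub_f64, so (illustrative)
 * transposeScatterDecrU<3>(forces, idx, fx, fy, fz) subtracts the same lanes
 * the increment form would add.
 */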
static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
                                                        SimdDouble* triplets0,
                                                        SimdDouble* triplets1,
                                                        SimdDouble* triplets2)
{
    triplets0->simdInternal_ = vuzp1q_f64(scalar.simdInternal_, scalar.simdInternal_);
    triplets1->simdInternal_ = scalar.simdInternal_;
    triplets2->simdInternal_ = vuzp2q_f64(scalar.simdInternal_, scalar.simdInternal_);
}
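
/* With a two-lane SIMD width, expanding scalars { a, b } yields { a, a },
 * { a, b }, { b, b }: read consecutively that is a,a,a,b,b,b, i.e. each
 * scalar repeated three times. A sketch (values are illustrative):
 *
 *     SimdDouble s;                 // lanes { a, b }, hypothetical
 *     SimdDouble t0, t1, t2;
 *     expandScalarsToTriplets(s, &t0, &t1, &t2);
 *     // t0 = { a, a }, t1 = { a, b }, t2 = { b, b }
 */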
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
                                                             SimdDInt32    offset,
                                                             SimdDouble*   v0,
                                                             SimdDouble*   v1,
                                                             SimdDouble*   v2,
                                                             SimdDouble*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];

    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    vst1_s32(ioffset, offset.simdInternal_);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];

    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    vst1_s32(ioffset, offset.simdInternal_);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
    float64x2_t                              t1, t2;

    vst1_s32(ioffset, offset.simdInternal_);

    t1                = vld1q_f64(base + align * ioffset[0]);
    t2                = vld1q_f64(base + align * ioffset[1]);
    v0->simdInternal_ = vuzp1q_f64(t1, t2);
    v1->simdInternal_ = vuzp2q_f64(t1, t2);
}
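
/* The SimdDInt32-offset variants only differ in how the indices arrive: they
 * are spilled from the integer SIMD register into an aligned scratch array
 * and then handled like the pointer-offset forms. A sketch (illustrative;
 * the table name is an assumption):
 *
 *     SimdDInt32 idx;               // two 32-bit indices, hypothetical
 *     SimdDouble v0, v1;
 *     gatherLoadUBySimdIntTranspose<2>(table, idx, &v0, &v1);
 */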
static inline double gmx_simdcall
reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
{
    float64x2_t t1, t2, t3, t4;

    assert(std::size_t(m) % 8 == 0);

    t1 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_);
    t2 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_);
    t3 = vuzp1q_f64(v2.simdInternal_, v3.simdInternal_);
    t4 = vuzp2q_f64(v2.simdInternal_, v3.simdInternal_);

    t1 = vaddq_f64(t1, t2); // { sum(v0), sum(v1) }
    t3 = vaddq_f64(t3, t4); // { sum(v2), sum(v3) }

    t2 = vaddq_f64(t1, vld1q_f64(m));
    t4 = vaddq_f64(t3, vld1q_f64(m + 2));
    vst1q_f64(m, t2);
    vst1q_f64(m + 2, t4);

    t1 = vaddq_f64(t1, t3);
    t2 = vpaddq_f64(t1, t1);

    return vgetq_lane_f64(t2, 0);
}
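
/* Reduction sketch (illustrative): after the four horizontal sums, m[0..3]
 * have been incremented by the per-vector totals and the grand total is
 * returned:
 *
 *     alignas(16) double m[4] = { 0.0, 0.0, 0.0, 0.0 };
 *     double total = reduceIncr4ReturnSum(m, fx, fy, fz, fw);
 *     // m[0] += sum(fx lanes), ..., m[3] += sum(fw lanes);
 *     // total == sum of all four vector sums.
 */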
} // namespace gmx

#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H