src/gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
   5  * Copyright (c) 2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 #ifndef GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H
  37 #define GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H
  38
  39 #include "config.h"
  40
  41 #include <cassert>
  42 #include <cstddef>
  43 #include <cstdint>
  44
  45 #include <arm_neon.h>
  46
  47 #include "gromacs/utility/basedefinitions.h"
  48
  49 #include "impl_arm_neon_simd_float.h"
  50
  51
  52 namespace gmx
  53 {
  54
  55 template<int align>
  56 static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
  57                                                     const std::int32_t offset[],
  58                                                     SimdFloat*         v0,
  59                                                     SimdFloat*         v1,
  60                                                     SimdFloat*         v2,
  61                                                     SimdFloat*         v3)
  62 {
  63     assert(std::size_t(offset) % 16 == 0);
  64     assert(std::size_t(base) % 16 == 0);
  65     assert(align % 4 == 0);
  66
  67     // Unfortunately we cannot use the beautiful Neon structured load
  68     // instructions since the data comes from four different memory locations.
  69     float32x4x2_t t0 =
  70             vuzpq_f32(vld1q_f32(base + align * offset[0]), vld1q_f32(base + align * offset[2]));
  71     float32x4x2_t t1 =
  72             vuzpq_f32(vld1q_f32(base + align * offset[1]), vld1q_f32(base + align * offset[3]));
  73     float32x4x2_t t2  = vtrnq_f32(t0.val[0], t1.val[0]);
  74     float32x4x2_t t3  = vtrnq_f32(t0.val[1], t1.val[1]);
  75     v0->simdInternal_ = t2.val[0];
  76     v1->simdInternal_ = t3.val[0];
  77     v2->simdInternal_ = t2.val[1];
  78     v3->simdInternal_ = t3.val[1];
  79 }
  80
  81 template<int align>
  82 static inline void gmx_simdcall
  83                    gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
  84 {
  85     assert(std::size_t(offset) % 16 == 0);
  86     assert(std::size_t(base) % 8 == 0);
  87     assert(align % 2 == 0);
  88
  89     v0->simdInternal_ =
  90             vcombine_f32(vld1_f32(base + align * offset[0]), vld1_f32(base + align * offset[2]));
  91     v1->simdInternal_ =
  92             vcombine_f32(vld1_f32(base + align * offset[1]), vld1_f32(base + align * offset[3]));
  93
  94     float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);
  95
  96     v0->simdInternal_ = tmp.val[0];
  97     v1->simdInternal_ = tmp.val[1];
  98 }
  99
 100 static const int c_simdBestPairAlignmentFloat = 2;
 101
 102 template<int align>
 103 static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
 104                                                      const std::int32_t offset[],
 105                                                      SimdFloat*         v0,
 106                                                      SimdFloat*         v1,
 107                                                      SimdFloat*         v2)
 108 {
 109     assert(std::size_t(offset) % 16 == 0);
 110
 111     float32x4x2_t t0 =
 112             vuzpq_f32(vld1q_f32(base + align * offset[0]), vld1q_f32(base + align * offset[2]));
 113     float32x4x2_t t1 =
 114             vuzpq_f32(vld1q_f32(base + align * offset[1]), vld1q_f32(base + align * offset[3]));
 115     float32x4x2_t t2  = vtrnq_f32(t0.val[0], t1.val[0]);
 116     float32x4x2_t t3  = vtrnq_f32(t0.val[1], t1.val[1]);
 117     v0->simdInternal_ = t2.val[0];
 118     v1->simdInternal_ = t3.val[0];
 119     v2->simdInternal_ = t2.val[1];
 120 }
 121
 122
 123 template<int align>
 124 static inline void gmx_simdcall
 125                    transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 126 {
 127     assert(std::size_t(offset) % 16 == 0);
 128
 129     float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);
 130
 131     vst1_f32(base + align * offset[0], vget_low_f32(tmp.val[0]));
 132     vst1_f32(base + align * offset[1], vget_low_f32(tmp.val[1]));
 133     vst1_f32(base + align * offset[2], vget_high_f32(tmp.val[0]));
 134     vst1_f32(base + align * offset[3], vget_high_f32(tmp.val[1]));
 135
 136     vst1q_lane_f32(base + align * offset[0] + 2, v2.simdInternal_, 0);
 137     vst1q_lane_f32(base + align * offset[1] + 2, v2.simdInternal_, 1);
 138     vst1q_lane_f32(base + align * offset[2] + 2, v2.simdInternal_, 2);
 139     vst1q_lane_f32(base + align * offset[3] + 2, v2.simdInternal_, 3);
 140 }
 141
 142
 143 template<int align>
 144 static inline void gmx_simdcall
 145                    transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 146 {
 147     assert(std::size_t(offset) % 16 == 0);
 148
 149     if (align < 4)
 150     {
 151         float32x2_t   t0, t1, t2, t3;
 152         float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);
 153
 154         t0 = vget_low_f32(tmp.val[0]);
 155         t1 = vget_low_f32(tmp.val[1]);
 156         t2 = vget_high_f32(tmp.val[0]);
 157         t3 = vget_high_f32(tmp.val[1]);
 158
 159         t0 = vadd_f32(t0, vld1_f32(base + align * offset[0]));
 160         vst1_f32(base + align * offset[0], t0);
 161         base[align * offset[0] + 2] += vgetq_lane_f32(v2.simdInternal_, 0);
 162
 163         t1 = vadd_f32(t1, vld1_f32(base + align * offset[1]));
 164         vst1_f32(base + align * offset[1], t1);
 165         base[align * offset[1] + 2] += vgetq_lane_f32(v2.simdInternal_, 1);
 166
 167         t2 = vadd_f32(t2, vld1_f32(base + align * offset[2]));
 168         vst1_f32(base + align * offset[2], t2);
 169         base[align * offset[2] + 2] += vgetq_lane_f32(v2.simdInternal_, 2);
 170
 171         t3 = vadd_f32(t3, vld1_f32(base + align * offset[3]));
 172         vst1_f32(base + align * offset[3], t3);
 173         base[align * offset[3] + 2] += vgetq_lane_f32(v2.simdInternal_, 3);
 174     }
 175     else
 176     {
 177         // Extra elements means we can use full width-4 load/store operations
 178         float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
 179         float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
 180         float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
 181         float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
 182         float32x4_t   t4 = t2.val[0];
 183         float32x4_t   t5 = t3.val[0];
 184         float32x4_t   t6 = t2.val[1];
 185         float32x4_t   t7 = t3.val[1];
 186
 187         vst1q_f32(base + align * offset[0], vaddq_f32(t4, vld1q_f32(base + align * offset[0])));
 188         vst1q_f32(base + align * offset[1], vaddq_f32(t5, vld1q_f32(base + align * offset[1])));
 189         vst1q_f32(base + align * offset[2], vaddq_f32(t6, vld1q_f32(base + align * offset[2])));
 190         vst1q_f32(base + align * offset[3], vaddq_f32(t7, vld1q_f32(base + align * offset[3])));
 191     }
 192 }
 193
 194 template<int align>
 195 static inline void gmx_simdcall
 196                    transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 197 {
 198     assert(std::size_t(offset) % 16 == 0);
 199
 200     if (align < 4)
 201     {
 202         float32x2_t   t0, t1, t2, t3;
 203         float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);
 204
 205         t0 = vget_low_f32(tmp.val[0]);
 206         t1 = vget_low_f32(tmp.val[1]);
 207         t2 = vget_high_f32(tmp.val[0]);
 208         t3 = vget_high_f32(tmp.val[1]);
 209
 210         t0 = vsub_f32(vld1_f32(base + align * offset[0]), t0);
 211         vst1_f32(base + align * offset[0], t0);
 212         base[align * offset[0] + 2] -= vgetq_lane_f32(v2.simdInternal_, 0);
 213
 214         t1 = vsub_f32(vld1_f32(base + align * offset[1]), t1);
 215         vst1_f32(base + align * offset[1], t1);
 216         base[align * offset[1] + 2] -= vgetq_lane_f32(v2.simdInternal_, 1);
 217
 218         t2 = vsub_f32(vld1_f32(base + align * offset[2]), t2);
 219         vst1_f32(base + align * offset[2], t2);
 220         base[align * offset[2] + 2] -= vgetq_lane_f32(v2.simdInternal_, 2);
 221
 222         t3 = vsub_f32(vld1_f32(base + align * offset[3]), t3);
 223         vst1_f32(base + align * offset[3], t3);
 224         base[align * offset[3] + 2] -= vgetq_lane_f32(v2.simdInternal_, 3);
 225     }
 226     else
 227     {
 228         // Extra elements means we can use full width-4 load/store operations
 229         float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
 230         float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
 231         float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
 232         float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
 233         float32x4_t   t4 = t2.val[0];
 234         float32x4_t   t5 = t3.val[0];
 235         float32x4_t   t6 = t2.val[1];
 236         float32x4_t   t7 = t3.val[1];
 237
 238         vst1q_f32(base + align * offset[0], vsubq_f32(vld1q_f32(base + align * offset[0]), t4));
 239         vst1q_f32(base + align * offset[1], vsubq_f32(vld1q_f32(base + align * offset[1]), t5));
 240         vst1q_f32(base + align * offset[2], vsubq_f32(vld1q_f32(base + align * offset[2]), t6));
 241         vst1q_f32(base + align * offset[3], vsubq_f32(vld1q_f32(base + align * offset[3]), t7));
 242     }
 243 }
 244
 245 static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
 246                                                         SimdFloat* triplets0,
 247                                                         SimdFloat* triplets1,
 248                                                         SimdFloat* triplets2)
 249 {
 250     float32x2_t lo, hi;
 251     float32x4_t t0, t1, t2, t3;
 252
 253     lo = vget_low_f32(scalar.simdInternal_);
 254     hi = vget_high_f32(scalar.simdInternal_);
 255
 256     t0 = vdupq_lane_f32(lo, 0);
 257     t1 = vdupq_lane_f32(lo, 1);
 258     t2 = vdupq_lane_f32(hi, 0);
 259     t3 = vdupq_lane_f32(hi, 1);
 260
 261     triplets0->simdInternal_ = vextq_f32(t0, t1, 1);
 262     triplets1->simdInternal_ = vextq_f32(t1, t2, 2);
 263     triplets2->simdInternal_ = vextq_f32(t2, t3, 3);
 264 }
 265
 266
 267 template<int align>
 268 static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
 269                                                              SimdFInt32   offset,
 270                                                              SimdFloat*   v0,
 271                                                              SimdFloat*   v1,
 272                                                              SimdFloat*   v2,
 273                                                              SimdFloat*   v3)
 274 {
 275     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 276
 277     assert(std::size_t(base) % 16 == 0);
 278     assert(align % 4 == 0);
 279
 280     store(ioffset, offset);
 281     gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
 282 }
 283
 284 template<int align>
 285 static inline void gmx_simdcall
 286                    gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 287 {
 288     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 289
 290     store(ioffset, offset);
 291     gatherLoadTranspose<align>(base, ioffset, v0, v1);
 292 }
 293
 294
 295 template<int align>
 296 static inline void gmx_simdcall
 297                    gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 298 {
 299     alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
 300
 301     store(ioffset, offset);
 302     v0->simdInternal_ =
 303             vcombine_f32(vld1_f32(base + align * ioffset[0]), vld1_f32(base + align * ioffset[2]));
 304     v1->simdInternal_ =
 305             vcombine_f32(vld1_f32(base + align * ioffset[1]), vld1_f32(base + align * ioffset[3]));
 306     float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);
 307     v0->simdInternal_ = tmp.val[0];
 308     v1->simdInternal_ = tmp.val[1];
 309 }
 310
 311 static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
 312 {
 313     assert(std::size_t(m) % 16 == 0);
 314
 315     float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
 316     float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, v3.simdInternal_);
 317     float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
 318     float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
 319     v0.simdInternal_ = t2.val[0];
 320     v1.simdInternal_ = t3.val[0];
 321     v2.simdInternal_ = t2.val[1];
 322     v3.simdInternal_ = t3.val[1];
 323
 324     v0 = v0 + v1;
 325     v2 = v2 + v3;
 326     v0 = v0 + v2;
 327     v2 = v0 + simdLoad(m);
 328     store(m, v2);
 329
 330     return reduce(v0);
 331 }
 332
 333 } // namespace gmx
 334
 335 #endif // GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H