/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <arm_neon.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_arm_neon_simd_float.h"

namespace gmx
{
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
                                                    const std::int32_t offset[],
                                                    SimdFloat*         v0,
                                                    SimdFloat*         v1,
                                                    SimdFloat*         v2,
                                                    SimdFloat*         v3)
{
    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 16 == 0);
    assert(align % 4 == 0);

    // Unfortunately we cannot use the beautiful Neon structured load
    // instructions, since the data comes from four different memory locations.
    float32x4x2_t t0 =
            vuzpq_f32(vld1q_f32(base + align * offset[0]), vld1q_f32(base + align * offset[2]));
    float32x4x2_t t1 =
            vuzpq_f32(vld1q_f32(base + align * offset[1]), vld1q_f32(base + align * offset[3]));
    float32x4x2_t t2  = vtrnq_f32(t0.val[0], t1.val[0]);
    float32x4x2_t t3  = vtrnq_f32(t0.val[1], t1.val[1]);
    v0->simdInternal_ = t2.val[0];
    v1->simdInternal_ = t3.val[0];
    v2->simdInternal_ = t2.val[1];
    v3->simdInternal_ = t3.val[1];
}
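// A minimal usage sketch for the four-way gather above (the array name,
// contents, and offsets here are illustrative assumptions, not part of
// this interface):
//
//     alignas(16) const float  xyzq[16] = {};  // 4 floats per particle
//     alignas(16) std::int32_t idx[4]   = { 0, 1, 2, 3 };
//     SimdFloat x, y, z, q;
//     gatherLoadTranspose<4>(xyzq, idx, &x, &y, &z, &q);
//     // x now holds { x0, x1, x2, x3 }, and likewise for y, z, and q.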
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
{
    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 8 == 0);
    assert(align % 2 == 0);

    v0->simdInternal_ =
            vcombine_f32(vld1_f32(base + align * offset[0]), vld1_f32(base + align * offset[2]));
    v1->simdInternal_ =
            vcombine_f32(vld1_f32(base + align * offset[1]), vld1_f32(base + align * offset[3]));

    float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);

    v0->simdInternal_ = tmp.val[0];
    v1->simdInternal_ = tmp.val[1];
}
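// Preferred value of the align template parameter when gathering pairs with
// the two-output gatherLoadTranspose above: with a stride of 2 floats, each
// pair is fetched by a single 64-bit vld1_f32.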
static const int c_simdBestPairAlignmentFloat = 2;
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
                                                     const std::int32_t offset[],
                                                     SimdFloat*         v0,
                                                     SimdFloat*         v1,
                                                     SimdFloat*         v2)
{
    assert(std::size_t(offset) % 16 == 0);

    float32x4x2_t t0 =
            vuzpq_f32(vld1q_f32(base + align * offset[0]), vld1q_f32(base + align * offset[2]));
    float32x4x2_t t1 =
            vuzpq_f32(vld1q_f32(base + align * offset[1]), vld1q_f32(base + align * offset[3]));
    float32x4x2_t t2  = vtrnq_f32(t0.val[0], t1.val[0]);
    float32x4x2_t t3  = vtrnq_f32(t0.val[1], t1.val[1]);
    v0->simdInternal_ = t2.val[0];
    v1->simdInternal_ = t3.val[0];
    v2->simdInternal_ = t2.val[1];
}
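// As in the four-output gather above, vuzpq_f32 splits even/odd lanes and
// vtrnq_f32 interleaves the halves, which together amount to a 4x4
// transpose; the fourth column (t3.val[1]) is simply discarded here.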
template<int align>
static inline void gmx_simdcall
transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    assert(std::size_t(offset) % 16 == 0);

    float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);

    vst1_f32(base + align * offset[0], vget_low_f32(tmp.val[0]));
    vst1_f32(base + align * offset[1], vget_low_f32(tmp.val[1]));
    vst1_f32(base + align * offset[2], vget_high_f32(tmp.val[0]));
    vst1_f32(base + align * offset[3], vget_high_f32(tmp.val[1]));

    vst1q_lane_f32(base + align * offset[0] + 2, v2.simdInternal_, 0);
    vst1q_lane_f32(base + align * offset[1] + 2, v2.simdInternal_, 1);
    vst1q_lane_f32(base + align * offset[2] + 2, v2.simdInternal_, 2);
    vst1q_lane_f32(base + align * offset[3] + 2, v2.simdInternal_, 3);
}
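// The scatter above runs in two steps: vtrnq_f32 pairs up the (v0, v1)
// lanes so that each 64-bit vst1_f32 writes an {x, y} pair, after which
// vst1q_lane_f32 writes the remaining z element of each triplet one lane
// at a time.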
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    assert(std::size_t(offset) % 16 == 0);

    if (align < 4)
    {
        float32x2_t   t0, t1, t2, t3;
        float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);

        t0 = vget_low_f32(tmp.val[0]);
        t1 = vget_low_f32(tmp.val[1]);
        t2 = vget_high_f32(tmp.val[0]);
        t3 = vget_high_f32(tmp.val[1]);

        t0 = vadd_f32(t0, vld1_f32(base + align * offset[0]));
        vst1_f32(base + align * offset[0], t0);
        base[align * offset[0] + 2] += vgetq_lane_f32(v2.simdInternal_, 0);

        t1 = vadd_f32(t1, vld1_f32(base + align * offset[1]));
        vst1_f32(base + align * offset[1], t1);
        base[align * offset[1] + 2] += vgetq_lane_f32(v2.simdInternal_, 1);

        t2 = vadd_f32(t2, vld1_f32(base + align * offset[2]));
        vst1_f32(base + align * offset[2], t2);
        base[align * offset[2] + 2] += vgetq_lane_f32(v2.simdInternal_, 2);

        t3 = vadd_f32(t3, vld1_f32(base + align * offset[3]));
        vst1_f32(base + align * offset[3], t3);
        base[align * offset[3] + 2] += vgetq_lane_f32(v2.simdInternal_, 3);
    }
    else
    {
        // Extra elements mean we can use full width-4 load/store operations
        float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
        float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
        float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
        float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
        float32x4_t   t4 = t2.val[0];
        float32x4_t   t5 = t3.val[0];
        float32x4_t   t6 = t2.val[1];
        float32x4_t   t7 = t3.val[1];

        vst1q_f32(base + align * offset[0], vaddq_f32(t4, vld1q_f32(base + align * offset[0])));
        vst1q_f32(base + align * offset[1], vaddq_f32(t5, vld1q_f32(base + align * offset[1])));
        vst1q_f32(base + align * offset[2], vaddq_f32(t6, vld1q_f32(base + align * offset[2])));
        vst1q_f32(base + align * offset[3], vaddq_f32(t7, vld1q_f32(base + align * offset[3])));
    }
}
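// Semantically, for each j in 0..3 the function above performs
//
//     base[align * offset[j] + 0] += v0[j];
//     base[align * offset[j] + 1] += v1[j];
//     base[align * offset[j] + 2] += v2[j];
//
// e.g. accumulating x/y/z force components into a scattered force array.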
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    assert(std::size_t(offset) % 16 == 0);

    if (align < 4)
    {
        float32x2_t   t0, t1, t2, t3;
        float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);

        t0 = vget_low_f32(tmp.val[0]);
        t1 = vget_low_f32(tmp.val[1]);
        t2 = vget_high_f32(tmp.val[0]);
        t3 = vget_high_f32(tmp.val[1]);

        t0 = vsub_f32(vld1_f32(base + align * offset[0]), t0);
        vst1_f32(base + align * offset[0], t0);
        base[align * offset[0] + 2] -= vgetq_lane_f32(v2.simdInternal_, 0);

        t1 = vsub_f32(vld1_f32(base + align * offset[1]), t1);
        vst1_f32(base + align * offset[1], t1);
        base[align * offset[1] + 2] -= vgetq_lane_f32(v2.simdInternal_, 1);

        t2 = vsub_f32(vld1_f32(base + align * offset[2]), t2);
        vst1_f32(base + align * offset[2], t2);
        base[align * offset[2] + 2] -= vgetq_lane_f32(v2.simdInternal_, 2);

        t3 = vsub_f32(vld1_f32(base + align * offset[3]), t3);
        vst1_f32(base + align * offset[3], t3);
        base[align * offset[3] + 2] -= vgetq_lane_f32(v2.simdInternal_, 3);
    }
    else
    {
        // Extra elements mean we can use full width-4 load/store operations
        float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
        float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
        float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
        float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
        float32x4_t   t4 = t2.val[0];
        float32x4_t   t5 = t3.val[0];
        float32x4_t   t6 = t2.val[1];
        float32x4_t   t7 = t3.val[1];

        vst1q_f32(base + align * offset[0], vsubq_f32(vld1q_f32(base + align * offset[0]), t4));
        vst1q_f32(base + align * offset[1], vsubq_f32(vld1q_f32(base + align * offset[1]), t5));
        vst1q_f32(base + align * offset[2], vsubq_f32(vld1q_f32(base + align * offset[2]), t6));
        vst1q_f32(base + align * offset[3], vsubq_f32(vld1q_f32(base + align * offset[3]), t7));
    }
}
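// This mirrors transposeScatterIncrU above with subtraction instead of
// addition. In the padded (align >= 4) branch the fourth lane subtracts an
// explicit 0.0F, so the padding element is read and written back unchanged.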
static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
                                                        SimdFloat* triplets0,
                                                        SimdFloat* triplets1,
                                                        SimdFloat* triplets2)
{
    float32x2_t lo, hi;
    float32x4_t t0, t1, t2, t3;

    lo = vget_low_f32(scalar.simdInternal_);
    hi = vget_high_f32(scalar.simdInternal_);

    t0 = vdupq_lane_f32(lo, 0);
    t1 = vdupq_lane_f32(lo, 1);
    t2 = vdupq_lane_f32(hi, 0);
    t3 = vdupq_lane_f32(hi, 1);

    triplets0->simdInternal_ = vextq_f32(t0, t1, 1);
    triplets1->simdInternal_ = vextq_f32(t1, t2, 2);
    triplets2->simdInternal_ = vextq_f32(t2, t3, 3);
}
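// A worked example of the lane layout above: for scalar = { a, b, c, d },
// each input element is broadcast (vdupq_lane_f32) and the broadcasts are
// stitched together with vextq_f32, giving
//
//     triplets0 = { a, a, a, b }
//     triplets1 = { b, b, c, c }
//     triplets2 = { c, d, d, d }
//
// i.e. every scalar is repeated three times across the three outputs.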
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
                                                             SimdFInt32   offset,
                                                             SimdFloat*   v0,
                                                             SimdFloat*   v1,
                                                             SimdFloat*   v2,
                                                             SimdFloat*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    assert(std::size_t(base) % 16 == 0);
    assert(align % 4 == 0);

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
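// Neon has no hardware gather for these BySimdInt variants to map to, so
// both wrappers above (and the unaligned variant below) spill the SIMD
// integer offsets to an aligned stack buffer with store() and then index
// with plain scalar-offset loads.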
template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    v0->simdInternal_ =
            vcombine_f32(vld1_f32(base + align * ioffset[0]), vld1_f32(base + align * ioffset[2]));
    v1->simdInternal_ =
            vcombine_f32(vld1_f32(base + align * ioffset[1]), vld1_f32(base + align * ioffset[3]));

    float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);

    v0->simdInternal_ = tmp.val[0];
    v1->simdInternal_ = tmp.val[1];
}
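// reduceIncr4ReturnSum below first transposes its four inputs so that lane j
// of the running sum holds the full horizontal sum of input vector vj; it
// then increments m[0..3] by those four per-vector sums and returns their
// grand total.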
static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
{
    assert(std::size_t(m) % 16 == 0);

    float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
    float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, v3.simdInternal_);
    float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
    float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
    v0.simdInternal_ = t2.val[0];
    v1.simdInternal_ = t3.val[0];
    v2.simdInternal_ = t2.val[1];
    v3.simdInternal_ = t3.val[1];

    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}
} // namespace gmx

#endif // GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H