/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2020 Research Organization for Information Science and Technology (RIST).
 * Copyright (c) 2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*
 * armv8+sve support to GROMACS was contributed by the Research Organization for
 * Information Science and Technology (RIST).
 */

#ifndef GMX_SIMD_IMPL_ARM_SVE_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_ARM_SVE_UTIL_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <arm_sve.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_arm_sve_simd_double.h"

namespace gmx
{
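
// Helper: subtract both halves of the full-width vector a from the half-SIMD-width
// memory region at m, i.e. m[i] -= a[i] + a[i + GMX_SIMD_DOUBLE_WIDTH/2].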
static inline void gmx_simdcall decrHsimd(double* m, SimdDouble a)
{
    // Make sure the memory pointer is aligned to half of the double SIMD width
    assert(std::size_t(m) % 32 == 0);

    svbool_t    pg = svwhilelt_b64(0, (int32_t)GMX_SIMD_DOUBLE_WIDTH / 2);
    svfloat64_t v0, v1, v2, v3;
    v0 = svld1_f64(pg, m);
    v1 = svext_f64(a.simdInternal_, a.simdInternal_, GMX_SIMD_DOUBLE_WIDTH / 2);
    v2 = svadd_f64_x(pg, a.simdInternal_, v1);
    v3 = svsub_f64_x(pg, v0, v2);
    svst1_f64(pg, m, v3);
}
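
// Load 4 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets and
// transpose them into the four output vectors.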
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
                                                    const std::int32_t offset[],
                                                    SimdDouble*        v0,
                                                    SimdDouble*        v1,
                                                    SimdDouble*        v2,
                                                    SimdDouble*        v3)
{
    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 64 == 0);
    assert(align % 4 == 0);

    svint64_t offsets;
    svbool_t  pg = svptrue_b64();
    offsets = svmul_n_s64_x(
            pg,
            svunpklo_s64(svld1_s32(svwhilelt_b32(0, (int32_t)GMX_SIMD_DINT32_WIDTH), offset)),
            align * sizeof(double));
    v0->simdInternal_ = svld1_gather_s64offset_f64(pg, base, offsets);
    offsets           = svadd_n_s64_x(pg, offsets, sizeof(double));
    v1->simdInternal_ = svld1_gather_s64offset_f64(pg, base, offsets);
    offsets           = svadd_n_s64_x(pg, offsets, sizeof(double));
    v2->simdInternal_ = svld1_gather_s64offset_f64(pg, base, offsets);
    offsets           = svadd_n_s64_x(pg, offsets, sizeof(double));
    v3->simdInternal_ = svld1_gather_s64offset_f64(pg, base, offsets);
}
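
// Load 2 consecutive doubles from each offset held in a SIMD integer register,
// transposed into v0 and v1.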
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    // Base pointer must be aligned to the smaller of 2 elements and double SIMD width
    assert(std::size_t(base) % 8 == 0);
    // align parameter must also be a multiple of the above alignment requirement
    assert(align % 2 == 0);

    svbool_t  pg = svptrue_b64();
    svint64_t offsets;
    offsets           = svmul_n_s64_x(pg, offset.simdInternal_, align * sizeof(double));
    v0->simdInternal_ = svld1_gather_s64offset_f64(pg, base, offsets);
    offsets           = svadd_n_s64_x(pg, offsets, sizeof(double));
    v1->simdInternal_ = svld1_gather_s64offset_f64(pg, base, offsets);
}
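
// Two-vector variant: load the integer offsets from memory, then forward to the
// SIMD-integer gather above.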
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
{
    assert(std::size_t(offset) % 64 == 0);
    assert(std::size_t(base) % 8 == 0);
    assert(align % 2 == 0);

    SimdDInt32 offsets;
    svbool_t   pg         = svwhilelt_b32(0, (int32_t)GMX_SIMD_DINT32_WIDTH);
    offsets.simdInternal_ = svunpklo_s64(svld1_s32(pg, offset));
    gatherLoadBySimdIntTranspose<align>(base, offsets, v0, v1);
}
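
// Best alignment to use for aligned pairs of double data.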
static const int c_simdBestPairAlignmentDouble = 2;
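
// Unaligned-base variant of gatherLoadTranspose for three output vectors.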
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
                                                     const std::int32_t offset[],
                                                     SimdDouble*        v0,
                                                     SimdDouble*        v1,
                                                     SimdDouble*        v2)
{
    assert(std::size_t(offset) % 16 == 0);

    svint64_t offsets;
    svbool_t  pg = svptrue_b64();
    offsets = svmul_n_s64_x(
            pg,
            svunpklo_s64(svld1_s32(svwhilelt_b32(0, (int32_t)GMX_SIMD_DINT32_WIDTH), offset)),
            align * sizeof(double));
    v0->simdInternal_ = svld1_gather_s64offset_f64(pg, base, offsets);
    offsets           = svadd_n_s64_x(pg, offsets, sizeof(double));
    v1->simdInternal_ = svld1_gather_s64offset_f64(pg, base, offsets);
    offsets           = svadd_n_s64_x(pg, offsets, sizeof(double));
    v2->simdInternal_ = svld1_gather_s64offset_f64(pg, base, offsets);
}
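
// Transpose the three input vectors and scatter-store triplets of doubles to
// base + align * offset[i]; base does not need SIMD alignment.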
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(double*            base,
                                                       const std::int32_t offset[],
                                                       SimdDouble         v0,
                                                       SimdDouble         v1,
                                                       SimdDouble         v2)
{
    assert(std::size_t(offset) % 16 == 0);

    svint64_t offsets;
    svbool_t  pg = svptrue_b64();
    offsets = svmul_n_s64_x(
            pg,
            svunpklo_s64(svld1_s32(svwhilelt_b32(0, (int32_t)GMX_SIMD_DINT32_WIDTH), offset)),
            align * sizeof(double));
    svst1_scatter_s64offset_f64(pg, base, offsets, v0.simdInternal_);
    offsets = svadd_n_s64_x(pg, offsets, sizeof(double));
    svst1_scatter_s64offset_f64(pg, base, offsets, v1.simdInternal_);
    offsets = svadd_n_s64_x(pg, offsets, sizeof(double));
    svst1_scatter_s64offset_f64(pg, base, offsets, v2.simdInternal_);
}
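
// Transpose the three input vectors and add (increment) each triplet to memory
// at base + align * offset[i].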
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    assert(std::size_t(offset) % 32 == 0);

    svbool_t      pg = svptrue_b64();
    svfloat64x3_t v;
    alignas(GMX_SIMD_ALIGNMENT) double tvec[3 * GMX_SIMD_DOUBLE_WIDTH];
    v = svcreate3_f64(v0.simdInternal_, v1.simdInternal_, v2.simdInternal_);
    svst3_f64(pg, tvec, v);
#if GMX_SIMD_DOUBLE_WIDTH >= 3
    pg = svwhilelt_b64(0, 3);
    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        svfloat64_t t1 = svld1_f64(pg, base + align * offset[i]);
        svfloat64_t t2 = svld1_f64(pg, tvec + 3 * i);
        svfloat64_t t3 = svadd_f64_x(pg, t1, t2);
        svst1_f64(pg, base + align * offset[i], t3);
    }
#else
    for (std::size_t i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            base[align * offset[i] + j] += tvec[i * 3 + j];
        }
    }
#endif
}
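
// As transposeScatterIncrU, but subtracting the triplets from memory.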
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    assert(std::size_t(offset) % 16 == 0);

    svbool_t      pg = svptrue_b64();
    svfloat64x3_t v;
    alignas(GMX_SIMD_ALIGNMENT) double tvec[3 * GMX_SIMD_DOUBLE_WIDTH];
    v = svcreate3_f64(v0.simdInternal_, v1.simdInternal_, v2.simdInternal_);
    svst3_f64(pg, tvec, v);
#if GMX_SIMD_DOUBLE_WIDTH >= 3
    pg = svwhilelt_b64(0, 3);
    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        svfloat64_t t1 = svld1_f64(pg, base + align * offset[i]);
        svfloat64_t t2 = svld1_f64(pg, tvec + 3 * i);
        svfloat64_t t3 = svsub_f64_x(pg, t1, t2);
        svst1_f64(pg, base + align * offset[i], t3);
    }
#else
    for (std::size_t i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            base[align * offset[i] + j] -= tvec[i * 3 + j];
        }
    }
#endif
}
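
// Expand each scalar element into three consecutive copies, spread across the
// three output vectors: {s0,s0,s0,s1,...} laid out over triplets0..triplets2.
// The index table covers SIMD widths up to 16 elements, hence the assert.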
static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
                                                        SimdDouble* triplets0,
                                                        SimdDouble* triplets1,
                                                        SimdDouble* triplets2)
{
    assert(GMX_SIMD_DOUBLE_WIDTH <= 16);
    uint64_t ind[48] = { 0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,  4,  5,
                         5,  5,  6,  6,  6,  7,  7,  7,  8,  8,  8,  9,  9,  9,  10, 10,
                         10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15 };
    svbool_t   pg = svptrue_b64();
    svuint64_t idx;

    idx                      = svld1_u64(pg, ind);
    triplets0->simdInternal_ = svtbl_f64(scalar.simdInternal_, idx);
    idx                      = svld1_u64(pg, ind + GMX_SIMD_DOUBLE_WIDTH);
    triplets1->simdInternal_ = svtbl_f64(scalar.simdInternal_, idx);
    idx                      = svld1_u64(pg, ind + 2 * GMX_SIMD_DOUBLE_WIDTH);
    triplets2->simdInternal_ = svtbl_f64(scalar.simdInternal_, idx);
}
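
// Four-vector SIMD-integer-offset gather: spill the offsets to a temporary
// array and reuse the memory-offset transpose load above.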
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
                                                             SimdDInt32    offset,
                                                             SimdDouble*   v0,
                                                             SimdDouble*   v1,
                                                             SimdDouble*   v2,
                                                             SimdDouble*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];

    assert(std::size_t(base) % 16 == 0);
    assert(align % 4 == 0);

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}
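
// As the two-vector gatherLoadBySimdIntTranspose, but without alignment
// requirements on base.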
template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    svbool_t  pg      = svptrue_b64();
    svint64_t offsets = svmul_n_s64_x(pg, offset.simdInternal_, align * sizeof(double));
    v0->simdInternal_ = svld1_gather_s64offset_f64(pg, base, offsets);
    offsets           = svadd_n_s64_x(pg, offsets, sizeof(double));
    v1->simdInternal_ = svld1_gather_s64offset_f64(pg, base, offsets);
}
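
// Reduce each of the four input vectors, add the results to m[0..3], and
// return the total of all four reductions.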
static inline double gmx_simdcall
reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
{
    assert(std::size_t(m) % 16 == 0);
    svbool_t pg = svptrue_b64();

    double sum[4];
    sum[0] = svadda_f64(pg, 0.0, v0.simdInternal_);
    sum[1] = svadda_f64(pg, 0.0, v1.simdInternal_);
    sum[2] = svadda_f64(pg, 0.0, v2.simdInternal_);
    sum[3] = svadda_f64(pg, 0.0, v3.simdInternal_);
#if GMX_SIMD_DOUBLE_WIDTH >= 4
    svfloat64_t _m, _s;
    pg = svwhilelt_b64(0, 4);
    _m = svld1_f64(pg, m);
    _s = svld1_f64(pg, sum);
    svst1_f64(pg, m, svadd_f64_x(pg, _m, _s));
    return svadda_f64(pg, 0.0, _s);
#else
    double total = 0.0;
    for (int i = 0; i < 4; i++)
    {
        m[i] += sum[i];
        total += sum[i];
    }
    return total;
#endif
}
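
// Load half SIMD widths from m0 and m1 into the low and high halves of the result.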
static inline SimdDouble gmx_simdcall loadDualHsimd(const double* m0, const double* m1)
{
    svfloat64_t v0, v1;
    svbool_t    pg = svwhilelt_b64(0, (int32_t)GMX_SIMD_DOUBLE_WIDTH / 2);
    v0 = svld1_f64(pg, m0);
    v1 = svld1_f64(pg, m1);
    return { svsplice_f64(pg, v0, v1) };
}
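
// Load half a SIMD width from m and duplicate it into both halves of the result.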
static inline SimdDouble gmx_simdcall loadDuplicateHsimd(const double* m)
{
    svfloat64_t v;
    svbool_t    pg = svwhilelt_b64(0, (int32_t)GMX_SIMD_DOUBLE_WIDTH / 2);
    v = svld1_f64(pg, m);
    return { svsplice_f64(pg, v, v) };
}
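
// Broadcast m[0] into the low half and m[1] into the high half of the result.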
static inline SimdDouble gmx_simdcall loadU1DualHsimd(const double* m)
{
    svfloat64_t v0, v1;
    svbool_t    pg = svwhilelt_b64(0, (int32_t)GMX_SIMD_DOUBLE_WIDTH / 2);
    v0 = svdup_f64(m[0]);
    v1 = svdup_f64(m[1]);
    return { svsplice_f64(pg, v0, v1) };
}
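
// Store the low half of a to m0 and the high half to m1.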
static inline void gmx_simdcall storeDualHsimd(double* m0, double* m1, SimdDouble a)
{
    svbool_t pg = svwhilelt_b64(0, (int32_t)GMX_SIMD_DOUBLE_WIDTH / 2);
    svst1_f64(pg, m0, a.simdInternal_);
    // Invert the predicate so that the upper half of a is stored; the pointer
    // is biased backwards so the active (upper) lanes land at m1.
    pg = sveor_b_z(svptrue_b64(), pg, svptrue_b64());
    svst1_f64(pg, m1 - GMX_SIMD_DOUBLE_WIDTH / 2, a.simdInternal_);
}
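
// Add the low half of a to memory at m0 and the high half to memory at m1.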
static inline void gmx_simdcall incrDualHsimd(double* m0, double* m1, SimdDouble a)
{
    // Make sure the memory pointers are aligned to half of the double SIMD width
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    svbool_t    pg = svwhilelt_b64(0, (int32_t)GMX_SIMD_DOUBLE_WIDTH / 2);
    svfloat64_t v0, v2, v3;
    v0 = svld1_f64(pg, m0);
    v2 = svadd_f64_x(pg, v0, a.simdInternal_);
    svst1_f64(pg, m0, v2);
    v0 = svld1_f64(pg, m1);
    v3 = svext_f64(a.simdInternal_, a.simdInternal_, GMX_SIMD_DOUBLE_WIDTH / 2);
    v2 = svadd_f64_x(pg, v0, v3);
    svst1_f64(pg, m1, v2);
}
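
// Subtract three full-width vectors, viewed as six half-width chunks, from
// three consecutive half-widths of memory starting at m.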
static inline void gmx_simdcall decr3Hsimd(double* m, SimdDouble a0, SimdDouble a1, SimdDouble a2)
{
    decrHsimd(m, a0);
    decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH / 2, a1);
    decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH, a2);
}
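
// Reduce the low and high halves of v0 and v1 separately, add the four partial
// sums to m[0..3], and return their total.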
static inline double gmx_simdcall reduceIncr4ReturnSumHsimd(double* m, SimdDouble v0, SimdDouble v1)
{
    svbool_t pg = svwhilelt_b64(0, (int32_t)GMX_SIMD_DOUBLE_WIDTH / 2);
    double   sum[4];
    sum[0] = svadda_f64(pg, 0.0, v0.simdInternal_);
    sum[2] = svadda_f64(pg, 0.0, v1.simdInternal_);
    // Invert the predicate to reduce the upper halves
    pg     = sveor_b_z(svptrue_b64(), pg, svptrue_b64());
    sum[1] = svadda_f64(pg, 0.0, v0.simdInternal_);
    sum[3] = svadda_f64(pg, 0.0, v1.simdInternal_);

#if GMX_SIMD_DOUBLE_WIDTH >= 4
    svfloat64_t _m, _s;
    pg = svwhilelt_b64(0, 4);
    _m = svld1_f64(pg, m);
    _s = svld1_f64(pg, sum);
    svst1_f64(pg, m, svadd_f64_x(pg, _m, _s));
    return svadda_f64(pg, 0.0, _s);
#else
    double total = 0.0;
    for (int i = 0; i < 4; i++)
    {
        m[i] += sum[i];
        total += sum[i];
    }
    return total;
#endif
}
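
// Gather pairs of doubles from base0 and base1 using half as many offsets as
// the SIMD width; elements from base0 fill the low halves of the results and
// elements from base1 fill the high halves.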
template<int align>
static inline void gmx_simdcall gatherLoadTransposeHsimd(const double*      base0,
                                                         const double*      base1,
                                                         const std::int32_t offset[],
                                                         SimdDouble*        v0,
                                                         SimdDouble*        v1)
{
    svint64_t   offsets;
    svbool_t    pg = svwhilelt_b64(0, (int32_t)GMX_SIMD_DOUBLE_WIDTH / 2);
    svfloat64_t _v0, _v1;
    offsets = svmul_n_s64_x(
            pg,
            svunpklo_s64(svld1_s32(svwhilelt_b32(0, (int32_t)GMX_SIMD_DINT32_WIDTH / 2), offset)),
            align * sizeof(double));
    _v0               = svld1_gather_s64offset_f64(pg, base0, offsets);
    _v1               = svld1_gather_s64offset_f64(pg, base1, offsets);
    v0->simdInternal_ = svsplice_f64(pg, _v0, _v1);
    offsets           = svadd_n_s64_x(pg, offsets, sizeof(double));
    _v0               = svld1_gather_s64offset_f64(pg, base0, offsets);
    _v1               = svld1_gather_s64offset_f64(pg, base1, offsets);
    v1->simdInternal_ = svsplice_f64(pg, _v0, _v1);
}

} // namespace gmx

#endif // GMX_SIMD_IMPL_ARM_SVE_UTIL_DOUBLE_H