/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <arm_neon.h>

#include "impl_arm_neon_asimd_simd_double.h"

namespace gmx
{
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
                                                    const std::int32_t offset[],
                                                    SimdDouble*        v0,
                                                    SimdDouble*        v1,
                                                    SimdDouble*        v2,
                                                    SimdDouble*        v3)
{
    float64x2_t t1, t2, t3, t4;

    assert(std::size_t(offset) % 8 == 0);
    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    t1                = vld1q_f64(base + align * offset[0]);
    t2                = vld1q_f64(base + align * offset[1]);
    t3                = vld1q_f64(base + align * offset[0] + 2);
    t4                = vld1q_f64(base + align * offset[1] + 2);
    v0->simdInternal_ = vuzp1q_f64(t1, t2);
    v1->simdInternal_ = vuzp2q_f64(t1, t2);
    v2->simdInternal_ = vuzp1q_f64(t3, t4);
    v3->simdInternal_ = vuzp2q_f64(t3, t4);
}
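
/* A minimal usage sketch for the four-vector gather (illustrative only; the
 * array name xyzw and the data layout are assumptions, not part of this file).
 * With align = 4, four consecutive doubles per index are gathered for the two
 * offsets and transposed so each output holds one component from both rows:
 *
 *     alignas(16) double            xyzw[4 * 100]; // hypothetical packed data
 *     alignas(8) const std::int32_t idx[2] = { 3, 7 };
 *     SimdDouble                    x, y, z, w;
 *     gatherLoadTranspose<4>(xyzw, idx, &x, &y, &z, &w);
 *     // x now holds { xyzw[12], xyzw[28] }, i.e. element 0 of rows 3 and 7.
 */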
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
{
    float64x2_t t1, t2;

    assert(std::size_t(offset) % 8 == 0);
    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    t1                = vld1q_f64(base + align * offset[0]);
    t2                = vld1q_f64(base + align * offset[1]);
    v0->simdInternal_ = vuzp1q_f64(t1, t2);
    v1->simdInternal_ = vuzp2q_f64(t1, t2);
}
static const int c_simdBestPairAlignmentDouble = 2;
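
/* Sketch of how the pair variant and c_simdBestPairAlignmentDouble work
 * together (illustrative; the table name and indices are assumptions). Pairs
 * stored with this alignment are fetched with a single 128-bit load per row
 * before the transpose:
 *
 *     alignas(16) double            c6c12[2 * 64]; // hypothetical parameter pairs
 *     alignas(8) const std::int32_t idx[2] = { 0, 5 };
 *     SimdDouble                    c6, c12;
 *     gatherLoadTranspose<c_simdBestPairAlignmentDouble>(c6c12, idx, &c6, &c12);
 */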
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
                                                     const std::int32_t offset[],
                                                     SimdDouble*        v0,
                                                     SimdDouble*        v1,
                                                     SimdDouble*        v2)
{
    float64x2_t t1, t2;
    float64x1_t t3, t4;

    assert(std::size_t(offset) % 8 == 0);

    t1                = vld1q_f64(base + align * offset[0]);
    t2                = vld1q_f64(base + align * offset[1]);
    t3                = vld1_f64(base + align * offset[0] + 2);
    t4                = vld1_f64(base + align * offset[1] + 2);
    v0->simdInternal_ = vuzp1q_f64(t1, t2);
    v1->simdInternal_ = vuzp2q_f64(t1, t2);
    v2->simdInternal_ = vcombine_f64(t3, t4);
}
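
/* Unaligned-gather sketch (illustrative; the rvec-style layout is an
 * assumption). Only three doubles are read per index: the third components
 * are loaded as 64-bit scalars and combined, so base needs no particular
 * alignment:
 *
 *     double                        xyz[3 * 100]; // hypothetical x,y,z triplets
 *     alignas(8) const std::int32_t idx[2] = { 1, 4 };
 *     SimdDouble                    x, y, z;
 *     gatherLoadUTranspose<3>(xyz, idx, &x, &y, &z);
 */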
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(double*            base,
                                                       const std::int32_t offset[],
                                                       SimdDouble         v0,
                                                       SimdDouble         v1,
                                                       SimdDouble         v2)
{
    float64x2_t t0, t1;

    assert(std::size_t(offset) % 8 == 0);

    t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_);
    t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_);
    vst1q_f64(base + align * offset[0], t0);
    vst1q_f64(base + align * offset[1], t1);
    vst1_f64(base + align * offset[0] + 2, vget_low_f64(v2.simdInternal_));
    vst1_f64(base + align * offset[1] + 2, vget_high_f64(v2.simdInternal_));
}
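
/* Scatter-store sketch, the inverse of the unaligned gather above
 * (illustrative only; fx/fy/fz and the array name are assumptions):
 *
 *     transposeScatterStoreU<3>(forces, idx, fx, fy, fz);
 *
 * The third component is written as two 64-bit stores, so exactly three
 * doubles per index are touched and neighbouring data is left intact.
 */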
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    float64x2_t t0, t1, t2;
    float64x1_t t3;

    assert(std::size_t(offset) % 8 == 0);

    t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0
    t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1

    t2 = vld1q_f64(base + align * offset[0]);
    t2 = vaddq_f64(t2, t0);
    vst1q_f64(base + align * offset[0], t2);

    t3 = vld1_f64(base + align * offset[0] + 2);
    t3 = vadd_f64(t3, vget_low_f64(v2.simdInternal_));
    vst1_f64(base + align * offset[0] + 2, t3);

    t2 = vld1q_f64(base + align * offset[1]);
    t2 = vaddq_f64(t2, t1);
    vst1q_f64(base + align * offset[1], t2);

    t3 = vld1_f64(base + align * offset[1] + 2);
    t3 = vadd_f64(t3, vget_high_f64(v2.simdInternal_));
    vst1_f64(base + align * offset[1] + 2, t3);
}
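
/* Increment-scatter sketch (illustrative; the accumulation pattern mirrors
 * how kernels typically add partial results back to memory, but the array
 * and index names are assumptions):
 *
 *     transposeScatterIncrU<3>(forces, idx, fx, fy, fz);
 *     // forces[3 * idx[i] + d] += lane i of { fx, fy, fz }[d], for i = 0, 1.
 */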
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    float64x2_t t0, t1, t2;
    float64x1_t t3;

    assert(std::size_t(offset) % 8 == 0);

    t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0
    t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1

    t2 = vld1q_f64(base + align * offset[0]);
    t2 = vsubq_f64(t2, t0);
    vst1q_f64(base + align * offset[0], t2);

    t3 = vld1_f64(base + align * offset[0] + 2);
    t3 = vsub_f64(t3, vget_low_f64(v2.simdInternal_));
    vst1_f64(base + align * offset[0] + 2, t3);

    t2 = vld1q_f64(base + align * offset[1]);
    t2 = vsubq_f64(t2, t1);
    vst1q_f64(base + align * offset[1], t2);

    t3 = vld1_f64(base + align * offset[1] + 2);
    t3 = vsub_f64(t3, vget_high_f64(v2.simdInternal_));
    vst1_f64(base + align * offset[1] + 2, t3);
}
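
/* The decrement variant is the exact mirror of transposeScatterIncrU: the
 * same read-modify-write sequence with vsubq_f64/vsub_f64, so (illustrative)
 * transposeScatterDecrU<3>(forces, idx, fx, fy, fz) subtracts the same lanes
 * the increment form would add.
 */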
static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
                                                        SimdDouble* triplets0,
                                                        SimdDouble* triplets1,
                                                        SimdDouble* triplets2)
{
    triplets0->simdInternal_ = vuzp1q_f64(scalar.simdInternal_, scalar.simdInternal_);
    triplets1->simdInternal_ = scalar.simdInternal_;
    triplets2->simdInternal_ = vuzp2q_f64(scalar.simdInternal_, scalar.simdInternal_);
}
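
/* With a two-lane SIMD width, expanding scalars { a, b } yields { a, a },
 * { a, b }, { b, b }: read consecutively that is a,a,a,b,b,b, i.e. each
 * scalar repeated three times. A sketch (values are illustrative):
 *
 *     SimdDouble s;                 // lanes { a, b }, hypothetical
 *     SimdDouble t0, t1, t2;
 *     expandScalarsToTriplets(s, &t0, &t1, &t2);
 *     // t0 = { a, a }, t1 = { a, b }, t2 = { b, b }
 */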
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
                                                             SimdDInt32    offset,
                                                             SimdDouble*   v0,
                                                             SimdDouble*   v1,
                                                             SimdDouble*   v2,
                                                             SimdDouble*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];

    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    vst1_s32(ioffset, offset.simdInternal_);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];

    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);

    vst1_s32(ioffset, offset.simdInternal_);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
    float64x2_t                              t1, t2;

    vst1_s32(ioffset, offset.simdInternal_);

    t1                = vld1q_f64(base + align * ioffset[0]);
    t2                = vld1q_f64(base + align * ioffset[1]);
    v0->simdInternal_ = vuzp1q_f64(t1, t2);
    v1->simdInternal_ = vuzp2q_f64(t1, t2);
}
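
/* The SimdDInt32-offset variants only differ in how the indices arrive: they
 * are spilled from the integer SIMD register into an aligned scratch array
 * and then handled like the pointer-offset forms. A sketch (illustrative;
 * the table name is an assumption):
 *
 *     SimdDInt32 idx;               // two 32-bit indices, hypothetical
 *     SimdDouble v0, v1;
 *     gatherLoadUBySimdIntTranspose<2>(table, idx, &v0, &v1);
 */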
static inline double gmx_simdcall
reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
{
    float64x2_t t1, t2, t3, t4;

    assert(std::size_t(m) % 8 == 0);

    t1 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_);
    t2 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_);
    t3 = vuzp1q_f64(v2.simdInternal_, v3.simdInternal_);
    t4 = vuzp2q_f64(v2.simdInternal_, v3.simdInternal_);

    t1 = vaddq_f64(t1, t2); // { sum(v0), sum(v1) }
    t3 = vaddq_f64(t3, t4); // { sum(v2), sum(v3) }

    t2 = vaddq_f64(t1, vld1q_f64(m));
    t4 = vaddq_f64(t3, vld1q_f64(m + 2));
    vst1q_f64(m, t2);
    vst1q_f64(m + 2, t4);

    t1 = vaddq_f64(t1, t3);
    t2 = vpaddq_f64(t1, t1);

    return vgetq_lane_f64(t2, 0);
}
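
/* Reduction sketch (illustrative): after the four horizontal sums, m[0..3]
 * have been incremented by the per-vector totals and the grand total is
 * returned:
 *
 *     alignas(16) double m[4] = { 0.0, 0.0, 0.0, 0.0 };
 *     double total = reduceIncr4ReturnSum(m, fx, fy, fz, fw);
 *     // m[0] += sum(fx lanes), ..., m[3] += sum(fw lanes);
 *     // total == sum of all four vector sums.
 */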
} // namespace gmx

#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H