2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H
37 #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H
47 #include "gromacs/utility/basedefinitions.h"
49 #include "impl_arm_neon_asimd_simd_double.h"
// Loads four consecutive doubles from each of two memory locations and
// transposes them into four 2-lane SIMD outputs.
//
// The two source locations are base + align * offset[0] and
// base + align * offset[1]. On return, lane 0 of v0, v1, v2, v3 holds
// elements 0..3 read from the first location and lane 1 holds elements 0..3
// read from the second location.
//
// The asserts require the offset array to be 8-byte aligned, base to be
// 16-byte aligned and align to be even.
//
// NOTE(review): the declarations of align and v0..v3 are not part of this
// listing. align is presumably a template parameter (callers in this file use
// gatherLoadTranspose<align>) and v0..v3 are accessed through "->", so they
// are presumably SimdDouble pointers -- confirm against the full header.
55 static gmx_inline void gmx_simdcall
56 gatherLoadTranspose(const double * base,
57 const std::int32_t offset[],
63 float64x2_t t1, t2, t3, t4;
65 assert(std::size_t(offset) % 8 == 0);
66 assert(std::size_t(base) % 16 == 0);
67 assert(align % 2 == 0);
// t1/t2: elements 0-1 of the first/second location.
69 t1 = vld1q_f64(base + align * offset[0]);
70 t2 = vld1q_f64(base + align * offset[1]);
// t3/t4: elements 2-3 of the first/second location.
71 t3 = vld1q_f64(base + align * offset[0] + 2);
72 t4 = vld1q_f64(base + align * offset[1] + 2);
// Transpose: vuzp1q takes lane 0 of both inputs, vuzp2q takes lane 1.
73 v0->simdInternal_ = vuzp1q_f64(t1, t2);
74 v1->simdInternal_ = vuzp2q_f64(t1, t2);
75 v2->simdInternal_ = vuzp1q_f64(t3, t4);
76 v3->simdInternal_ = vuzp2q_f64(t3, t4);
// Two-output variant: loads two consecutive doubles from each of two memory
// locations (base + align * offset[0] and base + align * offset[1]) and
// transposes them, so that v0 holds element 0 of both locations and v1 holds
// element 1 of both locations (lane 0 = first location, lane 1 = second).
//
// The asserts require the offset array to be 8-byte aligned, base to be
// 16-byte aligned and align to be even.
//
// NOTE(review): the declarations of align, v0, v1 and the temporaries t1/t2
// are not part of this listing -- confirm against the full header.
80 static gmx_inline void gmx_simdcall
81 gatherLoadTranspose(const double * base,
82 const std::int32_t offset[],
88 assert(std::size_t(offset) % 8 == 0);
89 assert(std::size_t(base) % 16 == 0);
90 assert(align % 2 == 0);
// One 2-double load per location.
92 t1 = vld1q_f64(base + align * offset[0]);
93 t2 = vld1q_f64(base + align * offset[1]);
// vuzp1q gathers lane 0 of t1 and t2, vuzp2q gathers lane 1.
94 v0->simdInternal_ = vuzp1q_f64(t1, t2);
95 v1->simdInternal_ = vuzp2q_f64(t1, t2);
// Pair-alignment constant for double precision; its value is 2.
// NOTE(review): presumably the preferred "align" stride (in elements) for the
// paired gather loads in this file -- confirm against the callers.
98 static const int c_simdBestPairAlignmentDouble = 2;
// Loads three consecutive doubles from each of two memory locations
// (base + align * offset[0] and base + align * offset[1]) and transposes them
// into three 2-lane SIMD outputs: v0, v1 and v2 receive elements 0, 1 and 2
// respectively, with lane 0 taken from the first location and lane 1 from the
// second.
//
// Only the alignment of the offset array (8 bytes) is asserted here; unlike
// the gatherLoadTranspose overloads, no alignment of base or align is checked.
//
// NOTE(review): the declarations of align, v0..v2 and t1..t4 are not part of
// this listing; the vld1_f64/vcombine_f64 usage implies t3/t4 are single-lane
// (float64x1_t) values -- confirm against the full header.
101 static gmx_inline void gmx_simdcall
102 gatherLoadUTranspose(const double * base,
103 const std::int32_t offset[],
111 assert(std::size_t(offset) % 8 == 0);
// Elements 0-1 of each location as a 2-lane load.
113 t1 = vld1q_f64(base + align * offset[0]);
114 t2 = vld1q_f64(base + align * offset[1]);
// Element 2 of each location as a single-lane load, so nothing past the
// third element is read.
115 t3 = vld1_f64(base + align * offset[0] + 2);
116 t4 = vld1_f64(base + align * offset[1] + 2);
117 v0->simdInternal_ = vuzp1q_f64(t1, t2);
118 v1->simdInternal_ = vuzp2q_f64(t1, t2);
// Lane 0 = third element of the first location, lane 1 = of the second.
119 v2->simdInternal_ = vcombine_f64(t3, t4);
// Transposes three 2-lane SIMD values and stores them as two triplets.
//
// Lane 0 of v0, v1, v2 is written to the three consecutive doubles starting
// at base + align * offset[0]; lane 1 of v0, v1, v2 is written to the three
// consecutive doubles starting at base + align * offset[1]. Exactly three
// doubles are written per location.
//
// Only the alignment of the offset array (8 bytes) is asserted.
//
// NOTE(review): the declarations of align, v0..v2 and t0/t1 are not part of
// this listing -- confirm against the full header.
123 static gmx_inline void gmx_simdcall
124 transposeScatterStoreU(double * base,
125 const std::int32_t offset[],
132 assert(std::size_t(offset) % 8 == 0);
// t0 = { v0[0], v1[0] }, t1 = { v0[1], v1[1] }.
134 t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_);
135 t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_);
// First two elements of each triplet.
136 vst1q_f64(base + align * offset[0], t0);
137 vst1q_f64(base + align * offset[1], t1);
// Third element of each triplet, stored as a single double.
138 vst1_f64(base + align * offset[0] + 2, vget_low_f64(v2.simdInternal_));
139 vst1_f64(base + align * offset[1] + 2, vget_high_f64(v2.simdInternal_));
// Transposes three 2-lane SIMD values and adds them to two triplets in memory.
//
// Lane 0 of v0, v1, v2 is added to the three consecutive doubles starting at
// base + align * offset[0]; lane 1 of v0, v1, v2 is added to the three
// consecutive doubles starting at base + align * offset[1].
//
// Each destination is loaded, incremented and stored before the next one is
// touched, so the update for offset[1] reads memory after the update for
// offset[0] has been written.
//
// Only the alignment of the offset array (8 bytes) is asserted.
//
// NOTE(review): the declarations of align, v0..v2 and t3 are not part of this
// listing; the vld1_f64/vadd_f64 usage implies t3 is a single-lane
// (float64x1_t) value -- confirm against the full header.
143 static gmx_inline void gmx_simdcall
144 transposeScatterIncrU(double * base,
145 const std::int32_t offset[],
150 float64x2_t t0, t1, t2;
153 assert(std::size_t(offset) % 8 == 0);
155 t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0
156 t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1
// First triplet, elements 0-1: load, add, store.
158 t2 = vld1q_f64(base + align * offset[0]);
159 t2 = vaddq_f64(t2, t0);
160 vst1q_f64(base + align * offset[0], t2);
// First triplet, element 2 (lane 0 of v2).
162 t3 = vld1_f64(base + align * offset[0] + 2);
163 t3 = vadd_f64(t3, vget_low_f64(v2.simdInternal_));
164 vst1_f64(base + align * offset[0] + 2, t3);
// Second triplet, elements 0-1.
166 t2 = vld1q_f64(base + align * offset[1]);
167 t2 = vaddq_f64(t2, t1);
168 vst1q_f64(base + align * offset[1], t2);
// Second triplet, element 2 (lane 1 of v2).
170 t3 = vld1_f64(base + align * offset[1] + 2);
171 t3 = vadd_f64(t3, vget_high_f64(v2.simdInternal_));
172 vst1_f64(base + align * offset[1] + 2, t3);
// Transposes three 2-lane SIMD values and subtracts them from two triplets in
// memory.
//
// Lane 0 of v0, v1, v2 is subtracted from the three consecutive doubles
// starting at base + align * offset[0]; lane 1 of v0, v1, v2 is subtracted
// from the three consecutive doubles starting at base + align * offset[1].
//
// Each destination is loaded, decremented and stored before the next one is
// touched, so the update for offset[1] reads memory after the update for
// offset[0] has been written.
//
// Only the alignment of the offset array (8 bytes) is asserted.
//
// NOTE(review): the declarations of align, v0..v2 and t3 are not part of this
// listing; the vld1_f64/vsub_f64 usage implies t3 is a single-lane
// (float64x1_t) value -- confirm against the full header.
176 static gmx_inline void gmx_simdcall
177 transposeScatterDecrU(double * base,
178 const std::int32_t offset[],
183 float64x2_t t0, t1, t2;
186 assert(std::size_t(offset) % 8 == 0);
188 t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0
189 t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1
// First triplet, elements 0-1: load, subtract, store.
191 t2 = vld1q_f64(base + align * offset[0]);
192 t2 = vsubq_f64(t2, t0);
193 vst1q_f64(base + align * offset[0], t2);
// First triplet, element 2 (lane 0 of v2).
195 t3 = vld1_f64(base + align * offset[0] + 2);
196 t3 = vsub_f64(t3, vget_low_f64(v2.simdInternal_));
197 vst1_f64(base + align * offset[0] + 2, t3);
// Second triplet, elements 0-1.
199 t2 = vld1q_f64(base + align * offset[1]);
200 t2 = vsubq_f64(t2, t1);
201 vst1q_f64(base + align * offset[1], t2);
// Second triplet, element 2 (lane 1 of v2).
203 t3 = vld1_f64(base + align * offset[1] + 2);
204 t3 = vsub_f64(t3, vget_high_f64(v2.simdInternal_));
205 vst1_f64(base + align * offset[1] + 2, t3);
// Expands the two lanes of scalar into three SIMD values in which each input
// element is repeated three times in a row.
//
// With scalar = { s0, s1 } the outputs are
//   triplets0 = { s0, s0 },  triplets1 = { s0, s1 },  triplets2 = { s1, s1 },
// i.e. read back to back they form s0 s0 s0 s1 s1 s1.
208 static gmx_inline void gmx_simdcall
209 expandScalarsToTriplets(SimdDouble scalar,
210 SimdDouble * triplets0,
211 SimdDouble * triplets1,
212 SimdDouble * triplets2)
// vuzp1q of a vector with itself duplicates lane 0.
214 triplets0->simdInternal_ = vuzp1q_f64(scalar.simdInternal_, scalar.simdInternal_);
215 triplets1->simdInternal_ = scalar.simdInternal_;
// vuzp2q of a vector with itself duplicates lane 1.
216 triplets2->simdInternal_ = vuzp2q_f64(scalar.simdInternal_, scalar.simdInternal_);
// Four-output gather-load where the offsets are supplied in a SIMD integer
// register instead of an array in memory.
//
// The offsets are spilled to an aligned local array and the work is delegated
// to the four-output gatherLoadTranspose<align>() with the same base and
// output arguments.
//
// The asserts require base to be 16-byte aligned and align to be even.
//
// NOTE(review): the declarations of align, offset and v0..v3 are not part of
// this listing; offset exposes a simdInternal_ member that vst1_s32 accepts,
// so it presumably wraps an int32x2_t -- confirm against the full header.
220 static gmx_inline void gmx_simdcall
221 gatherLoadBySimdIntTranspose(const double * base,
// Scratch array that receives the offsets so they can be passed as a plain
// std::int32_t array.
228 alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
230 assert(std::size_t(base) % 16 == 0);
231 assert(align % 2 == 0);
// vst1_s32 writes two 32-bit integers to ioffset[0..1].
233 vst1_s32(ioffset, offset.simdInternal_);
234 gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
// Two-output gather-load where the offsets are supplied in a SIMD integer
// register instead of an array in memory.
//
// The offsets are spilled to an aligned local array and the work is delegated
// to the two-output gatherLoadTranspose<align>() with the same base and
// output arguments.
//
// The asserts require base to be 16-byte aligned and align to be even.
//
// NOTE(review): the declarations of align, offset, v0 and v1 are not part of
// this listing -- confirm against the full header.
239 static gmx_inline void gmx_simdcall
240 gatherLoadBySimdIntTranspose(const double * base,
// Scratch array that receives the offsets so they can be passed as a plain
// std::int32_t array.
245 alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
247 assert(std::size_t(base) % 16 == 0);
248 assert(align % 2 == 0);
// vst1_s32 writes two 32-bit integers to ioffset[0..1].
250 vst1_s32(ioffset, offset.simdInternal_);
251 gatherLoadTranspose<align>(base, ioffset, v0, v1);
// Two-output gather-load with offsets supplied in a SIMD integer register and
// without any alignment assertions on base or align.
//
// The offsets are spilled to a local array; two consecutive doubles are then
// loaded from base + align * ioffset[0] and base + align * ioffset[1] and
// transposed so that v0 holds element 0 of both locations and v1 holds
// element 1 of both locations (lane 0 = first location, lane 1 = second).
//
// NOTE(review): the declarations of align, offset, v0, v1 and t1/t2 are not
// part of this listing -- confirm against the full header.
255 static gmx_inline void gmx_simdcall
256 gatherLoadUBySimdIntTranspose(const double * base,
261 alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];
// vst1_s32 writes two 32-bit integers to ioffset[0..1].
263 vst1_s32(ioffset, offset.simdInternal_);
267 t1 = vld1q_f64(base + align * ioffset[0]);
268 t2 = vld1q_f64(base + align * ioffset[1]);
// vuzp1q gathers lane 0 of t1 and t2, vuzp2q gathers lane 1.
269 v0->simdInternal_ = vuzp1q_f64(t1, t2);
270 v1->simdInternal_ = vuzp2q_f64(t1, t2);
// Horizontally sums each of v0..v3, adds the per-vector sums to the four
// doubles at m, and returns the sum over all lanes of all four inputs.
//
// The assert requires m to be 8-byte aligned.
//
// NOTE(review): in the lines visible here only m[2] and m[3] are written
// back. The incremented values for m[0] and m[1] are computed into t2 but
// never stored; t2 is overwritten by the pairwise add further down. A
// "vst1q_f64(m, t2);" immediately before the store to m + 2 appears to be
// missing (the embedded listing numbers jump from 294 to 296 at that point)
// -- confirm against the full header and restore it if it is absent there.
//
// NOTE(review): the declarations of v0..v3 are not part of this listing.
274 static gmx_inline double gmx_simdcall
275 reduceIncr4ReturnSum(double * m,
281 float64x2_t t1, t2, t3, t4;
283 assert(std::size_t(m) % 8 == 0);
// Transpose pairs: t1 = { v0[0], v1[0] }, t2 = { v0[1], v1[1] },
//                  t3 = { v2[0], v3[0] }, t4 = { v2[1], v3[1] }.
285 t1 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_);
286 t2 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_);
287 t3 = vuzp1q_f64(v2.simdInternal_, v3.simdInternal_);
288 t4 = vuzp2q_f64(v2.simdInternal_, v3.simdInternal_);
// t1 = { sum(v0), sum(v1) }, t3 = { sum(v2), sum(v3) }.
290 t1 = vaddq_f64(t1, t2);
291 t3 = vaddq_f64(t3, t4);
// Incremented memory values: t2 for m[0..1], t4 for m[2..3].
293 t2 = vaddq_f64(t1, vld1q_f64(m));
294 t4 = vaddq_f64(t3, vld1q_f64(m + 2));
296 vst1q_f64(m + 2, t4);
// Total: add the two partial-sum vectors, then add the two lanes pairwise.
298 t1 = vaddq_f64(t1, t3);
299 t2 = vpaddq_f64(t1, t1);
301 return vgetq_lane_f64(t2, 0);
306 #endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H