2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
37 #define GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
41 #include "gromacs/utility/basedefinitions.h"
43 #include "impl_ibm_vsx_definitions.h"
44 #include "impl_ibm_vsx_simd_float.h"
// Gather four full-width rows and transpose them into four output vectors.
// Row i is read from base + align * offset[i]. After the two merge stages
// output k holds element k of every row, i.e. v0 = element 0 of rows 0..3,
// v1 = element 1 of rows 0..3, and so on.
// NOTE(review): `align` is not declared in the lines shown here; the call
// gatherLoadTranspose<align>(...) elsewhere in this file suggests it is a
// template parameter - confirm.
50 static inline void gmx_simdcall gatherLoadTranspose(const float* base,
51 const std::int32_t offset[],
57 __vector float l0, l1, l2, l3;
// One full-vector load per offset entry.
// NOTE(review): simdLoad() presumably requires a SIMD-aligned address, which
// would constrain `align` and `base` - confirm against its definition.
59 l0 = simdLoad(base + align * offset[0]).simdInternal_;
60 l1 = simdLoad(base + align * offset[1]).simdInternal_;
61 l2 = simdLoad(base + align * offset[2]).simdInternal_;
62 l3 = simdLoad(base + align * offset[3]).simdInternal_;
// Stage 1: interleave rows 0/2 and rows 1/3 (high halves, then low halves).
64 __vector float t0 = vec_mergeh(l0, l2);
65 __vector float t1 = vec_mergel(l0, l2);
66 __vector float t2 = vec_mergeh(l1, l3);
67 __vector float t3 = vec_mergel(l1, l3);
// Stage 2: interleave the intermediates to complete the 4x4 transpose.
68 v0->simdInternal_ = vec_mergeh(t0, t2);
69 v1->simdInternal_ = vec_mergel(t0, t2);
70 v2->simdInternal_ = vec_mergeh(t1, t3);
71 v3->simdInternal_ = vec_mergel(t1, t3);
// Gather four pairs of floats and transpose them into two output vectors.
// The pair for entry i starts at base + align * offset[i]. On return v0 holds
// the first float of each pair and v1 the second float of each pair, both in
// entry order 0..3.
75 static inline void gmx_simdcall
76 gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
78 __vector float t0, t1, t2, t3;
// Each pair is fetched as one 64-bit scalar (the address is reinterpreted as
// const double*) and replicated into both halves of a vector, so no
// full-vector load is issued here.
// NOTE(review): this reads float storage through a double lvalue; it relies on
// the compiler tolerating that type punning - confirm for new toolchains.
80 t0 = reinterpret_cast<__vector float>(
81 vec_splats(*reinterpret_cast<const double*>(base + align * offset[0])));
82 t1 = reinterpret_cast<__vector float>(
83 vec_splats(*reinterpret_cast<const double*>(base + align * offset[1])));
84 t2 = reinterpret_cast<__vector float>(
85 vec_splats(*reinterpret_cast<const double*>(base + align * offset[2])));
86 t3 = reinterpret_cast<__vector float>(
87 vec_splats(*reinterpret_cast<const double*>(base + align * offset[3])));
// Interleave entries 0/2 and 1/3; only the high halves are needed because
// both halves of every splatted vector are identical.
88 t0 = vec_mergeh(t0, t2);
89 t1 = vec_mergeh(t1, t3);
// Final interleave separates the first components from the second components.
90 v0->simdInternal_ = vec_mergeh(t0, t1);
91 v1->simdInternal_ = vec_mergel(t0, t1);
// NOTE(review): judging by the name, this is the preferred `align` value (in
// floats) for the pair gather/scatter routines of this implementation; no use
// of the constant is visible in this file, so confirm against the callers.
94 static const int c_simdBestPairAlignmentFloat = 2;
// Gather four triplets of floats and transpose them into three output
// vectors: v0 receives the first float of every triplet, v1 the second and v2
// the third, in entry order 0..3. The triplet for entry i starts at
// base + align * offset[i].
// Two implementations are present below: one forwards to the four-output
// gatherLoadTranspose() and discards the fourth result, the other uses only
// 64-bit and 32-bit scalar loads.
// NOTE(review): the condition that selects between the two paths is not among
// the lines shown here - confirm which values of `align` take which path.
97 static inline void gmx_simdcall gatherLoadUTranspose(const float* base,
98 const std::int32_t offset[],
// Full-width path: reuse the 4x4 transpose; the fourth output goes to a
// scratch variable.
// NOTE(review): the `t3` passed here must be a SimdFloat declared on a line
// not shown; it is distinct from the __vector float t3 declared below.
107 gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
111 __vector float t1, t2, t3, t4, t5, t6, t7, t8;
// vec_perm byte selectors: indices 0-15 address the first operand and 16-31
// the second. perm_lo2hi concatenates bytes 0-7 of both operands; perm_hi2lo
// concatenates bytes 8-15 of the second operand with bytes 8-15 of the first.
112 const __vector unsigned char perm_lo2hi = { 0, 1, 2, 3, 4, 5, 6, 7,
113 16, 17, 18, 19, 20, 21, 22, 23 };
114 const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
115 8, 9, 10, 11, 12, 13, 14, 15 };
// First two floats of each triplet: one 64-bit scalar load, splatted.
117 t1 = reinterpret_cast<__vector float>(
118 vec_splats(*reinterpret_cast<const double*>(base + align * offset[0])));
119 t2 = reinterpret_cast<__vector float>(
120 vec_splats(*reinterpret_cast<const double*>(base + align * offset[1])));
121 t3 = reinterpret_cast<__vector float>(
122 vec_splats(*reinterpret_cast<const double*>(base + align * offset[2])));
123 t4 = reinterpret_cast<__vector float>(
124 vec_splats(*reinterpret_cast<const double*>(base + align * offset[3])));
// Third float of each triplet: one 32-bit scalar load, splatted.
125 t5 = vec_splats(*(base + align * offset[0] + 2));
126 t6 = vec_splats(*(base + align * offset[1] + 2));
127 t7 = vec_splats(*(base + align * offset[2] + 2));
128 t8 = vec_splats(*(base + align * offset[3] + 2));
// After these merges t1 = {x0 x1 y0 y1} and t3 = {x2 x3 y2 y3}; the permutes
// then collect all first components into v0 and all second components into v1.
130 t1 = vec_mergeh(t1, t2);
131 t3 = vec_mergeh(t3, t4);
132 v0->simdInternal_ = vec_perm(t1, t3, perm_lo2hi);
133 v1->simdInternal_ = vec_perm(t3, t1, perm_hi2lo);
// Same scheme for the third components: t5 = {z0 z1 z0 z1}, t7 = {z2 z3 z2 z3}.
134 t5 = vec_mergeh(t5, t6);
135 t7 = vec_mergeh(t7, t8);
136 v2->simdInternal_ = vec_perm(t5, t7, perm_lo2hi);
141 // gcc-4.9 does not recognize that the argument to vec_extract() is used
// Transpose three vectors back into four triplets and store them.
// For entry i (0..3) the floats v0[i], v1[i], v2[i] are written to
// base[align * offset[i] + 0..2]. Only those three floats per entry are
// written. v2 carries gmx_unused to silence a spurious unused-argument
// diagnostic; it is in fact read by the vec_extract() calls below.
143 static inline void gmx_simdcall transposeScatterStoreU(float* base,
144 const std::int32_t offset[],
147 SimdFloat gmx_unused v2)
149 __vector float t1, t2;
// Interleave the first and second components:
// t1 = {v0[0] v1[0] v0[1] v1[1]}, t2 = {v0[2] v1[2] v0[3] v1[3]}.
151 t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
152 t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
// Per entry: write the interleaved pair with one 64-bit store (address
// reinterpreted as double*), then the third component with a scalar store.
153 *reinterpret_cast<double*>(base + align * offset[0]) =
154 vec_extract(reinterpret_cast<__vector double>(t1), 0);
155 base[align * offset[0] + 2] = vec_extract(v2.simdInternal_, 0);
156 *reinterpret_cast<double*>(base + align * offset[1]) =
157 vec_extract(reinterpret_cast<__vector double>(t1), 1);
158 base[align * offset[1] + 2] = vec_extract(v2.simdInternal_, 1);
159 *reinterpret_cast<double*>(base + align * offset[2]) =
160 vec_extract(reinterpret_cast<__vector double>(t2), 0);
161 base[align * offset[2] + 2] = vec_extract(v2.simdInternal_, 2);
162 *reinterpret_cast<double*>(base + align * offset[3]) =
163 vec_extract(reinterpret_cast<__vector double>(t2), 1);
164 base[align * offset[3] + 2] = vec_extract(v2.simdInternal_, 3);
// Transpose three vectors into four triplets and add them to memory.
// For entry i (0..3): base[align * offset[i] + 0..2] += {v0[i], v1[i], v2[i]}.
// Two implementations follow. The first touches exactly three floats per
// entry (64-bit load/add/store for the pair plus a scalar update of the third
// float). The second performs full four-float load/add/store per entry with a
// zero in the fourth lane.
// NOTE(review): the condition selecting between the two paths, the scoping
// braces around each `extracted`, and the declaration of `v3` are not among
// the lines shown here - confirm against the complete file.
168 static inline void gmx_simdcall
169 transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
// vec_perm selector that, applied as vec_perm(a, a, perm_hi2lo), moves bytes
// 8-15 of `a` into bytes 0-7 (bytes 8-15 are left in place).
173 const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
174 8, 9, 10, 11, 12, 13, 14, 15 };
175 __vector float t0, t1, t2, t3, t4, t5, t6, t7;
// Build one vector per entry whose first two lanes hold that entry's x/y pair.
177 t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
178 t1 = vec_perm(t0, t0, perm_hi2lo); // x1 y1
179 t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
180 t3 = vec_perm(t2, t2, perm_hi2lo); // x3 y3
// Entry 0: load the existing pair as one 64-bit value, add, and write the
// first 64 bits back; then update the third float with a scalar add.
182 t4 = reinterpret_cast<__vector float>(
183 vec_splats(*reinterpret_cast<double*>(base + align * offset[0])));
184 t4 = vec_add(t4, t0);
185 *reinterpret_cast<double*>(base + align * offset[0]) =
186 vec_extract(reinterpret_cast<__vector double>(t4), 0);
188 float extracted = vec_extract(v2.simdInternal_, 0);
189 base[align * offset[0] + 2] += extracted;
// Entry 1.
192 t5 = reinterpret_cast<__vector float>(
193 vec_splats(*reinterpret_cast<double*>(base + align * offset[1])));
194 t5 = vec_add(t5, t1);
195 *reinterpret_cast<double*>(base + align * offset[1]) =
196 vec_extract(reinterpret_cast<__vector double>(t5), 0);
198 float extracted = vec_extract(v2.simdInternal_, 1);
199 base[align * offset[1] + 2] += extracted;
// Entry 2.
202 t6 = reinterpret_cast<__vector float>(
203 vec_splats(*reinterpret_cast<double*>(base + align * offset[2])));
204 t6 = vec_add(t6, t2);
205 *reinterpret_cast<double*>(base + align * offset[2]) =
206 vec_extract(reinterpret_cast<__vector double>(t6), 0);
208 float extracted = vec_extract(v2.simdInternal_, 2);
209 base[align * offset[2] + 2] += extracted;
// Entry 3.
212 t7 = reinterpret_cast<__vector float>(
213 vec_splats(*reinterpret_cast<double*>(base + align * offset[3])));
214 t7 = vec_add(t7, t3);
215 *reinterpret_cast<double*>(base + align * offset[3]) =
216 vec_extract(reinterpret_cast<__vector double>(t7), 0);
218 float extracted = vec_extract(v2.simdInternal_, 3);
219 base[align * offset[3] + 2] += extracted;
224 // Extra elements means we can use full width-4 load/store operations
// 4x4 transpose of (v0, v1, v2, zero): afterwards v0..v3 hold
// {x_i, y_i, z_i, 0} for entries 0..3 respectively.
226 __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
227 __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
228 __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0F));
229 __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0F));
230 v0.simdInternal_ = vec_mergeh(t0, t2);
231 v1.simdInternal_ = vec_mergel(t0, t2);
232 v2.simdInternal_ = vec_mergeh(t1, t3);
233 v3.simdInternal_ = vec_mergel(t1, t3);
// Read-modify-write of four floats per entry; the fourth float has zero added
// and is written back.
235 store(base + align * offset[0], simdLoad(base + align * offset[0]) + v0);
236 store(base + align * offset[1], simdLoad(base + align * offset[1]) + v1);
237 store(base + align * offset[2], simdLoad(base + align * offset[2]) + v2);
238 store(base + align * offset[3], simdLoad(base + align * offset[3]) + v3);
// Transpose three vectors into four triplets and subtract them from memory.
// For entry i (0..3): base[align * offset[i] + 0..2] -= {v0[i], v1[i], v2[i]}.
// Two implementations follow. The first touches exactly three floats per
// entry (64-bit load/subtract/store for the pair plus a scalar update of the
// third float). The second performs full four-float load/subtract/store per
// entry with a zero in the fourth lane.
// NOTE(review): the condition selecting between the two paths, the scoping
// braces around each `extracted`, and the declaration of `v3` are not among
// the lines shown here - confirm against the complete file.
243 static inline void gmx_simdcall
244 transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
// vec_perm selector that, applied as vec_perm(a, a, perm_hi2lo), moves bytes
// 8-15 of `a` into bytes 0-7 (bytes 8-15 are left in place).
248 const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
249 8, 9, 10, 11, 12, 13, 14, 15 };
250 __vector float t0, t1, t2, t3, t4, t5, t6, t7;
// Build one vector per entry whose first two lanes hold that entry's x/y pair.
252 t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
253 t1 = vec_perm(t0, t0, perm_hi2lo); // x1 y1
254 t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
255 t3 = vec_perm(t2, t2, perm_hi2lo); // x3 y3
// Entry 0: load the existing pair as one 64-bit value, subtract, and write the
// first 64 bits back; then update the third float with a scalar subtract.
257 t4 = reinterpret_cast<__vector float>(
258 vec_splats(*reinterpret_cast<double*>(base + align * offset[0])));
259 t4 = vec_sub(t4, t0);
260 *reinterpret_cast<double*>(base + align * offset[0]) =
261 vec_extract(reinterpret_cast<__vector double>(t4), 0);
263 float extracted = vec_extract(v2.simdInternal_, 0);
264 base[align * offset[0] + 2] -= extracted;
// Entry 1.
267 t5 = reinterpret_cast<__vector float>(
268 vec_splats(*reinterpret_cast<double*>(base + align * offset[1])));
269 t5 = vec_sub(t5, t1);
270 *reinterpret_cast<double*>(base + align * offset[1]) =
271 vec_extract(reinterpret_cast<__vector double>(t5), 0);
273 float extracted = vec_extract(v2.simdInternal_, 1);
274 base[align * offset[1] + 2] -= extracted;
// Entry 2.
277 t6 = reinterpret_cast<__vector float>(
278 vec_splats(*reinterpret_cast<double*>(base + align * offset[2])));
279 t6 = vec_sub(t6, t2);
280 *reinterpret_cast<double*>(base + align * offset[2]) =
281 vec_extract(reinterpret_cast<__vector double>(t6), 0);
283 float extracted = vec_extract(v2.simdInternal_, 2);
284 base[align * offset[2] + 2] -= extracted;
// Entry 3.
287 t7 = reinterpret_cast<__vector float>(
288 vec_splats(*reinterpret_cast<double*>(base + align * offset[3])));
289 t7 = vec_sub(t7, t3);
290 *reinterpret_cast<double*>(base + align * offset[3]) =
291 vec_extract(reinterpret_cast<__vector double>(t7), 0);
293 float extracted = vec_extract(v2.simdInternal_, 3);
294 base[align * offset[3] + 2] -= extracted;
299 // Extra elements means we can use full width-4 load/store operations
// 4x4 transpose of (v0, v1, v2, zero): afterwards v0..v3 hold
// {x_i, y_i, z_i, 0} for entries 0..3 respectively.
301 __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
302 __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
303 __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0F));
304 __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0F));
305 v0.simdInternal_ = vec_mergeh(t0, t2);
306 v1.simdInternal_ = vec_mergel(t0, t2);
307 v2.simdInternal_ = vec_mergeh(t1, t3);
308 v3.simdInternal_ = vec_mergel(t1, t3);
// Read-modify-write of four floats per entry; the fourth float has zero
// subtracted and is written back.
310 store(base + align * offset[0], simdLoad(base + align * offset[0]) - v0);
311 store(base + align * offset[1], simdLoad(base + align * offset[1]) - v1);
312 store(base + align * offset[2], simdLoad(base + align * offset[2]) - v2);
313 store(base + align * offset[3], simdLoad(base + align * offset[3]) - v3);
// Expand the four lanes {s0 s1 s2 s3} of `scalar` so that every lane appears
// three times in a row across the three outputs:
//   *triplets0 = {s0 s0 s0 s1}
//   *triplets1 = {s1 s1 s2 s2}
//   *triplets2 = {s2 s3 s3 s3}
317 static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat scalar,
318 SimdFloat* triplets0,
319 SimdFloat* triplets1,
320 SimdFloat* triplets2)
322 // These permutes will be translated to immediate permutes (xxpermdi)
323 // since they operate on doublewords, which will be faster than loading
324 // the constants required for fully flexible permutes.
325 // (although the real reason was that the latter was buggy on xlc-13.1).
// Selector byte indices 0-15 address the first vec_perm operand, 16-31 the
// second; every selector below moves whole 8-byte halves.
326 __vector unsigned char perm0 = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
327 __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 };
328 __vector unsigned char perm2 = { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 };
329 __vector float t0, t1;
// Duplicate each lane: t0 = {s0 s0 s1 s1}, t1 = {s2 s2 s3 s3}.
331 t0 = vec_mergeh(scalar.simdInternal_, scalar.simdInternal_);
332 t1 = vec_mergel(scalar.simdInternal_, scalar.simdInternal_);
// Combine halves of the duplicated vectors with halves of the original.
333 triplets0->simdInternal_ = vec_perm(t0, scalar.simdInternal_, perm0);
334 triplets1->simdInternal_ = vec_perm(t0, t1, perm1);
335 triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, t1, perm2);
338 /* TODO In debug mode, xlc 13.1.5 seems to overwrite v0 on the stack,
339 leading to segfaults. Possibly the calling convention doesn't
340 implement __vector int correctly. Release mode is OK. gcc is OK. */
// Four-output gather/transpose taking its offsets in a SIMD integer register.
// The offsets are spilled to an aligned stack array and the work is forwarded
// to the array-offset overload gatherLoadTranspose<align>(), so the results in
// v0..v3 are whatever that overload produces for the same offsets.
342 static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
// Scratch array for the offsets, aligned for the SIMD store below.
349 alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
351 store(ioffset, offset);
352 gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
// Two-output gather/transpose taking its offsets in a SIMD integer register.
// The offsets are spilled to an aligned stack array and the work is forwarded
// to the array-offset overload gatherLoadTranspose<align>(base, ..., v0, v1).
356 static inline void gmx_simdcall
357 gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
// Scratch array for the offsets, aligned for the SIMD store below.
359 alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
361 store(ioffset, offset);
362 gatherLoadTranspose<align>(base, ioffset, v0, v1);
// Two-output gather/transpose with SIMD-register offsets ("U" variant).
// The body is identical to the two-output gatherLoadBySimdIntTranspose(): the
// offsets are spilled to an aligned stack array and the call is forwarded to
// gatherLoadTranspose<align>(base, ..., v0, v1). That overload fetches each
// pair with a 64-bit scalar load rather than a full-vector load.
// NOTE(review): presumably this is why no separate unaligned implementation is
// needed here - confirm.
366 static inline void gmx_simdcall
367 gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
// Scratch array for the offsets, aligned for the SIMD store below.
369 alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
371 store(ioffset, offset);
372 gatherLoadTranspose<align>(base, ioffset, v0, v1);
// Transposes the four input vectors in place and combines the result with the
// four floats read from m via simdLoad(m); returns a float.
// NOTE(review): the statements that sum the transposed vectors, store back to
// m and compute the return value are not among the lines shown here. Judging
// by the name, m[k] is incremented by the lane sum of vk and the total of all
// lanes is returned - confirm against the complete file.
375 static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
// 4x4 transpose: afterwards lane k of each of v0..v3 originates from input
// vector k, so element-wise adding v0..v3 gives the per-input lane sums.
377 __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
378 __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
379 __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
380 __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
381 v0.simdInternal_ = vec_mergeh(t0, t2);
382 v1.simdInternal_ = vec_mergel(t0, t2);
383 v2.simdInternal_ = vec_mergeh(t1, t3);
384 v3.simdInternal_ = vec_mergel(t1, t3);
// v2 is reused as a temporary: current contents of m plus v0.
// NOTE(review): simdLoad() presumably requires m to be SIMD-aligned - confirm.
389 v2 = v0 + simdLoad(m);
397 #endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H