/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H

#include "config.h"

#include <cstddef>
#include <cstdint>

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vmx_definitions.h"
#include "impl_ibm_vmx_simd_float.h"

namespace gmx
{

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *        base,
                    const std::int32_t   offset[],
                    SimdFloat *          v0,
                    SimdFloat *          v1,
                    SimdFloat *          v2,
                    SimdFloat *          v3)
{
    *v0 = simdLoad( base + align * offset[0] );
    *v1 = simdLoad( base + align * offset[1] );
    *v2 = simdLoad( base + align * offset[2] );
    *v3 = simdLoad( base + align * offset[3] );
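
    // 4x4 in-register transpose: vec_mergeh/vec_mergel interleave the high and
    // low halves of two rows, so two merge passes turn the four gathered rows
    // into per-component vectors.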
    __vector float t0 = vec_mergeh(v0->simdInternal_, v2->simdInternal_);
    __vector float t1 = vec_mergel(v0->simdInternal_, v2->simdInternal_);
    __vector float t2 = vec_mergeh(v1->simdInternal_, v3->simdInternal_);
    __vector float t3 = vec_mergel(v1->simdInternal_, v3->simdInternal_);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}
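
// Usage sketch (hypothetical data layout, not part of this file): gather four
// aligned xyzw quadruplets, one per index, into per-component vectors:
//
//     alignas(GMX_SIMD_ALIGNMENT) std::int32_t idx[4] = { 0, 1, 2, 3 };
//     SimdFloat x, y, z, w;
//     gatherLoadTranspose<4>(xyzwData, idx, &x, &y, &z, &w);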

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *        base,
                    const std::int32_t   offset[],
                    SimdFloat *          v0,
                    SimdFloat *          v1)
{
    if (align % 4 == 0)
    {
        SimdFloat t2, t3;

        gatherLoadTranspose<align>(base, offset, v0, v1, &t2, &t3);
    }
    else
    {
        __vector float         t0, t1, t2, t3, t4, t5, t6, t7;
        __vector unsigned char p0, p1, p2, p3;

        // This is REALLY slow, since we have no choice but to load individual
        // elements when we cannot guarantee that we can access beyond the end of
        // the memory. Fortunately, 99% of the usage should be the aligned-to-4
        // case above instead.
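        // The idiom: vec_lde loads one float into the lane implied by its
        // address, vec_perm with the vec_lvsl control rotates that float down
        // to lane 0, and two vec_mergeh passes then pack four lane-0 values
        // into a single vector.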
        t0 = vec_lde(0, base + align * offset[0]);
        t1 = vec_lde(0, base + align * offset[1]);
        t2 = vec_lde(0, base + align * offset[2]);
        t3 = vec_lde(0, base + align * offset[3]);
        p0 = vec_lvsl(0, base + align * offset[0]);
        p1 = vec_lvsl(0, base + align * offset[1]);
        p2 = vec_lvsl(0, base + align * offset[2]);
        p3 = vec_lvsl(0, base + align * offset[3]);
        t0 = vec_perm(t0, t0, p0);
        t1 = vec_perm(t1, t1, p1);
        t2 = vec_perm(t2, t2, p2);
        t3 = vec_perm(t3, t3, p3);
        t0 = vec_mergeh(t0, t2);
        t1 = vec_mergeh(t1, t3);
        v0->simdInternal_ = vec_mergeh(t0, t1);

        t4 = vec_lde(0, base + align * offset[0] + 1);
        t5 = vec_lde(0, base + align * offset[1] + 1);
        t6 = vec_lde(0, base + align * offset[2] + 1);
        t7 = vec_lde(0, base + align * offset[3] + 1);
        p0 = vec_lvsl(0, base + align * offset[0] + 1);
        p1 = vec_lvsl(0, base + align * offset[1] + 1);
        p2 = vec_lvsl(0, base + align * offset[2] + 1);
        p3 = vec_lvsl(0, base + align * offset[3] + 1);
        t4 = vec_perm(t4, t4, p0);
        t5 = vec_perm(t5, t5, p1);
        t6 = vec_perm(t6, t6, p2);
        t7 = vec_perm(t7, t7, p3);
        t4 = vec_mergeh(t4, t6);
        t5 = vec_mergeh(t5, t7);
        v1->simdInternal_ = vec_mergeh(t4, t5);
    }
}
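
// Best alignment to use for aligned pairs of float data.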
static const int c_simdBestPairAlignmentFloat = 2;

template <int align>
static inline void gmx_simdcall
gatherLoadUTranspose(const float *        base,
                     const std::int32_t   offset[],
                     SimdFloat *          v0,
                     SimdFloat *          v1,
                     SimdFloat *          v2)
{
    if (align % 4 == 0)
    {
        SimdFloat t3;
        gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
    }
    else
    {
        __vector float         t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
        __vector unsigned char p0, p1, p2, p3;

        // This is REALLY slow, since we have no choice but to load individual
        // elements when we cannot guarantee that we can access beyond the end of
        // the memory. Unfortunately this is likely the most common case.
        t0 = vec_lde(0, base + align * offset[0]);
        t1 = vec_lde(0, base + align * offset[1]);
        t2 = vec_lde(0, base + align * offset[2]);
        t3 = vec_lde(0, base + align * offset[3]);
        p0 = vec_lvsl(0, base + align * offset[0]);
        p1 = vec_lvsl(0, base + align * offset[1]);
        p2 = vec_lvsl(0, base + align * offset[2]);
        p3 = vec_lvsl(0, base + align * offset[3]);
        t0 = vec_perm(t0, t0, p0);
        t1 = vec_perm(t1, t1, p1);
        t2 = vec_perm(t2, t2, p2);
        t3 = vec_perm(t3, t3, p3);
        t0 = vec_mergeh(t0, t2);
        t1 = vec_mergeh(t1, t3);
        v0->simdInternal_ = vec_mergeh(t0, t1);
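
        // Second (y) element of each triplet, same idiom as above.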
        t4 = vec_lde(0, base + align * offset[0] + 1);
        t5 = vec_lde(0, base + align * offset[1] + 1);
        t6 = vec_lde(0, base + align * offset[2] + 1);
        t7 = vec_lde(0, base + align * offset[3] + 1);
        p0 = vec_lvsl(0, base + align * offset[0] + 1);
        p1 = vec_lvsl(0, base + align * offset[1] + 1);
        p2 = vec_lvsl(0, base + align * offset[2] + 1);
        p3 = vec_lvsl(0, base + align * offset[3] + 1);
        t4 = vec_perm(t4, t4, p0);
        t5 = vec_perm(t5, t5, p1);
        t6 = vec_perm(t6, t6, p2);
        t7 = vec_perm(t7, t7, p3);
        t4 = vec_mergeh(t4, t6);
        t5 = vec_mergeh(t5, t7);
        v1->simdInternal_ = vec_mergeh(t4, t5);
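
        // Third (z) element of each triplet.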
        t8  = vec_lde(0, base + align * offset[0] + 2);
        t9  = vec_lde(0, base + align * offset[1] + 2);
        t10 = vec_lde(0, base + align * offset[2] + 2);
        t11 = vec_lde(0, base + align * offset[3] + 2);
        p0  = vec_lvsl(0, base + align * offset[0] + 2);
        p1  = vec_lvsl(0, base + align * offset[1] + 2);
        p2  = vec_lvsl(0, base + align * offset[2] + 2);
        p3  = vec_lvsl(0, base + align * offset[3] + 2);
        t8  = vec_perm(t8, t8, p0);
        t9  = vec_perm(t9, t9, p1);
        t10 = vec_perm(t10, t10, p2);
        t11 = vec_perm(t11, t11, p3);
        t8  = vec_mergeh(t8, t10);
        t9  = vec_mergeh(t9, t11);
        v2->simdInternal_ = vec_mergeh(t8, t9);
    }
}

template <int align>
static inline void gmx_simdcall
transposeScatterStoreU(float *              base,
                       const std::int32_t   offset[],
                       SimdFloat            v0,
                       SimdFloat            v1,
                       SimdFloat            v2)
{
    __vector unsigned char p0, p1, p2, p3;

    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v2.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v2.simdInternal_);
    __vector float t4 = vec_mergeh(t0, t2);
    __vector float t5 = vec_mergel(t0, t2);
    __vector float t6 = vec_mergeh(t1, t3);
    __vector float t7 = vec_mergel(t1, t3);
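
    // After the transpose, row i of t4..t7 holds {v0[i], v1[i], v2[i], pad};
    // v2 merely fills the unused fourth lane. vec_lvsr builds a permute
    // control that rotates each row to match its (possibly unaligned) target
    // address, and vec_ste then stores the three useful elements one by one.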
    p0 = vec_lvsr(0, base + align * offset[0]);
    p1 = vec_lvsr(0, base + align * offset[1]);
    p2 = vec_lvsr(0, base + align * offset[2]);
    p3 = vec_lvsr(0, base + align * offset[3]);

    t4 = vec_perm(t4, t4, p0);
    t5 = vec_perm(t5, t5, p1);
    t6 = vec_perm(t6, t6, p2);
    t7 = vec_perm(t7, t7, p3);

    vec_ste(t4, 0, base + align * offset[0]);
    vec_ste(t4, 4, base + align * offset[0]);
    vec_ste(t4, 8, base + align * offset[0]);
    vec_ste(t5, 0, base + align * offset[1]);
    vec_ste(t5, 4, base + align * offset[1]);
    vec_ste(t5, 8, base + align * offset[1]);
    vec_ste(t6, 0, base + align * offset[2]);
    vec_ste(t6, 4, base + align * offset[2]);
    vec_ste(t6, 8, base + align * offset[2]);
    vec_ste(t7, 0, base + align * offset[3]);
    vec_ste(t7, 4, base + align * offset[3]);
    vec_ste(t7, 8, base + align * offset[3]);
}

template <int align>
static inline void gmx_simdcall
transposeScatterIncrU(float *              base,
                      const std::int32_t   offset[],
                      SimdFloat            v0,
                      SimdFloat            v1,
                      SimdFloat            v2)
{
    if (align % 4 == 0)
    {
        __vector float zero = reinterpret_cast<__vector float>(vec_splat_u32(0));
        __vector float t0   = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1   = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2   = vec_mergeh(v1.simdInternal_, zero);
        __vector float t3   = vec_mergel(v1.simdInternal_, zero);
        __vector float t4   = vec_mergeh(t0, t2);
        __vector float t5   = vec_mergel(t0, t2);
        __vector float t6   = vec_mergeh(t1, t3);
        __vector float t7   = vec_mergel(t1, t3);
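
        // Lane 3 of each transposed row is zero, so the full-vector
        // read-modify-write below leaves the float after each triplet intact.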
        vec_st( vec_add( vec_ld(0, base + align * offset[0]), t4), 0, base + align * offset[0]);
        vec_st( vec_add( vec_ld(0, base + align * offset[1]), t5), 0, base + align * offset[1]);
        vec_st( vec_add( vec_ld(0, base + align * offset[2]), t6), 0, base + align * offset[2]);
        vec_st( vec_add( vec_ld(0, base + align * offset[3]), t7), 0, base + align * offset[3]);
    }
    else
    {
        alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];

        vec_st(v0.simdInternal_, 0, rdata0);
        vec_st(v1.simdInternal_, 0, rdata1);
        vec_st(v2.simdInternal_, 0, rdata2);
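
        // With unknown alignment we must not read or write whole vectors at
        // the target addresses, so update the three floats per slot in scalar
        // code from the spilled data.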
        base[align*offset[0] + 0] += rdata0[0];
        base[align*offset[0] + 1] += rdata1[0];
        base[align*offset[0] + 2] += rdata2[0];
        base[align*offset[1] + 0] += rdata0[1];
        base[align*offset[1] + 1] += rdata1[1];
        base[align*offset[1] + 2] += rdata2[1];
        base[align*offset[2] + 0] += rdata0[2];
        base[align*offset[2] + 1] += rdata1[2];
        base[align*offset[2] + 2] += rdata2[2];
        base[align*offset[3] + 0] += rdata0[3];
        base[align*offset[3] + 1] += rdata1[3];
        base[align*offset[3] + 2] += rdata2[3];
    }
}

template <int align>
static inline void gmx_simdcall
transposeScatterDecrU(float *              base,
                      const std::int32_t   offset[],
                      SimdFloat            v0,
                      SimdFloat            v1,
                      SimdFloat            v2)
{
    if (align % 4 == 0)
    {
        __vector float zero = reinterpret_cast<__vector float>(vec_splat_u32(0));
        __vector float t0   = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1   = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2   = vec_mergeh(v1.simdInternal_, zero);
        __vector float t3   = vec_mergel(v1.simdInternal_, zero);
        __vector float t4   = vec_mergeh(t0, t2);
        __vector float t5   = vec_mergel(t0, t2);
        __vector float t6   = vec_mergeh(t1, t3);
        __vector float t7   = vec_mergel(t1, t3);
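
        // Same zero-padding trick as in transposeScatterIncrU(); the subtract
        // cannot disturb the fourth float in each target slot.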
        vec_st( vec_sub( vec_ld(0, base + align * offset[0]), t4), 0, base + align * offset[0]);
        vec_st( vec_sub( vec_ld(0, base + align * offset[1]), t5), 0, base + align * offset[1]);
        vec_st( vec_sub( vec_ld(0, base + align * offset[2]), t6), 0, base + align * offset[2]);
        vec_st( vec_sub( vec_ld(0, base + align * offset[3]), t7), 0, base + align * offset[3]);
    }
    else
    {
        alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];

        vec_st(v0.simdInternal_, 0, rdata0);
        vec_st(v1.simdInternal_, 0, rdata1);
        vec_st(v2.simdInternal_, 0, rdata2);
        base[align*offset[0] + 0] -= rdata0[0];
        base[align*offset[0] + 1] -= rdata1[0];
        base[align*offset[0] + 2] -= rdata2[0];
        base[align*offset[1] + 0] -= rdata0[1];
        base[align*offset[1] + 1] -= rdata1[1];
        base[align*offset[1] + 2] -= rdata2[1];
        base[align*offset[2] + 0] -= rdata0[2];
        base[align*offset[2] + 1] -= rdata1[2];
        base[align*offset[2] + 2] -= rdata2[2];
        base[align*offset[3] + 0] -= rdata0[3];
        base[align*offset[3] + 1] -= rdata1[3];
        base[align*offset[3] + 2] -= rdata2[3];
    }
}

static inline void gmx_simdcall
expandScalarsToTriplets(SimdFloat    scalar,
                        SimdFloat *  triplets0,
                        SimdFloat *  triplets1,
                        SimdFloat *  triplets2)
{
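    // Byte permutes that expand {s0, s1, s2, s3} into the triplet streams
    // {s0, s0, s0, s1}, {s1, s1, s2, s2} and {s2, s3, s3, s3}.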
    const __vector unsigned char perm0 = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm1 = { 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11 };
    const __vector unsigned char perm2 = { 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 };

    triplets0->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm0);
    triplets1->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm1);
    triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm2);
}

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float *  base,
                             SimdFInt32     offset,
                             SimdFloat *    v0,
                             SimdFloat *    v1,
                             SimdFloat *    v2,
                             SimdFloat *    v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];
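
    // VMX has no true gather; store the SIMD offsets to a scratch array and
    // reuse the array-offset gather/transpose above.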
    vec_st( offset.simdInternal_, 0, ioffset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float *  base,
                             SimdFInt32     offset,
                             SimdFloat *    v0,
                             SimdFloat *    v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    vec_st( offset.simdInternal_, 0, ioffset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

static inline float gmx_simdcall
reduceIncr4ReturnSum(float *    m,
                     SimdFloat  v0,
                     SimdFloat  v1,
                     SimdFloat  v2,
                     SimdFloat  v3)
{
    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
    v0.simdInternal_ = vec_mergeh(t0, t2);
    v1.simdInternal_ = vec_mergel(t0, t2);
    v2.simdInternal_ = vec_mergeh(t1, t3);
    v3.simdInternal_ = vec_mergel(t1, t3);
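
    // After the transpose, summing the four vectors lane-wise leaves the
    // horizontal sum of each original input in the corresponding lane of v0.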
    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}

}      // namespace gmx

#endif // GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H