2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2016, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPLEMENTATION_IBM_QPX_UTIL_DOUBLE_H
37 #define GMX_SIMD_IMPLEMENTATION_IBM_QPX_UTIL_DOUBLE_H
41 // Assert is buggy on xlc with high optimization, so we skip it for QPX
49 #include "gromacs/utility/basedefinitions.h"
51 #include "impl_ibm_qpx_simd_double.h"
57 static inline void gmx_simdcall
58 gatherLoadTranspose(const double * base,
59 const std::int32_t offset[],
65 v0->simdInternal_ = vec_ld(0, const_cast<double *>(base + align * offset[0]) );
66 v1->simdInternal_ = vec_ld(0, const_cast<double *>(base + align * offset[1]) );
67 v2->simdInternal_ = vec_ld(0, const_cast<double *>(base + align * offset[2]) );
68 v3->simdInternal_ = vec_ld(0, const_cast<double *>(base + align * offset[3]) );
70 vector4double t0 = vec_perm(v0->simdInternal_, v2->simdInternal_, vec_gpci(00415));
71 vector4double t1 = vec_perm(v0->simdInternal_, v2->simdInternal_, vec_gpci(02637));
72 vector4double t2 = vec_perm(v1->simdInternal_, v3->simdInternal_, vec_gpci(00415));
73 vector4double t3 = vec_perm(v1->simdInternal_, v3->simdInternal_, vec_gpci(02637));
74 v0->simdInternal_ = vec_perm(t0, t2, vec_gpci(00415));
75 v1->simdInternal_ = vec_perm(t0, t2, vec_gpci(02637));
76 v2->simdInternal_ = vec_perm(t1, t3, vec_gpci(00415));
77 v3->simdInternal_ = vec_perm(t1, t3, vec_gpci(02637));
81 static inline void gmx_simdcall
82 gatherLoadTranspose(const double * base,
83 const std::int32_t offset[],
87 vector4double t0, t1, t2, t3;
89 t0 = vec_ld2(0, const_cast<double *>(base + align * offset[0]) );
90 t1 = vec_ld2(0, const_cast<double *>(base + align * offset[1]) );
91 t2 = vec_ld2(0, const_cast<double *>(base + align * offset[2]) );
92 t3 = vec_ld2(0, const_cast<double *>(base + align * offset[3]) );
93 t0 = vec_perm(t0, t2, vec_gpci(00415));
94 t1 = vec_perm(t1, t3, vec_gpci(00415));
95 v0->simdInternal_ = vec_perm(t0, t1, vec_gpci(00415));
96 v1->simdInternal_ = vec_perm(t0, t1, vec_gpci(02637));
/*! \brief Best alignment to use for aligned pairs of double data.
 *
 * QPX has a native pair-load instruction (vec_ld2), so pair data is best
 * stored with alignment 2 rather than padded to the full vector width.
 */
static const int c_simdBestPairAlignmentDouble = 2;
102 static inline void gmx_simdcall
103 gatherLoadUTranspose(const double * base,
104 const std::int32_t offset[],
109 vector4double t1, t2, t3, t4, t5, t6, t7, t8;
114 gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
118 t1 = vec_perm(vec_splats(base[align * offset[0]]), vec_splats(base[align * offset[0] + 1]), vec_gpci(00415));
119 t2 = vec_perm(vec_splats(base[align * offset[1]]), vec_splats(base[align * offset[1] + 1]), vec_gpci(00415));
120 t3 = vec_perm(vec_splats(base[align * offset[2]]), vec_splats(base[align * offset[2] + 1]), vec_gpci(00415));
121 t4 = vec_perm(vec_splats(base[align * offset[3]]), vec_splats(base[align * offset[3] + 1]), vec_gpci(00415));
123 t5 = vec_splats( *(base + align * offset[0] + 2) );
124 t6 = vec_splats( *(base + align * offset[1] + 2) );
125 t7 = vec_splats( *(base + align * offset[2] + 2) );
126 t8 = vec_splats( *(base + align * offset[3] + 2) );
128 t1 = vec_perm(t1, t2, vec_gpci(00415));
129 t3 = vec_perm(t3, t4, vec_gpci(00415));
130 v0->simdInternal_ = vec_perm(t1, t3, vec_gpci(00145));
131 v1->simdInternal_ = vec_perm(t3, t1, vec_gpci(06723));
132 t5 = vec_perm(t5, t6, vec_gpci(00415));
133 t7 = vec_perm(t7, t8, vec_gpci(00415));
134 v2->simdInternal_ = vec_perm(t5, t7, vec_gpci(00145));
139 static inline void gmx_simdcall
140 transposeScatterStoreU(double * base,
141 const std::int32_t offset[],
146 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m0[GMX_SIMD_DOUBLE_WIDTH];
147 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m1[GMX_SIMD_DOUBLE_WIDTH];
148 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m2[GMX_SIMD_DOUBLE_WIDTH];
154 base[align * offset[0] ] = m0[0];
155 base[align * offset[0] + 1] = m1[0];
156 base[align * offset[0] + 2] = m2[0];
157 base[align * offset[1] ] = m0[1];
158 base[align * offset[1] + 1] = m1[1];
159 base[align * offset[1] + 2] = m2[1];
160 base[align * offset[2] ] = m0[2];
161 base[align * offset[2] + 1] = m1[2];
162 base[align * offset[2] + 2] = m2[2];
163 base[align * offset[3] ] = m0[3];
164 base[align * offset[3] + 1] = m1[3];
165 base[align * offset[3] + 2] = m2[3];
169 static inline void gmx_simdcall
170 transposeScatterIncrU(double * base,
171 const std::int32_t offset[],
180 vector4double t0 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(00415));
181 vector4double t1 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(02637));
182 vector4double t2 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(00415));
183 vector4double t3 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(02637));
184 v0.simdInternal_ = vec_perm(t0, t2, vec_gpci(00415));
185 v1.simdInternal_ = vec_perm(t0, t2, vec_gpci(02637));
186 v2.simdInternal_ = vec_perm(t1, t3, vec_gpci(00415));
187 v3.simdInternal_ = vec_perm(t1, t3, vec_gpci(02637));
189 store(base + align*offset[0], simdLoad(base + align*offset[0]) + v0);
190 store(base + align*offset[1], simdLoad(base + align*offset[1]) + v1);
191 store(base + align*offset[2], simdLoad(base + align*offset[2]) + v2);
192 store(base + align*offset[3], simdLoad(base + align*offset[3]) + v3);
196 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m0[GMX_SIMD_DOUBLE_WIDTH];
197 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m1[GMX_SIMD_DOUBLE_WIDTH];
198 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m2[GMX_SIMD_DOUBLE_WIDTH];
204 base[align * offset[0] ] += m0[0];
205 base[align * offset[0] + 1] += m1[0];
206 base[align * offset[0] + 2] += m2[0];
207 base[align * offset[1] ] += m0[1];
208 base[align * offset[1] + 1] += m1[1];
209 base[align * offset[1] + 2] += m2[1];
210 base[align * offset[2] ] += m0[2];
211 base[align * offset[2] + 1] += m1[2];
212 base[align * offset[2] + 2] += m2[2];
213 base[align * offset[3] ] += m0[3];
214 base[align * offset[3] + 1] += m1[3];
215 base[align * offset[3] + 2] += m2[3];
220 static inline void gmx_simdcall
221 transposeScatterDecrU(double * base,
222 const std::int32_t offset[],
231 vector4double t0 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(00415));
232 vector4double t1 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(02637));
233 vector4double t2 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(00415));
234 vector4double t3 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(02637));
235 v0.simdInternal_ = vec_perm(t0, t2, vec_gpci(00415));
236 v1.simdInternal_ = vec_perm(t0, t2, vec_gpci(02637));
237 v2.simdInternal_ = vec_perm(t1, t3, vec_gpci(00415));
238 v3.simdInternal_ = vec_perm(t1, t3, vec_gpci(02637));
240 store(base + align*offset[0], simdLoad(base + align*offset[0]) - v0);
241 store(base + align*offset[1], simdLoad(base + align*offset[1]) - v1);
242 store(base + align*offset[2], simdLoad(base + align*offset[2]) - v2);
243 store(base + align*offset[3], simdLoad(base + align*offset[3]) - v3);
247 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m0[GMX_SIMD_DOUBLE_WIDTH];
248 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m1[GMX_SIMD_DOUBLE_WIDTH];
249 GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) m2[GMX_SIMD_DOUBLE_WIDTH];
255 base[align * offset[0] ] -= m0[0];
256 base[align * offset[0] + 1] -= m1[0];
257 base[align * offset[0] + 2] -= m2[0];
258 base[align * offset[1] ] -= m0[1];
259 base[align * offset[1] + 1] -= m1[1];
260 base[align * offset[1] + 2] -= m2[1];
261 base[align * offset[2] ] -= m0[2];
262 base[align * offset[2] + 1] -= m1[2];
263 base[align * offset[2] + 2] -= m2[2];
264 base[align * offset[3] ] -= m0[3];
265 base[align * offset[3] + 1] -= m1[3];
266 base[align * offset[3] + 2] -= m2[3];
270 static inline void gmx_simdcall
271 expandScalarsToTriplets(SimdDouble scalar,
272 SimdDouble * triplets0,
273 SimdDouble * triplets1,
274 SimdDouble * triplets2)
276 triplets0->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, vec_gpci(00001));
277 triplets1->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, vec_gpci(01122));
278 triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, vec_gpci(02333));
282 static inline void gmx_simdcall
283 gatherLoadBySimdIntTranspose(const double * base,
284 SimdDInt32 simdoffset,
290 GMX_ALIGNED(int, GMX_SIMD_DOUBLE_WIDTH) ioffset[GMX_SIMD_DOUBLE_WIDTH];
292 store(ioffset, simdoffset);
293 gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
297 static inline void gmx_simdcall
298 gatherLoadBySimdIntTranspose(const double * base,
299 SimdDInt32 simdoffset,
303 GMX_ALIGNED(int, GMX_SIMD_DOUBLE_WIDTH) ioffset[GMX_SIMD_DOUBLE_WIDTH];
305 store(ioffset, simdoffset);
306 gatherLoadTranspose<align>(base, ioffset, v0, v1);
309 static inline double gmx_simdcall
310 reduceIncr4ReturnSum(double * m,
316 vector4double t0 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(00415));
317 vector4double t1 = vec_perm(v0.simdInternal_, v2.simdInternal_, vec_gpci(02637));
318 vector4double t2 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(00415));
319 vector4double t3 = vec_perm(v1.simdInternal_, v3.simdInternal_, vec_gpci(02637));
320 v0.simdInternal_ = vec_perm(t0, t2, vec_gpci(00415));
321 v1.simdInternal_ = vec_perm(t0, t2, vec_gpci(02637));
322 v2.simdInternal_ = vec_perm(t1, t3, vec_gpci(00415));
323 v3.simdInternal_ = vec_perm(t1, t3, vec_gpci(02637));
328 v2 = v0 + simdLoad(m);
336 #endif // GMX_SIMD_IMPLEMENTATION_IBM_QPX_UTIL_DOUBLE_H