/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_DOUBLE_H
#define GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_DOUBLE_H
#include "config.h"

#include <cstdint> // std::int32_t

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vsx_definitions.h"
#include "impl_ibm_vsx_simd_double.h"

namespace gmx
{
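
// Gather two rows of four doubles each from base + align * offset[i] and
// return them transposed: v0/v1 receive the first/second element of each row,
// v2/v3 the third/fourth. With a SIMD width of two doubles, a 2x2 transpose
// is just one merge-high/merge-low pair per two input vectors.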
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
                                                    const std::int32_t offset[],
                                                    SimdDouble*        v0,
                                                    SimdDouble*        v1,
                                                    SimdDouble*        v2,
                                                    SimdDouble*        v3)
{
    __vector double t1, t2, t3, t4;

    t1 = *reinterpret_cast<const __vector double*>(base + align * offset[0]);
    t2 = *reinterpret_cast<const __vector double*>(base + align * offset[1]);
    t3 = *reinterpret_cast<const __vector double*>(base + align * offset[0] + 2);
    t4 = *reinterpret_cast<const __vector double*>(base + align * offset[1] + 2);

    v0->simdInternal_ = vec_mergeh(t1, t2);
    v1->simdInternal_ = vec_mergel(t1, t2);
    v2->simdInternal_ = vec_mergeh(t3, t4);
    v3->simdInternal_ = vec_mergel(t3, t4);
}
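
// Two-output variant: gather and transpose only the first two doubles of each row.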
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
{
    __vector double t1, t2;

    t1 = *reinterpret_cast<const __vector double*>(base + align * offset[0]);
    t2 = *reinterpret_cast<const __vector double*>(base + align * offset[1]);

    v0->simdInternal_ = vec_mergeh(t1, t2);
    v1->simdInternal_ = vec_mergel(t1, t2);
}
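
// The preferred alignment for pair loads equals the SIMD width (two doubles),
// so an aligned pair can be fetched as a single vector load.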
static const int c_simdBestPairAlignmentDouble = 2;
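
// Gather three doubles per row from possibly unaligned memory and transpose.
// The first two elements of each row are loaded as one vector; the third is
// splatted and merged, so no memory beyond the triplet is read.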
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
                                                     const std::int32_t offset[],
                                                     SimdDouble*        v0,
                                                     SimdDouble*        v1,
                                                     SimdDouble*        v2)
{
    SimdDouble t1, t2;

    t1 = simdLoadU(base + align * offset[0]);
    t2 = simdLoadU(base + align * offset[1]);

    v0->simdInternal_ = vec_mergeh(t1.simdInternal_, t2.simdInternal_);
    v1->simdInternal_ = vec_mergel(t1.simdInternal_, t2.simdInternal_);
    v2->simdInternal_ = vec_mergeh(vec_splats(*(base + align * offset[0] + 2)),
                                   vec_splats(*(base + align * offset[1] + 2)));
}
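
// Transpose v0..v2 into triplets and scatter-store them to possibly unaligned
// rows at base + align * offset[i]; the third element of each triplet is
// written as a scalar so nothing beyond the triplet is overwritten.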
// gcc-4.9 fails to recognize that the argument to vec_extract() is used
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(const double gmx_unused* dummy,
                                                       double*               base,
                                                       const std::int32_t    offset[],
                                                       SimdDouble            v0,
                                                       SimdDouble            v1,
                                                       SimdDouble gmx_unused v2)
{
    SimdDouble t1, t2;

    t1.simdInternal_ = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
    t2.simdInternal_ = vec_mergel(v0.simdInternal_, v1.simdInternal_);

    storeU(base + align * offset[0], t1);
    base[align * offset[0] + 2] = vec_extract(v2.simdInternal_, 0);
    storeU(base + align * offset[1], t2);
    base[align * offset[1] + 2] = vec_extract(v2.simdInternal_, 1);
}
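
// Transpose v0..v2 into triplets and add them to the rows at
// base + align * offset[i]. When align is a multiple of 4, each triplet is
// followed by at least one padding element within its own row, so the triplet
// can be zero-padded and processed with two full-vector load/add/store
// sequences without touching neighbouring data. Otherwise the third element
// is incremented as a scalar.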
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    if (align % 4 == 0)
    {
        __vector double t1, t2, t3, t4;
        SimdDouble      t5, t6;

        t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
        t3 = vec_mergeh(v2.simdInternal_, vec_splats(0.0));
        t4 = vec_mergel(v2.simdInternal_, vec_splats(0.0));

        t5               = simdLoad(base + align * offset[0]);
        t6               = simdLoad(base + align * offset[0] + 2);
        t5.simdInternal_ = vec_add(t5.simdInternal_, t1);
        t6.simdInternal_ = vec_add(t6.simdInternal_, t3);
        store(base + align * offset[0], t5);
        store(base + align * offset[0] + 2, t6);

        t5               = simdLoad(base + align * offset[1]);
        t6               = simdLoad(base + align * offset[1] + 2);
        t5.simdInternal_ = vec_add(t5.simdInternal_, t2);
        t6.simdInternal_ = vec_add(t6.simdInternal_, t4);
        store(base + align * offset[1], t5);
        store(base + align * offset[1] + 2, t6);
    }
    else
    {
        __vector double t1, t2;
        SimdDouble      t3, t4;

        t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);

        t3               = simdLoadU(base + align * offset[0]);
        t3.simdInternal_ = vec_add(t3.simdInternal_, t1);
        storeU(base + align * offset[0], t3);
        base[align * offset[0] + 2] += vec_extract(v2.simdInternal_, 0);

        t4               = simdLoadU(base + align * offset[1]);
        t4.simdInternal_ = vec_add(t4.simdInternal_, t2);
        storeU(base + align * offset[1], t4);
        base[align * offset[1] + 2] += vec_extract(v2.simdInternal_, 1);
    }
}
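
// Mirror image of transposeScatterIncrU: subtract the transposed triplets
// from memory instead of adding them.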
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    if (align % 4 == 0)
    {
        __vector double t1, t2, t3, t4;
        SimdDouble      t5, t6;

        t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
        t3 = vec_mergeh(v2.simdInternal_, vec_splats(0.0));
        t4 = vec_mergel(v2.simdInternal_, vec_splats(0.0));

        t5               = simdLoad(base + align * offset[0]);
        t6               = simdLoad(base + align * offset[0] + 2);
        t5.simdInternal_ = vec_sub(t5.simdInternal_, t1);
        t6.simdInternal_ = vec_sub(t6.simdInternal_, t3);
        store(base + align * offset[0], t5);
        store(base + align * offset[0] + 2, t6);

        t5               = simdLoad(base + align * offset[1]);
        t6               = simdLoad(base + align * offset[1] + 2);
        t5.simdInternal_ = vec_sub(t5.simdInternal_, t2);
        t6.simdInternal_ = vec_sub(t6.simdInternal_, t4);
        store(base + align * offset[1], t5);
        store(base + align * offset[1] + 2, t6);
    }
    else
    {
        __vector double t1, t2;
        SimdDouble      t3, t4;

        t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);

        t3               = simdLoadU(base + align * offset[0]);
        t3.simdInternal_ = vec_sub(t3.simdInternal_, t1);
        storeU(base + align * offset[0], t3);
        base[align * offset[0] + 2] -= vec_extract(v2.simdInternal_, 0);

        t4               = simdLoadU(base + align * offset[1]);
        t4.simdInternal_ = vec_sub(t4.simdInternal_, t2);
        storeU(base + align * offset[1], t4);
        base[align * offset[1] + 2] -= vec_extract(v2.simdInternal_, 1);
    }
}
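
// Expand the two scalars {s0, s1} into the triplet sequence {s0, s0},
// {s0, s1}, {s1, s1}, i.e. each scalar repeated three times across the
// three output variables.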
static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
                                                        SimdDouble* triplets0,
                                                        SimdDouble* triplets1,
                                                        SimdDouble* triplets2)
{
    triplets0->simdInternal_ = vec_mergeh(scalar.simdInternal_, scalar.simdInternal_);
    triplets1->simdInternal_ = scalar.simdInternal_;
    triplets2->simdInternal_ = vec_mergel(scalar.simdInternal_, scalar.simdInternal_);
}
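
// Same as gatherLoadTranspose, but with the offsets supplied in a SIMD
// integer variable; the offsets are first spilled to an aligned scratch array.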
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
                                                             SimdDInt32    offset,
                                                             SimdDouble*   v0,
                                                             SimdDouble*   v1,
                                                             SimdDouble*   v2,
                                                             SimdDouble*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}
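
// Two-output variant of the SIMD-integer-offset gather.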
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
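
// Unaligned version of the two-output gather: the rows are fetched with
// simdLoadU before transposing.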
template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];

    store(ioffset, offset);

    SimdDouble t1 = simdLoadU(base + align * ioffset[0]);
    SimdDouble t2 = simdLoadU(base + align * ioffset[1]);

    v0->simdInternal_ = vec_mergeh(t1.simdInternal_, t2.simdInternal_);
    v1->simdInternal_ = vec_mergel(t1.simdInternal_, t2.simdInternal_);
}
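
// Reduce each of v0..v3 to a scalar sum, add the four sums to m[0..3], and
// return their total. The merge/add pairs compute all four horizontal sums
// with vector operations; m must be aligned to the SIMD width.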
static inline double gmx_simdcall
reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
{
    __vector double t1, t2, t3, t4;

    t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
    t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
    t3 = vec_mergeh(v2.simdInternal_, v3.simdInternal_);
    t4 = vec_mergel(v2.simdInternal_, v3.simdInternal_);

    t1 = vec_add(t1, t2);
    t3 = vec_add(t3, t4);

    *reinterpret_cast<__vector double*>(m) += t1;
    *reinterpret_cast<__vector double*>(m + 2) += t3;

    t1 = vec_add(t1, t3);
    return vec_extract(t1, 0) + vec_extract(t1, 1);
}

} // namespace gmx
#endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_DOUBLE_H