/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2019,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
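/*! \internal \file
 * \brief SSE2 double-precision implementation of the higher-level GROMACS
 *        SIMD utilities: transposed gather/scatter loads and stores,
 *        scalar-to-triplet expansion, and incrementing reductions.
 */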
#ifndef GMX_SIMD_IMPL_X86_SSE2_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_X86_SSE2_UTIL_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <emmintrin.h>

#include "impl_x86_sse2_simd_double.h"

namespace gmx
{
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
                                                    const std::int32_t offset[],
                                                    SimdDouble*        v0,
                                                    SimdDouble*        v1,
                                                    SimdDouble*        v2,
                                                    SimdDouble*        v3)
{
    __m128d t1, t2, t3, t4;

    assert(std::size_t(base + align * offset[0]) % 16 == 0);
    assert(std::size_t(base + align * offset[1]) % 16 == 0);

    t1                = _mm_load_pd(base + align * offset[0]);
    t2                = _mm_load_pd(base + align * offset[1]);
    t3                = _mm_load_pd(base + align * offset[0] + 2);
    t4                = _mm_load_pd(base + align * offset[1] + 2);
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
    v2->simdInternal_ = _mm_unpacklo_pd(t3, t4);
    v3->simdInternal_ = _mm_unpackhi_pd(t3, t4);
}
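/* Illustration of the unpack-based transpose above: with two packed entries
 * in memory,
 *     base + align*offset[0] -> { a0, a1, a2, a3 }
 *     base + align*offset[1] -> { b0, b1, b2, b3 }
 * _mm_unpacklo_pd/_mm_unpackhi_pd interleave the low and high lanes, so the
 * outputs become v0 = { a0, b0 }, v1 = { a1, b1 }, v2 = { a2, b2 },
 * v3 = { a3, b3 }, i.e. one SIMD register per field across both entries.
 */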
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2;

    assert(std::size_t(base + align * offset[0]) % 16 == 0);
    assert(std::size_t(base + align * offset[1]) % 16 == 0);

    t1                = _mm_load_pd(base + align * offset[0]);
    t2                = _mm_load_pd(base + align * offset[1]);
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
}
// Pair data is best packed with stride 2, matching the SIMD width of 2 doubles
static const int c_simdBestPairAlignmentDouble = 2;
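/* Usage sketch for the pair layout (illustrative only; the names `params`,
 * `offsets`, `c6` and `c12` are hypothetical, not part of the API):
 * \code
 *   // params stores {c6, c12} pairs with stride c_simdBestPairAlignmentDouble
 *   SimdDouble c6, c12;
 *   gatherLoadTranspose<c_simdBestPairAlignmentDouble>(params, offsets, &c6, &c12);
 * \endcode
 */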
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
                                                     const std::int32_t offset[],
                                                     SimdDouble*        v0,
                                                     SimdDouble*        v1,
                                                     SimdDouble*        v2)
{
    __m128d t1, t2, t3, t4;

    t1                = _mm_loadu_pd(base + align * offset[0]);
    t2                = _mm_loadu_pd(base + align * offset[1]);
    t3                = _mm_load_sd(base + align * offset[0] + 2);
    t4                = _mm_load_sd(base + align * offset[1] + 2);
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
    v2->simdInternal_ = _mm_unpacklo_pd(t3, t4);
}
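/* The third component above is loaded with a scalar _mm_load_sd rather than
 * as part of a wider load, so only the three doubles that actually belong to
 * each triplet are touched; this matters when a triplet sits at the very end
 * of an allocation.
 */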
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(double*            base,
                                                       const std::int32_t offset[],
                                                       SimdDouble         v0,
                                                       SimdDouble         v1,
                                                       SimdDouble         v2)
{
    __m128d t1, t2;

    t1 = _mm_unpacklo_pd(v0.simdInternal_, v1.simdInternal_);
    t2 = _mm_unpackhi_pd(v0.simdInternal_, v1.simdInternal_);
    _mm_storeu_pd(base + align * offset[0], t1);
    _mm_store_sd(base + align * offset[0] + 2, v2.simdInternal_);
    _mm_storeu_pd(base + align * offset[1], t2);
    _mm_storeh_pi(reinterpret_cast<__m64*>(base + align * offset[1] + 2), _mm_castpd_ps(v2.simdInternal_));
}
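/* Note the last store above: SSE2 has no direct "store the upper double"
 * instruction, so the register is reinterpreted as four floats and
 * _mm_storeh_pi writes its upper 64 bits, which are exactly the bits of the
 * second double in v2. This avoids an extra shuffle.
 */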
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t5 = _mm_unpacklo_pd(v0.simdInternal_, v1.simdInternal_);
    t6 = _mm_unpackhi_pd(v0.simdInternal_, v1.simdInternal_);
    t7 = _mm_unpackhi_pd(v2.simdInternal_, v2.simdInternal_);

    t1 = _mm_loadu_pd(base + align * offset[0]);
    t2 = _mm_load_sd(base + align * offset[0] + 2);
    t1 = _mm_add_pd(t1, t5);
    t2 = _mm_add_sd(t2, v2.simdInternal_);
    _mm_storeu_pd(base + align * offset[0], t1);
    _mm_store_sd(base + align * offset[0] + 2, t2);

    t3 = _mm_loadu_pd(base + align * offset[1]);
    t4 = _mm_load_sd(base + align * offset[1] + 2);
    t3 = _mm_add_pd(t3, t6);
    t4 = _mm_add_sd(t4, t7);
    _mm_storeu_pd(base + align * offset[1], t3);
    _mm_store_sd(base + align * offset[1] + 2, t4);
}
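/* Semantics of the increment scatter: for each of the two entries i,
 * base[align*offset[i] + 0..2] += { v0[i], v1[i], v2[i] }, i.e. the
 * transposed triplet is added to, rather than overwriting, memory - the
 * common pattern when accumulating per-atom forces.
 */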
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    // This implementation is identical to the increment version,
    // apart from using subtraction instead of addition.
    __m128d t1, t2, t3, t4, t5, t6, t7;

    t5 = _mm_unpacklo_pd(v0.simdInternal_, v1.simdInternal_);
    t6 = _mm_unpackhi_pd(v0.simdInternal_, v1.simdInternal_);
    t7 = _mm_unpackhi_pd(v2.simdInternal_, v2.simdInternal_);

    t1 = _mm_loadu_pd(base + align * offset[0]);
    t2 = _mm_load_sd(base + align * offset[0] + 2);
    t1 = _mm_sub_pd(t1, t5);
    t2 = _mm_sub_sd(t2, v2.simdInternal_);
    _mm_storeu_pd(base + align * offset[0], t1);
    _mm_store_sd(base + align * offset[0] + 2, t2);

    t3 = _mm_loadu_pd(base + align * offset[1]);
    t4 = _mm_load_sd(base + align * offset[1] + 2);
    t3 = _mm_sub_pd(t3, t6);
    t4 = _mm_sub_sd(t4, t7);
    _mm_storeu_pd(base + align * offset[1], t3);
    _mm_store_sd(base + align * offset[1] + 2, t4);
}
// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
                                                        SimdDouble* triplets0,
                                                        SimdDouble* triplets1,
                                                        SimdDouble* triplets2)
{
    triplets0->simdInternal_ =
            _mm_shuffle_pd(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE2(0, 0));
    triplets1->simdInternal_ =
            _mm_shuffle_pd(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE2(1, 0));
    triplets2->simdInternal_ =
            _mm_shuffle_pd(scalar.simdInternal_, scalar.simdInternal_, _MM_SHUFFLE2(1, 1));
}
#endif
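/* Example of the expansion above: for scalar = { a, b } the three outputs are
 * triplets0 = { a, a }, triplets1 = { a, b }, triplets2 = { b, b }, i.e. the
 * concatenated sequence a,a,a,b,b,b. This matches one scalar (such as a
 * charge) against each x,y,z triplet.
 */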
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
                                                             SimdDInt32    offset,
                                                             SimdDouble*   v0,
                                                             SimdDouble*   v1,
                                                             SimdDouble*   v2,
                                                             SimdDouble*   v3)
{
    __m128d t1, t2, t3, t4;

    // Use optimized bit-shift multiply for the most common alignments
    if (align == 4)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 3);
    }
    else if (align == 12)
    {
        // multiply by 3, then by 4
        offset.simdInternal_ =
                _mm_add_epi32(offset.simdInternal_, _mm_slli_epi32(offset.simdInternal_, 1));
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 2);
    }
    else if (align == 16)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 4);
    }

    if (align == 4 || align == 8 || align == 12 || align == 16)
    {
        assert(std::size_t(base + extract<0>(offset)) % 16 == 0);
        assert(std::size_t(base + extract<1>(offset)) % 16 == 0);

        t1 = _mm_load_pd(base + extract<0>(offset));
        t2 = _mm_load_pd(base + extract<1>(offset));
        t3 = _mm_load_pd(base + extract<0>(offset) + 2);
        t4 = _mm_load_pd(base + extract<1>(offset) + 2);
    }
    else
    {
        assert(std::size_t(base + align * extract<0>(offset)) % 16 == 0);
        assert(std::size_t(base + align * extract<1>(offset)) % 16 == 0);

        t1 = _mm_load_pd(base + align * extract<0>(offset));
        t2 = _mm_load_pd(base + align * extract<1>(offset));
        t3 = _mm_load_pd(base + align * extract<0>(offset) + 2);
        t4 = _mm_load_pd(base + align * extract<1>(offset) + 2);
    }
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
    v2->simdInternal_ = _mm_unpacklo_pd(t3, t4);
    v3->simdInternal_ = _mm_unpackhi_pd(t3, t4);
}
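/* Worked example of the bit-shift multiply above: for align == 12, the index
 * scaling offset * 12 is computed as (offset + (offset << 1)) << 2, i.e.
 * offset * 3 * 4, using only adds and shifts, because SSE2 lacks a 32-bit
 * integer multiply (_mm_mullo_epi32 only arrived with SSE4.1).
 */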
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2;

    // Use optimized bit-shift multiply for the most common alignments
    if (align == 2)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 1);
    }
    else if (align == 4)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 2);
    }
    else if (align == 6)
    {
        // multiply by 3, then by 2
        offset.simdInternal_ =
                _mm_add_epi32(offset.simdInternal_, _mm_slli_epi32(offset.simdInternal_, 1));
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 1);
    }
    else if (align == 8)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 3);
    }
    else if (align == 12)
    {
        // multiply by 3, then by 4
        offset.simdInternal_ =
                _mm_add_epi32(offset.simdInternal_, _mm_slli_epi32(offset.simdInternal_, 1));
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 2);
    }
    else if (align == 16)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 4);
    }

    if (align == 2 || align == 4 || align == 6 || align == 8 || align == 12 || align == 16)
    {
        assert(std::size_t(base + extract<0>(offset)) % 16 == 0);
        assert(std::size_t(base + extract<1>(offset)) % 16 == 0);

        t1 = _mm_load_pd(base + extract<0>(offset));
        t2 = _mm_load_pd(base + extract<1>(offset));
    }
    else
    {
        assert(std::size_t(base + align * extract<0>(offset)) % 16 == 0);
        assert(std::size_t(base + align * extract<1>(offset)) % 16 == 0);

        t1 = _mm_load_pd(base + align * extract<0>(offset));
        t2 = _mm_load_pd(base + align * extract<1>(offset));
    }
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
}
template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    __m128d t1, t2;

    // Use optimized bit-shift multiply for the most common alignments;
    // do nothing for align == 1.
    if (align == 2)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 1);
    }
    else if (align == 4)
    {
        offset.simdInternal_ = _mm_slli_epi32(offset.simdInternal_, 2);
    }

    if (align == 1 || align == 2 || align == 4)
    {
        t1 = _mm_loadu_pd(base + extract<0>(offset));
        t2 = _mm_loadu_pd(base + extract<1>(offset));
    }
    else
    {
        t1 = _mm_loadu_pd(base + align * extract<0>(offset));
        t2 = _mm_loadu_pd(base + align * extract<1>(offset));
    }
    v0->simdInternal_ = _mm_unpacklo_pd(t1, t2);
    v1->simdInternal_ = _mm_unpackhi_pd(t1, t2);
}
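/* Unlike the aligned variants above, no alignment assertions are needed here:
 * _mm_loadu_pd tolerates any address, and align == 1 (fully packed pairs)
 * requires no index scaling at all.
 */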
// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline double gmx_simdcall
reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
{
    __m128d t1, t2, t3, t4;

    t1 = _mm_unpacklo_pd(v0.simdInternal_, v1.simdInternal_);
    t2 = _mm_unpackhi_pd(v0.simdInternal_, v1.simdInternal_);
    t3 = _mm_unpacklo_pd(v2.simdInternal_, v3.simdInternal_);
    t4 = _mm_unpackhi_pd(v2.simdInternal_, v3.simdInternal_);

    t1 = _mm_add_pd(t1, t2);
    t3 = _mm_add_pd(t3, t4);

    assert(std::size_t(m) % 16 == 0);

    t2 = _mm_add_pd(t1, _mm_load_pd(m));
    t4 = _mm_add_pd(t3, _mm_load_pd(m + 2));
    _mm_store_pd(m, t2);
    _mm_store_pd(m + 2, t4);

    t1 = _mm_add_pd(t1, t3);

    t2 = _mm_add_sd(t1, _mm_shuffle_pd(t1, t1, _MM_SHUFFLE2(1, 1)));
    return *reinterpret_cast<double*>(&t2);
}
#endif
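/* Semantics of the reduction above: the horizontal sum of each of v0..v3 is
 * added to m[0..3] respectively, and the grand total over all four registers
 * is returned. A minimal usage sketch (the names `fshift`, `fx`..`fw` and
 * `energy` are hypothetical, not part of the API):
 * \code
 *   alignas(16) double fshift[4] = { 0.0, 0.0, 0.0, 0.0 };
 *   double energy = reduceIncr4ReturnSum(fshift, fx, fy, fz, fw);
 * \endcode
 */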
} // namespace gmx

#endif // GMX_SIMD_IMPL_X86_SSE2_UTIL_DOUBLE_H