/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_AVX_256_SIMD_DOUBLE_H
#define GMX_SIMD_IMPL_X86_AVX_256_SIMD_DOUBLE_H

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <immintrin.h>

#include "gromacs/math/utilities.h"

#include "impl_x86_avx_256_simd_float.h"

namespace gmx
{

class SimdDouble
{
public:
    SimdDouble() {}

    SimdDouble(double d) : simdInternal_(_mm256_set1_pd(d)) {}

    // Internal utility constructor to simplify return statements
    SimdDouble(__m256d simd) : simdInternal_(simd) {}

    __m256d simdInternal_;
};

class SimdDInt32
{
public:
    SimdDInt32() {}

    SimdDInt32(std::int32_t i) : simdInternal_(_mm_set1_epi32(i)) {}

    // Internal utility constructor to simplify return statements
    SimdDInt32(__m128i simd) : simdInternal_(simd) {}

    __m128i simdInternal_;
};

class SimdDBool
{
public:
    SimdDBool() {}

    SimdDBool(bool b) : simdInternal_(_mm256_castsi256_pd(_mm256_set1_epi32(b ? 0xFFFFFFFF : 0))) {}

    // Internal utility constructor to simplify return statements
    SimdDBool(__m256d simd) : simdInternal_(simd) {}

    __m256d simdInternal_;
};

class SimdDIBool
{
public:
    SimdDIBool() {}

    SimdDIBool(bool b) : simdInternal_(_mm_set1_epi32(b ? 0xFFFFFFFF : 0)) {}

    // Internal utility constructor to simplify return statements
    SimdDIBool(__m128i simd) : simdInternal_(simd) {}

    __m128i simdInternal_;
};

static inline SimdDouble gmx_simdcall simdLoad(const double* m, SimdDoubleTag /*unused*/ = {})
{
    assert(std::size_t(m) % 32 == 0);
    return { _mm256_load_pd(m) };
}

static inline void gmx_simdcall store(double* m, SimdDouble a)
{
    assert(std::size_t(m) % 32 == 0);
    _mm256_store_pd(m, a.simdInternal_);
}

static inline SimdDouble gmx_simdcall simdLoadU(const double* m, SimdDoubleTag /*unused*/ = {})
{
    return { _mm256_loadu_pd(m) };
}

static inline void gmx_simdcall storeU(double* m, SimdDouble a)
{
    _mm256_storeu_pd(m, a.simdInternal_);
}

static inline SimdDouble gmx_simdcall setZeroD()
{
    return { _mm256_setzero_pd() };
}

static inline SimdDInt32 gmx_simdcall simdLoad(const std::int32_t* m, SimdDInt32Tag /*unused*/)
{
    assert(std::size_t(m) % 16 == 0);
    return { _mm_load_si128(reinterpret_cast<const __m128i*>(m)) };
}

static inline void gmx_simdcall store(std::int32_t* m, SimdDInt32 a)
{
    assert(std::size_t(m) % 16 == 0);
    _mm_store_si128(reinterpret_cast<__m128i*>(m), a.simdInternal_);
}

static inline SimdDInt32 gmx_simdcall simdLoadU(const std::int32_t* m, SimdDInt32Tag /*unused*/)
{
    return { _mm_loadu_si128(reinterpret_cast<const __m128i*>(m)) };
}

static inline void gmx_simdcall storeU(std::int32_t* m, SimdDInt32 a)
{
    _mm_storeu_si128(reinterpret_cast<__m128i*>(m), a.simdInternal_);
}

static inline SimdDInt32 gmx_simdcall setZeroDI()
{
    return { _mm_setzero_si128() };
}

template<int index>
static inline std::int32_t gmx_simdcall extract(SimdDInt32 a)
{
    return _mm_extract_epi32(a.simdInternal_, index);
}

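// Usage sketch (illustrative comment only): with a SimdDInt32 v holding {10, 20, 30, 40},
// extract<2>(v) returns 30. The template parameter must be a compile-time constant in the
// range 0-3, as required by _mm_extract_epi32.
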
static inline SimdDouble gmx_simdcall operator&(SimdDouble a, SimdDouble b)
{
    return { _mm256_and_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall andNot(SimdDouble a, SimdDouble b)
{
    return { _mm256_andnot_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator|(SimdDouble a, SimdDouble b)
{
    return { _mm256_or_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator^(SimdDouble a, SimdDouble b)
{
    return { _mm256_xor_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator+(SimdDouble a, SimdDouble b)
{
    return { _mm256_add_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator-(SimdDouble a, SimdDouble b)
{
    return { _mm256_sub_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall operator-(SimdDouble x)
{
    return { _mm256_xor_pd(x.simdInternal_, _mm256_set1_pd(GMX_DOUBLE_NEGZERO)) };
}

static inline SimdDouble gmx_simdcall operator*(SimdDouble a, SimdDouble b)
{
    return { _mm256_mul_pd(a.simdInternal_, b.simdInternal_) };
}

// Override for AVX2 and higher
#if GMX_SIMD_X86_AVX_256
static inline SimdDouble gmx_simdcall fma(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm256_add_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_) };
}

static inline SimdDouble gmx_simdcall fms(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm256_sub_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_) };
}

static inline SimdDouble gmx_simdcall fnma(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm256_sub_pd(c.simdInternal_, _mm256_mul_pd(a.simdInternal_, b.simdInternal_)) };
}

static inline SimdDouble gmx_simdcall fnms(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return { _mm256_sub_pd(_mm256_setzero_pd(),
                           _mm256_add_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_)) };
}
#endif

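// Note: the four functions above emulate fused multiply-add with separate multiply and add
// instructions, since plain AVX lacks FMA. When the target supports AVX2, this block is
// skipped and the AVX2 implementation is expected to provide overrides built on the
// hardware FMA intrinsics instead.
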
static inline SimdDouble gmx_simdcall rsqrt(SimdDouble x)
{
    return { _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x.simdInternal_))) };
}

static inline SimdDouble gmx_simdcall rcp(SimdDouble x)
{
    return { _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(x.simdInternal_))) };
}

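// Note: rsqrt() and rcp() above go through the single-precision approximation instructions,
// so the result carries only roughly 11-12 bits of accuracy. Callers are assumed to refine
// it with Newton-Raphson iterations when full double precision is required.
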
static inline SimdDouble gmx_simdcall maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
{
    return { _mm256_add_pd(a.simdInternal_, _mm256_and_pd(b.simdInternal_, m.simdInternal_)) };
}

static inline SimdDouble gmx_simdcall maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
{
    return { _mm256_and_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), m.simdInternal_) };
}

static inline SimdDouble maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
{
    return { _mm256_and_pd(_mm256_add_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_),
                           m.simdInternal_) };
}

static inline SimdDouble maskzRsqrt(SimdDouble x, SimdDBool m)
{
    x.simdInternal_ = _mm256_blendv_pd(_mm256_set1_pd(1.0), x.simdInternal_, m.simdInternal_);

    return { _mm256_and_pd(_mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x.simdInternal_))), m.simdInternal_) };
}

static inline SimdDouble maskzRcp(SimdDouble x, SimdDBool m)
{
    x.simdInternal_ = _mm256_blendv_pd(_mm256_set1_pd(1.0), x.simdInternal_, m.simdInternal_);

    return { _mm256_and_pd(_mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(x.simdInternal_))), m.simdInternal_) };
}

static inline SimdDouble gmx_simdcall abs(SimdDouble x)
{
    return { _mm256_andnot_pd(_mm256_set1_pd(GMX_DOUBLE_NEGZERO), x.simdInternal_) };
}

static inline SimdDouble gmx_simdcall max(SimdDouble a, SimdDouble b)
{
    return { _mm256_max_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall min(SimdDouble a, SimdDouble b)
{
    return { _mm256_min_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDouble gmx_simdcall round(SimdDouble x)
{
    return { _mm256_round_pd(x.simdInternal_, _MM_FROUND_NINT) };
}

static inline SimdDouble gmx_simdcall trunc(SimdDouble x)
{
    return { _mm256_round_pd(x.simdInternal_, _MM_FROUND_TRUNC) };
}

// Override for AVX2 and higher
#if GMX_SIMD_X86_AVX_256
static inline SimdDouble frexp(SimdDouble value, SimdDInt32* exponent)
{
    const __m256d exponentMask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FF0000000000000LL));
    const __m256d mantissaMask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x800FFFFFFFFFFFFFLL));
    const __m256d half         = _mm256_set1_pd(0.5);
    const __m128i exponentBias = _mm_set1_epi32(1022); // add 1 to make our definition identical to frexp()
    __m256i       iExponent;
    __m128i       iExponentLow, iExponentHigh;

    iExponent     = _mm256_castpd_si256(_mm256_and_pd(value.simdInternal_, exponentMask));
    iExponentHigh = _mm256_extractf128_si256(iExponent, 0x1);
    iExponentLow  = _mm256_castsi256_si128(iExponent);
    iExponentLow  = _mm_srli_epi64(iExponentLow, 52);
    iExponentHigh = _mm_srli_epi64(iExponentHigh, 52);
    iExponentLow  = _mm_shuffle_epi32(iExponentLow, _MM_SHUFFLE(1, 1, 2, 0));
    iExponentHigh = _mm_shuffle_epi32(iExponentHigh, _MM_SHUFFLE(2, 0, 1, 1));
    iExponentLow  = _mm_or_si128(iExponentLow, iExponentHigh);
    exponent->simdInternal_ = _mm_sub_epi32(iExponentLow, exponentBias);

    return { _mm256_or_pd(_mm256_and_pd(value.simdInternal_, mantissaMask), half) };
}

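// Worked example (illustrative comment only): for the value 8.0 the stored exponent field
// is 1026, so after subtracting the 1022 bias the returned exponent is 4 and the returned
// fraction is 0.5, matching std::frexp (8.0 == 0.5 * 2^4).
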
template<MathOptimization opt = MathOptimization::Safe>
static inline SimdDouble ldexp(SimdDouble value, SimdDInt32 exponent)
{
    const __m128i exponentBias = _mm_set1_epi32(1023);
    __m128i       iExponentLow, iExponentHigh;
    __m256d       fExponent;

    iExponentLow = _mm_add_epi32(exponent.simdInternal_, exponentBias);

    if (opt == MathOptimization::Safe)
    {
        // Make sure biased argument is not negative
        iExponentLow = _mm_max_epi32(iExponentLow, _mm_setzero_si128());
    }

    iExponentHigh = _mm_shuffle_epi32(iExponentLow, _MM_SHUFFLE(3, 3, 2, 2));
    iExponentLow  = _mm_shuffle_epi32(iExponentLow, _MM_SHUFFLE(1, 1, 0, 0));
    iExponentHigh = _mm_slli_epi64(iExponentHigh, 52);
    iExponentLow  = _mm_slli_epi64(iExponentLow, 52);
    fExponent     = _mm256_castsi256_pd(
            _mm256_insertf128_si256(_mm256_castsi128_si256(iExponentLow), iExponentHigh, 0x1));
    return { _mm256_mul_pd(value.simdInternal_, fExponent) };
}
#endif

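// Worked example (illustrative comment only): ldexp(SimdDouble(0.5), SimdDInt32(4)) yields
// 0.5 * 2^4 = 8.0 in every lane, i.e. the inverse of frexp() above. The shuffles spread the
// four 32-bit biased exponents so each 64-bit lane holds its own value, and the 64-bit shift
// moves it into the IEEE exponent field, producing the scale factor 2^exponent that is then
// multiplied onto value.
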
static inline double gmx_simdcall reduce(SimdDouble a)
{
    __m128d a0, a1;

    a.simdInternal_ = _mm256_add_pd(a.simdInternal_, _mm256_permute_pd(a.simdInternal_, 0b0101));
    a0              = _mm256_castpd256_pd128(a.simdInternal_);
    a1              = _mm256_extractf128_pd(a.simdInternal_, 0x1);
    a0              = _mm_add_sd(a0, a1);

    return *reinterpret_cast<double*>(&a0);
}

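// Usage sketch (illustrative comment only): reduce(SimdDouble(1.5)) returns 6.0, since all
// four lanes hold 1.5. The permute adds neighbouring lanes within each 128-bit half, and the
// final _mm_add_sd combines the two halves into a single scalar sum.
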
static inline SimdDBool gmx_simdcall operator==(SimdDouble a, SimdDouble b)
{
    return { _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ) };
}

static inline SimdDBool gmx_simdcall operator!=(SimdDouble a, SimdDouble b)
{
    return { _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_NEQ_OQ) };
}

static inline SimdDBool gmx_simdcall operator<(SimdDouble a, SimdDouble b)
{
    return { _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_LT_OQ) };
}

static inline SimdDBool gmx_simdcall operator<=(SimdDouble a, SimdDouble b)
{
    return { _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_LE_OQ) };
}

// Override for AVX2 and higher
#if GMX_SIMD_X86_AVX_256
static inline SimdDBool gmx_simdcall testBits(SimdDouble a)
{
    // Do an or of the low/high 32 bits of each double (so the data is replicated),
    // and then use the same algorithm as we use for single precision.
    __m256 tst = _mm256_castpd_ps(a.simdInternal_);

    tst = _mm256_or_ps(tst, _mm256_permute_ps(tst, _MM_SHUFFLE(2, 3, 0, 1)));
    tst = _mm256_cvtepi32_ps(_mm256_castps_si256(tst));

    return { _mm256_castps_pd(_mm256_cmp_ps(tst, _mm256_setzero_ps(), _CMP_NEQ_OQ)) };
}
#endif

static inline SimdDBool gmx_simdcall operator&&(SimdDBool a, SimdDBool b)
{
    return { _mm256_and_pd(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDBool gmx_simdcall operator||(SimdDBool a, SimdDBool b)
{
    return { _mm256_or_pd(a.simdInternal_, b.simdInternal_) };
}

static inline bool gmx_simdcall anyTrue(SimdDBool a)
{
    return _mm256_movemask_pd(a.simdInternal_) != 0;
}

static inline SimdDouble gmx_simdcall selectByMask(SimdDouble a, SimdDBool mask)
{
    return { _mm256_and_pd(a.simdInternal_, mask.simdInternal_) };
}

static inline SimdDouble gmx_simdcall selectByNotMask(SimdDouble a, SimdDBool mask)
{
    return { _mm256_andnot_pd(mask.simdInternal_, a.simdInternal_) };
}

static inline SimdDouble gmx_simdcall blend(SimdDouble a, SimdDouble b, SimdDBool sel)
{
    return { _mm256_blendv_pd(a.simdInternal_, b.simdInternal_, sel.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator&(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_and_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall andNot(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_andnot_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator|(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_or_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator^(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_xor_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator+(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_add_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator-(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_sub_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall operator*(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_mullo_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall operator==(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_cmpeq_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall operator<(SimdDInt32 a, SimdDInt32 b)
{
    return { _mm_cmplt_epi32(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall testBits(SimdDInt32 a)
{
    __m128i x   = a.simdInternal_;
    __m128i res = _mm_andnot_si128(_mm_cmpeq_epi32(x, _mm_setzero_si128()), _mm_cmpeq_epi32(x, x));

    return { res };
}

static inline SimdDIBool gmx_simdcall operator&&(SimdDIBool a, SimdDIBool b)
{
    return { _mm_and_si128(a.simdInternal_, b.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall operator||(SimdDIBool a, SimdDIBool b)
{
    return { _mm_or_si128(a.simdInternal_, b.simdInternal_) };
}

static inline bool gmx_simdcall anyTrue(SimdDIBool a)
{
    return _mm_movemask_epi8(a.simdInternal_) != 0;
}

static inline SimdDInt32 gmx_simdcall selectByMask(SimdDInt32 a, SimdDIBool mask)
{
    return { _mm_and_si128(a.simdInternal_, mask.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall selectByNotMask(SimdDInt32 a, SimdDIBool mask)
{
    return { _mm_andnot_si128(mask.simdInternal_, a.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
{
    return { _mm_blendv_epi8(a.simdInternal_, b.simdInternal_, sel.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall cvtR2I(SimdDouble a)
{
    return { _mm256_cvtpd_epi32(a.simdInternal_) };
}

static inline SimdDInt32 gmx_simdcall cvttR2I(SimdDouble a)
{
    return { _mm256_cvttpd_epi32(a.simdInternal_) };
}

static inline SimdDouble gmx_simdcall cvtI2R(SimdDInt32 a)
{
    return { _mm256_cvtepi32_pd(a.simdInternal_) };
}

static inline SimdDIBool gmx_simdcall cvtB2IB(SimdDBool a)
{
    __m128i a1 = _mm256_extractf128_si256(_mm256_castpd_si256(a.simdInternal_), 0x1);
    __m128i a0 = _mm256_castsi256_si128(_mm256_castpd_si256(a.simdInternal_));
    a0         = _mm_shuffle_epi32(a0, _MM_SHUFFLE(2, 0, 2, 0));
    a1         = _mm_shuffle_epi32(a1, _MM_SHUFFLE(2, 0, 2, 0));

    return { _mm_blend_epi16(a0, a1, 0xF0) };
}

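// Note: each 64-bit double mask above is narrowed to a 32-bit integer mask by the two
// shuffles (which pick elements 0 and 2 of each 128-bit half) and the blend, so lane i of
// the result is all-ones exactly when lane i of the input mask was true.
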
static inline SimdDBool gmx_simdcall cvtIB2B(SimdDIBool a)
{
    __m128d lo = _mm_castsi128_pd(_mm_unpacklo_epi32(a.simdInternal_, a.simdInternal_));
    __m128d hi = _mm_castsi128_pd(_mm_unpackhi_epi32(a.simdInternal_, a.simdInternal_));

    return { _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1) };
}

static inline void gmx_simdcall cvtF2DD(SimdFloat f, SimdDouble* d0, SimdDouble* d1)
{
    d0->simdInternal_ = _mm256_cvtps_pd(_mm256_castps256_ps128(f.simdInternal_));
    d1->simdInternal_ = _mm256_cvtps_pd(_mm256_extractf128_ps(f.simdInternal_, 0x1));
}

static inline SimdFloat gmx_simdcall cvtDD2F(SimdDouble d0, SimdDouble d1)
{
    __m128 f0 = _mm256_cvtpd_ps(d0.simdInternal_);
    __m128 f1 = _mm256_cvtpd_ps(d1.simdInternal_);
    return { _mm256_insertf128_ps(_mm256_castps128_ps256(f0), f1, 0x1) };
}

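// Usage sketch (illustrative comment only): cvtDD2F(d0, d1) packs two 4-wide double vectors
// into one 8-wide float vector, with d0 in the low half and d1 in the high half; cvtF2DD
// performs the opposite split, so the pair round-trips apart from the precision lost in the
// double-to-float conversion.
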
} // namespace gmx

#endif // GMX_SIMD_IMPL_X86_AVX_256_SIMD_DOUBLE_H