/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2019, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
static inline Simd4Float gmx_simdcall load4(const float* m)
{
assert(size_t(m) % 16 == 0);
- return { _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m,
- _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE) };
+ return { _mm512_mask_extload_ps(
+ _mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE) };
}
static inline void gmx_simdcall store4(float* m, Simd4Float a)
{
return { _mm512_mask_loadunpackhi_ps(
_mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m),
- _mm512_int2mask(0xF), m + 16) };
+ _mm512_int2mask(0xF),
+ m + 16) };
}
static inline void gmx_simdcall store4U(float* m, Simd4Float a)
static inline Simd4Float gmx_simdcall operator&(Simd4Float a, Simd4Float b)
{
- return { _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+ return { _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(),
+ _mm512_int2mask(0xF),
_mm512_castps_si512(a.simdInternal_),
_mm512_castps_si512(b.simdInternal_))) };
}
static inline Simd4Float gmx_simdcall andNot(Simd4Float a, Simd4Float b)
{
- return { _mm512_castsi512_ps(_mm512_mask_andnot_epi32(
- _mm512_undefined_epi32(), _mm512_int2mask(0xF), _mm512_castps_si512(a.simdInternal_),
- _mm512_castps_si512(b.simdInternal_))) };
+ return { _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(),
+ _mm512_int2mask(0xF),
+ _mm512_castps_si512(a.simdInternal_),
+ _mm512_castps_si512(b.simdInternal_))) };
}
static inline Simd4Float gmx_simdcall operator|(Simd4Float a, Simd4Float b)
{
- return { _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+ return { _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(),
+ _mm512_int2mask(0xF),
_mm512_castps_si512(a.simdInternal_),
_mm512_castps_si512(b.simdInternal_))) };
}
static inline Simd4Float gmx_simdcall operator^(Simd4Float a, Simd4Float b)
{
- return { _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+ return { _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(),
+ _mm512_int2mask(0xF),
_mm512_castps_si512(a.simdInternal_),
_mm512_castps_si512(b.simdInternal_))) };
}
static inline Simd4Float gmx_simdcall operator+(Simd4Float a, Simd4Float b)
{
- return { _mm512_mask_add_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_,
- b.simdInternal_) };
+ return { _mm512_mask_add_ps(
+ _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
}
static inline Simd4Float gmx_simdcall operator-(Simd4Float a, Simd4Float b)
{
- return { _mm512_mask_sub_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_,
- b.simdInternal_) };
+ return { _mm512_mask_sub_ps(
+ _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
}
static inline Simd4Float gmx_simdcall operator-(Simd4Float x)
{
- return { _mm512_mask_addn_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_,
- _mm512_setzero_ps()) };
+ return { _mm512_mask_addn_ps(
+ _mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_, _mm512_setzero_ps()) };
}
static inline Simd4Float gmx_simdcall operator*(Simd4Float a, Simd4Float b)
{
- return { _mm512_mask_mul_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_,
- b.simdInternal_) };
+ return { _mm512_mask_mul_ps(
+ _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
}
static inline Simd4Float gmx_simdcall fma(Simd4Float a, Simd4Float b, Simd4Float c)
static inline Simd4Float gmx_simdcall abs(Simd4Float x)
{
- return { _mm512_castsi512_ps(_mm512_mask_andnot_epi32(
- _mm512_undefined_epi32(), _mm512_int2mask(0xF),
- _mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)), _mm512_castps_si512(x.simdInternal_))) };
+ return { _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(),
+ _mm512_int2mask(0xF),
+ _mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)),
+ _mm512_castps_si512(x.simdInternal_))) };
}
static inline Simd4Float gmx_simdcall max(Simd4Float a, Simd4Float b)
{
- return { _mm512_mask_gmax_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_,
- b.simdInternal_) };
+ return { _mm512_mask_gmax_ps(
+ _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
}
static inline Simd4Float gmx_simdcall min(Simd4Float a, Simd4Float b)
{
- return { _mm512_mask_gmin_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_,
- b.simdInternal_) };
+ return { _mm512_mask_gmin_ps(
+ _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
}
static inline Simd4Float gmx_simdcall round(Simd4Float x)
{
- return { _mm512_mask_round_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_,
- _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
+ return { _mm512_mask_round_ps(
+ _mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
}
static inline Simd4Float gmx_simdcall trunc(Simd4Float x)
{
- return { _mm512_mask_round_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_,
- _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
+ return { _mm512_mask_round_ps(
+ _mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
}
static inline float gmx_simdcall dotProduct(Simd4Float a, Simd4Float b)
{
- __m512 x = _mm512_mask_mul_ps(_mm512_setzero_ps(), _mm512_int2mask(0x7), a.simdInternal_,
- b.simdInternal_);
- x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_BADC));
- x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
+ __m512 x = _mm512_mask_mul_ps(
+ _mm512_setzero_ps(), _mm512_int2mask(0x7), a.simdInternal_, b.simdInternal_);
+ x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_BADC));
+ x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
float f;
_mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), x);
return f;
static inline void gmx_simdcall transpose(Simd4Float* v0, Simd4Float* v1, Simd4Float* v2, Simd4Float* v3)
{
- v0->simdInternal_ = _mm512_mask_permute4f128_ps(v0->simdInternal_, _mm512_int2mask(0x00F0),
- v1->simdInternal_, _MM_PERM_AAAA);
- v2->simdInternal_ = _mm512_mask_permute4f128_ps(v2->simdInternal_, _mm512_int2mask(0x00F0),
- v3->simdInternal_, _MM_PERM_AAAA);
- v0->simdInternal_ = _mm512_mask_permute4f128_ps(v0->simdInternal_, _mm512_int2mask(0xFF00),
- v2->simdInternal_, _MM_PERM_BABA);
+ v0->simdInternal_ = _mm512_mask_permute4f128_ps(
+ v0->simdInternal_, _mm512_int2mask(0x00F0), v1->simdInternal_, _MM_PERM_AAAA);
+ v2->simdInternal_ = _mm512_mask_permute4f128_ps(
+ v2->simdInternal_, _mm512_int2mask(0x00F0), v3->simdInternal_, _MM_PERM_AAAA);
+ v0->simdInternal_ = _mm512_mask_permute4f128_ps(
+ v0->simdInternal_, _mm512_int2mask(0xFF00), v2->simdInternal_, _MM_PERM_BABA);
v0->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(
_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0),
_mm512_castps_si512(v0->simdInternal_)));
- v1->simdInternal_ = _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F),
- v0->simdInternal_, _MM_PERM_BBBB);
- v2->simdInternal_ = _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F),
- v0->simdInternal_, _MM_PERM_CCCC);
- v3->simdInternal_ = _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F),
- v0->simdInternal_, _MM_PERM_DDDD);
+ v1->simdInternal_ = _mm512_mask_permute4f128_ps(
+ _mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_BBBB);
+ v2->simdInternal_ = _mm512_mask_permute4f128_ps(
+ _mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_CCCC);
+ v3->simdInternal_ = _mm512_mask_permute4f128_ps(
+ _mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_DDDD);
}
// Picky, picky, picky: