fiz_S2 = fiz_S2 + tz_S2;
/* Decrement j atom force */
- decr3Hsimd<STRIDE>(f + aj * DIM, tx_S0 + tx_S2, ty_S0 + ty_S2, tz_S0 + tz_S2);
+ decr3Hsimd(f + aj * DIM, tx_S0 + tx_S2, ty_S0 + ty_S2, tz_S0 + tz_S2);
}
#undef rinv_ex_S0
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 0
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 0
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 0
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 0
#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE 0
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
// GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE is conditionally defined further down
-#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
+#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0 // No need for half-simd, width is 2
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 0
//! \brief 1 if float half-register load/store/reduce utils present, otherwise 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
-/*! \brief 1 if implementation provides single decr3Hsimd()
- *
- * Only used in simd.h to selectively override the generic implementation.
- */
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
-
//! \brief 1 if double half-register load/store/reduce utils present, otherwise 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 1
-/*! \brief 1 if implementation provides double decr3Hsimd()
- *
- * Only used in simd.h to selectively override the generic implementation.
- */
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
-
#ifdef GMX_SIMD_REF_FLOAT_WIDTH
# define GMX_SIMD_FLOAT_WIDTH GMX_SIMD_REF_FLOAT_WIDTH
#else
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2017,2019, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
}
}
-/*! \brief Add the two halves of a SIMD double, subtract the sum from
- * half-SIMD-width consecutive doubles in memory.
+/*! \brief Add the two halves of three SIMD doubles, subtract the sum from
+ * three half-SIMD-width consecutive doubles in memory.
*
* \param m half-width aligned memory, from which sum of the halves will be subtracted.
- * \param a SIMD variable. Upper & lower halves will first be added.
+ * \param a0 SIMD variable. Upper & lower halves will first be added.
+ * \param a1 SIMD variable. Upper & lower halves will second be added.
+ * \param a2 SIMD variable. Upper & lower halves will third be added.
*
- * If the SIMD width is 8 and contains [a b c d e f g h], the
- * memory will be modified to [m[0]-(a+e) m[1]-(b+f) m[2]-(c+g) m[3]-(d+h)].
+ * If the SIMD width is 8 and the vectors contain [a0 b0 c0 d0 e0 f0 g0 h0],
+ * [a1 b1 c1 d1 e1 f1 g1 g1] and [a2 b2 c2 d2 e2 f2 g2 h2], the
+ * memory will be modified to [m[0]-(a0+e0) m[1]-(b0+f0) m[2]-(c0+g0) m[3]-(d0+h0)
+ * m[4]-(a1+e1) m[5]-(b1+f1) m[6]-(c1+g1) m[7]-(d1+h1)
+ * m[8]-(a2+e2) m[9]-(b2+f2) m[10]-(c2+g2) m[11]-(d2+h2)].
*
* The memory must be aligned to half SIMD width.
*
* Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
*/
-static inline void gmx_simdcall decrHsimd(double* m, SimdDouble a)
+static inline void gmx_simdcall decr3Hsimd(double* m, SimdDouble a0, SimdDouble a1, SimdDouble a2)
{
- // Make sure the memory pointer is aligned to half double SIMD width
assert(std::size_t(m) % (GMX_SIMD_DOUBLE_WIDTH / 2 * sizeof(double)) == 0);
-
- for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
+ for (std::size_t i = 0; i < a0.simdInternal_.size() / 2; i++)
+ {
+ m[i] -= a0.simdInternal_[i] + a0.simdInternal_[a0.simdInternal_.size() / 2 + i];
+ }
+ for (std::size_t i = 0; i < a1.simdInternal_.size() / 2; i++)
+ {
+ m[a1.simdInternal_.size() / 2 + i] -=
+ a1.simdInternal_[i] + a1.simdInternal_[a1.simdInternal_.size() / 2 + i];
+ }
+ for (std::size_t i = 0; i < a2.simdInternal_.size() / 2; i++)
{
- m[i] -= a.simdInternal_[i] + a.simdInternal_[a.simdInternal_.size() / 2 + i];
+ m[a2.simdInternal_.size() + i] -=
+ a2.simdInternal_[i] + a2.simdInternal_[a2.simdInternal_.size() / 2 + i];
}
}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2017,2019, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
}
}
-/*! \brief Add the two halves of a SIMD float, subtract the sum from
- * half-SIMD-width consecutive floats in memory.
+/*! \brief Add the two halves of three SIMD floats, subtract the sum from
+ * three half-SIMD-width consecutive floats in memory.
*
* \param m half-width aligned memory, from which sum of the halves will be subtracted.
- * \param a SIMD variable. Upper & lower halves will first be added.
+ * \param a0 SIMD variable. Upper & lower halves will first be added.
+ * \param a1 SIMD variable. Upper & lower halves will second be added.
+ * \param a2 SIMD variable. Upper & lower halves will third be added.
*
- * If the SIMD width is 8 and contains [a b c d e f g h], the
- * memory will be modified to [m[0]-(a+e) m[1]-(b+f) m[2]-(c+g) m[3]-(d+h)].
+ * If the SIMD width is 8 and the vectors contain [a0 b0 c0 d0 e0 f0 g0 h0],
+ * [a1 b1 c1 d1 e1 f1 g1 g1] and [a2 b2 c2 d2 e2 f2 g2 h2], the
+ * memory will be modified to [m[0]-(a0+e0) m[1]-(b0+f0) m[2]-(c0+g0) m[3]-(d0+h0)
+ * m[4]-(a1+e1) m[5]-(b1+f1) m[6]-(c1+g1) m[7]-(d1+h1)
+ * m[8]-(a2+e2) m[9]-(b2+f2) m[10]-(c2+g2) m[11]-(d2+h2)].
*
* The memory must be aligned to half SIMD width.
*
* Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
*/
-static inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
+static inline void gmx_simdcall decr3Hsimd(float* m, SimdFloat a0, SimdFloat a1, SimdFloat a2)
{
- // Make sure the memory pointer is aligned to half float SIMD width
assert(std::size_t(m) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
-
- for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
+ for (std::size_t i = 0; i < a0.simdInternal_.size() / 2; i++)
+ {
+ m[i] -= a0.simdInternal_[i] + a0.simdInternal_[a0.simdInternal_.size() / 2 + i];
+ }
+ for (std::size_t i = 0; i < a1.simdInternal_.size() / 2; i++)
+ {
+ m[a1.simdInternal_.size() / 2 + i] -=
+ a1.simdInternal_[i] + a1.simdInternal_[a1.simdInternal_.size() / 2 + i];
+ }
+ for (std::size_t i = 0; i < a2.simdInternal_.size() / 2; i++)
{
- m[i] -= a.simdInternal_[i] + a.simdInternal_[a.simdInternal_.size() / 2 + i];
+ m[a2.simdInternal_.size() + i] -=
+ a2.simdInternal_[i] + a2.simdInternal_[a2.simdInternal_.size() / 2 + i];
}
}
#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE 0
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
+#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0 // No need for half-simd, width is 2
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0 // Not needed for width 4
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
+#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0 // No need for half-simd, width is 2
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 1 // Uses 256-bit avx for SIMD4-double
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0 // Not needed for width 4
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
#define GMX_SIMD4_HAVE_FLOAT 1
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
namespace gmx
{
+namespace
+{
+/* This is an internal helper function used by decr3Hsimd(...).
+ */
+inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
+{
+ assert(std::size_t(m) % 16 == 0);
+ __m128 asum = _mm_add_ps(_mm256_castps256_ps128(a.simdInternal_),
+ _mm256_extractf128_ps(a.simdInternal_, 0x1));
+ _mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum));
+}
+} // namespace
+
/* This is an internal helper function used by the three functions storing,
* incrementing, or decrementing data. Do NOT use it outside this file.
*
_mm_store_ps(m1, _mm_add_ps(_mm256_extractf128_ps(a.simdInternal_, 0x1), _mm_load_ps(m1)));
}
-static inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
+static inline void gmx_simdcall decr3Hsimd(float* m, SimdFloat a0, SimdFloat a1, SimdFloat a2)
{
assert(std::size_t(m) % 16 == 0);
- __m128 asum = _mm_add_ps(_mm256_castps256_ps128(a.simdInternal_),
- _mm256_extractf128_ps(a.simdInternal_, 0x1));
- _mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum));
+ decrHsimd(m, a0);
+ decrHsimd(m + GMX_SIMD_FLOAT_WIDTH / 2, a1);
+ decrHsimd(m + GMX_SIMD_FLOAT_WIDTH, a2);
}
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 1
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2014-2018, The GROMACS development team.
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
{
// Nothing to do. Termination of recursion.
}
+
+/* This is an internal helper function used by decr3Hsimd(...).
+ */
+inline void gmx_simdcall decrHsimd(double* m, SimdDouble a)
+{
+ __m256d t;
+
+ assert(std::size_t(m) % 32 == 0);
+
+ a.simdInternal_ = _mm512_add_pd(a.simdInternal_,
+ _mm512_shuffle_f64x2(a.simdInternal_, a.simdInternal_, 0xEE));
+ t = _mm256_load_pd(m);
+ t = _mm256_sub_pd(t, _mm512_castpd512_pd256(a.simdInternal_));
+ _mm256_store_pd(m, t);
+}
} // namespace
_mm256_store_pd(m1, x);
}
-static inline void gmx_simdcall decrHsimd(double* m, SimdDouble a)
+static inline void gmx_simdcall decr3Hsimd(double* m, SimdDouble a0, SimdDouble a1, SimdDouble a2)
{
- __m256d t;
-
- assert(std::size_t(m) % 32 == 0);
-
- a.simdInternal_ = _mm512_add_pd(a.simdInternal_,
- _mm512_shuffle_f64x2(a.simdInternal_, a.simdInternal_, 0xEE));
- t = _mm256_load_pd(m);
- t = _mm256_sub_pd(t, _mm512_castpd512_pd256(a.simdInternal_));
- _mm256_store_pd(m, t);
+ decrHsimd(m, a0);
+ decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH / 2, a1);
+ decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH, a2);
}
-
template<int align>
static inline void gmx_simdcall gatherLoadTransposeHsimd(const double* base0,
const double* base1,
{
// Nothing to do. Termination of recursion.
}
+
+/* This is an internal helper function used by decr3Hsimd(...).
+ */
+inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
+{
+ __m256 t;
+
+ assert(std::size_t(m) % 32 == 0);
+
+ a.simdInternal_ = _mm512_add_ps(a.simdInternal_,
+ _mm512_shuffle_f32x4(a.simdInternal_, a.simdInternal_, 0xEE));
+ t = _mm256_load_ps(m);
+ t = _mm256_sub_ps(t, _mm512_castps512_ps256(a.simdInternal_));
+ _mm256_store_ps(m, t);
+}
} // namespace
template<int align, typename... Targs>
_mm256_store_ps(m1, x);
}
-static inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
+static inline void gmx_simdcall decr3Hsimd(float* m, SimdFloat a0, SimdFloat a1, SimdFloat a2)
{
- __m256 t;
-
- assert(std::size_t(m) % 32 == 0);
-
- a.simdInternal_ = _mm512_add_ps(a.simdInternal_,
- _mm512_shuffle_f32x4(a.simdInternal_, a.simdInternal_, 0xEE));
- t = _mm256_load_ps(m);
- t = _mm256_sub_ps(t, _mm512_castps512_ps256(a.simdInternal_));
- _mm256_store_ps(m, t);
+ decrHsimd(m, a0);
+ decrHsimd(m + GMX_SIMD_FLOAT_WIDTH / 2, a1);
+ decrHsimd(m + GMX_SIMD_FLOAT_WIDTH, a2);
}
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 1
namespace gmx
{
+namespace
+{
+/* This is an internal helper function used by decr3Hsimd(...).
+ */
+inline void gmx_simdcall decrHsimd(double* m, SimdDouble a)
+{
+ __m512d t;
+
+ assert(std::size_t(m) % 32 == 0);
+
+ t = _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+ a.simdInternal_ = _mm512_add_pd(
+ a.simdInternal_,
+ _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(a.simdInternal_), _MM_PERM_BADC)));
+ t = _mm512_sub_pd(t, a.simdInternal_);
+ _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0x0F), t);
+}
+} // namespace
+
// On MIC it is better to use scatter operations, so we define the load routines
// that use a SIMD offset variable first.
_mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), x);
}
-static inline void gmx_simdcall decrHsimd(double* m, SimdDouble a)
+static inline void gmx_simdcall decr3Hsimd(double* m, SimdDouble a0, SimdDouble a1, SimdDouble a2)
{
- __m512d t;
-
assert(std::size_t(m) % 32 == 0);
-
- t = _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
- a.simdInternal_ = _mm512_add_pd(
- a.simdInternal_,
- _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(a.simdInternal_), _MM_PERM_BADC)));
- t = _mm512_sub_pd(t, a.simdInternal_);
- _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0x0F), t);
+ decrHsimd(m, a0);
+ decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH / 2, a1);
+ decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH, a2);
}
namespace gmx
{
+namespace
+{
+/* This is an internal helper function used by decr3Hsimd(...).
+ */
+inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
+{
+ __m512 t;
+
+ assert(std::size_t(m) % 32 == 0);
+
+ t = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m), _MM_UPCONV_PD_NONE,
+ _MM_BROADCAST_4X8, _MM_HINT_NONE));
+ a = _mm512_add_ps(a.simdInternal_, _mm512_permute4f128_ps(a.simdInternal_, _MM_PERM_BADC));
+ t = _mm512_sub_ps(t, a.simdInternal_);
+ _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0x00FF), t);
+}
+} // namespace
+
// On MIC it is better to use scatter operations, so we define the load routines
// that use a SIMD offset variable first.
_mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), x);
}
-static inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
+static inline void gmx_simdcall decr3Hsimd(float* m, SimdFloat a0, SimdFloat a1, SimdFloat a2)
{
- __m512 t;
-
assert(std::size_t(m) % 32 == 0);
-
- t = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m), _MM_UPCONV_PD_NONE,
- _MM_BROADCAST_4X8, _MM_HINT_NONE));
- a = _mm512_add_ps(a.simdInternal_, _mm512_permute4f128_ps(a.simdInternal_, _MM_PERM_BADC));
- t = _mm512_sub_ps(t, a.simdInternal_);
- _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0x00FF), t);
+ decrHsimd(m, a0);
+ decrHsimd(m + GMX_SIMD_FLOAT_WIDTH / 2, a1);
+ decrHsimd(m + GMX_SIMD_FLOAT_WIDTH, a2);
}
#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE 0
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
+#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0 // No need for half-simd, width is 2
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 0
#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE 0
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT 0
+#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0 // No need for half-simd, width is 2
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE 0
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 0
# define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_REAL \
GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE
# define GMX_SIMD_HAVE_HSIMD_UTIL_REAL GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE
-# define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_REAL GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE
# define GMX_SIMD4_HAVE_REAL GMX_SIMD4_HAVE_DOUBLE
#else // GMX_DOUBLE
*/
# define GMX_SIMD_HAVE_HSIMD_UTIL_REAL GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT
-/*! \brief 1 if a native decr3Hsimd() implementation is available, otherwise 0
- *
- * \ref GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_DOUBLE if GMX_DOUBLE is 1, otherwise
- * \ref GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT.
- */
-# define GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_REAL GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_FLOAT
-
/*! \brief 1 if Simd4Real is available, otherwise 0.
*
* \ref GMX_SIMD4_HAVE_DOUBLE if GMX_DOUBLE is 1, otherwise \ref GMX_SIMD4_HAVE_FLOAT.
# define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 0
#endif
-#if GMX_SIMD_HAVE_HSIMD_UTIL_REAL && !GMX_SIMD_HAVE_HSIMD_UTIL_DECR3_REAL
-template<int stride>
-static inline void gmx_simdcall decr3Hsimd(real* m, SimdReal r0, SimdReal r1, SimdReal r2)
-{
- decrHsimd(m, r0);
- decrHsimd(m + stride, r1);
- decrHsimd(m + 2 * stride, r2);
-}
-#endif
-
#if GMX_DOUBLE
# define GMX_SIMD_HAVE_4NSIMD_UTIL_REAL GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
#else
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2015,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2015,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
}
}
-TEST_F(SimdFloatingpointUtilTest, decrHsimd)
+TEST_F(SimdFloatingpointUtilTest, decr3Hsimd)
{
- SimdReal v0;
- real ref[GMX_SIMD_REAL_WIDTH / 2];
- int i;
+ SimdReal v0, v1, v2;
+ real ref[3 * GMX_SIMD_REAL_WIDTH / 2];
+ int i, j;
FloatingPointTolerance tolerance(defaultRealTolerance());
// Point p to the upper half of val1_
{
ref[i] = val0_[i] - (val1_[i] + p[i]);
}
+ p = val2_ + GMX_SIMD_REAL_WIDTH / 2;
+ for (j = 0; j < GMX_SIMD_REAL_WIDTH / 2; i++, j++)
+ {
+ ref[i] = val0_[i] - (val2_[j] + p[j]);
+ }
+ p = val3_ + GMX_SIMD_REAL_WIDTH / 2;
+ for (j = 0; j < GMX_SIMD_REAL_WIDTH / 2; i++, j++)
+ {
+ ref[i] = val0_[i] - (val3_[j] + p[j]);
+ }
v0 = load<SimdReal>(val1_);
- decrHsimd(val0_, v0);
+ v1 = load<SimdReal>(val2_);
+ v2 = load<SimdReal>(val3_);
+ decr3Hsimd(val0_, v0, v1, v2);
- for (i = 0; i < GMX_SIMD_REAL_WIDTH / 2; i++)
+ for (i = 0; i < 3 * GMX_SIMD_REAL_WIDTH / 2; i++)
{
EXPECT_REAL_EQ_TOL(ref[i], val0_[i], tolerance);
}