/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
static gmx_inline void
gmx_load_hpr(gmx_mm_hpr *a, const real *b)
{
- *a = _mm512_loadunpacklo_ps(_mm512_undefined_ps(), b);
+ *a = _mm512_castpd_ps(_mm512_extload_pd((const double*)b, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
}
/* Set all entries in half-width SIMD register *a to b */
static gmx_inline void
gmx_loaddh_pr(gmx_simd_float_t *a, const real *b)
{
- *a = _mm512_permute4f128_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), b), PERM_LOW2HIGH);
+ *a = _mm512_castpd_ps(_mm512_extload_pd((const double*)b, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
}
/* Store half-width SIMD register b into half width aligned memory a */
static gmx_inline void
gmx_store_hpr(real *a, gmx_mm_hpr b)
{
+ assert((size_t)a%32 == 0);
_mm512_mask_packstorelo_ps(a, mask_loh, b);
}
static gmx_inline __m512
gmx_mm_transpose_sum4h_pr(gmx_simd_float_t a, gmx_simd_float_t b)
{
- return _mm512_setr4_ps(_mm512_mask_reduce_add_ps(mask_loh, a),
- _mm512_mask_reduce_add_ps(mask_hih, a),
- _mm512_mask_reduce_add_ps(mask_loh, b),
- _mm512_mask_reduce_add_ps(mask_hih, b));
+ a = _mm512_add_ps(a, _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB));
+ a = _mm512_add_ps(a, _mm512_swizzle_ps(a, _MM_SWIZ_REG_BADC));
+ a = _mm512_add_ps(a, _mm512_permute4f128_ps(a, _MM_PERM_CDAB));
+
+ b = _mm512_add_ps(b, _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB));
+ b = _mm512_add_ps(b, _mm512_swizzle_ps(b, _MM_SWIZ_REG_BADC));
+ a = _mm512_mask_add_ps(a, _mm512_int2mask(0xF0F0), b, _mm512_permute4f128_ps(b, _MM_PERM_CDAB));
+
+ return _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_setr4_epi32(0, 8, 4, 12), _mm512_castps_si512(a)));
}
static gmx_inline void
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "config.h"
+#include <assert.h>
#include <math.h>
#include <immintrin.h>
Doesn't use mask other than where required. No side effect expected for operating on the (unused) upper 8.
*/
#define gmx_simd_dint32_t __m512i
-#define gmx_simd_load_di(m) _mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), mask_loh, m)
+#define gmx_simd_load_di(m) _mm512_extload_epi64(m, _MM_UPCONV_EPI64_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
#define gmx_simd_set1_di _mm512_set1_epi32
-#define gmx_simd_store_di(m, a) _mm512_mask_packstorelo_epi32(m, mask_loh, a)
+#define gmx_simd_store_di gmx_simd_store_di_mic
#define gmx_simd_loadu_di gmx_simd_loadu_di_mic
#define gmx_simd_storeu_di gmx_simd_storeu_di_mic
#define gmx_simd_extract_di gmx_simd_extract_di_mic
set the upper 12 to zero. */
#define gmx_simd4_float_t __m512
#define gmx_simd4_mask _mm512_int2mask(0xF)
-#define gmx_simd4_load_f(m) _mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), gmx_simd4_mask, m)
+#define gmx_simd4_load_f(m) _mm512_mask_extload_ps(_mm512_undefined_ps(), gmx_simd4_mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE)
#define gmx_simd4_load1_f(m) _mm512_mask_extload_ps(_mm512_undefined_ps(), gmx_simd4_mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
#define gmx_simd4_set1_f _mm512_set1_ps
-#define gmx_simd4_store_f(m, a) _mm512_mask_packstorelo_ps(m, gmx_simd4_mask, a)
+#define gmx_simd4_store_f gmx_simd4_store_f_mic
#define gmx_simd4_loadu_f gmx_simd4_loadu_f_mic
#define gmx_simd4_storeu_f gmx_simd4_storeu_f_mic
#define gmx_simd4_setzero_f _mm512_setzero_ps
****************************************************/
#define gmx_simd4_double_t __m512d
#define gmx_simd4_mask _mm512_int2mask(0xF)
-#define gmx_simd4_load_d(m) _mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), gmx_simd4_mask, m)
+#define gmx_simd4_load_d(m) _mm512_mask_extload_pd(_mm512_undefined_pd(), gmx_simd4_mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
#define gmx_simd4_load1_d(m) _mm512_mask_extload_pd(_mm512_undefined_pd(), gmx_simd4_mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE)
#define gmx_simd4_set1_d _mm512_set1_pd
-#define gmx_simd4_store_d(m, a) _mm512_mask_packstorelo_pd(m, gmx_simd4_mask, a)
+#define gmx_simd4_store_d gmx_simd4_store_d_mic
#define gmx_simd4_loadu_d gmx_simd4_loadu_d_mic
#define gmx_simd4_storeu_d gmx_simd4_storeu_d_mic
#define gmx_simd4_setzero_d _mm512_setzero_pd
}
/* load store dint32 */
+static gmx_inline void gmx_simdcall
+gmx_simd_store_di_mic(gmx_int32_t * m, __m512i s)
+{
+ assert((size_t)m%32 == 0);
+ _mm512_mask_packstorelo_epi32(m, mask_loh, s);
+}
+
static gmx_inline __m512i gmx_simdcall
gmx_simd_loadu_di_mic(const gmx_int32_t * m)
{
}
/* load store simd4 */
+static gmx_inline void gmx_simdcall
+gmx_simd4_store_f_mic(float * m, __m512 s)
+{
+ assert((size_t)m%16 == 0);
+ _mm512_mask_packstorelo_ps(m, gmx_simd4_mask, s);
+}
+
static gmx_inline __m512 gmx_simdcall
gmx_simd4_loadu_f_mic(const float * m)
{
_mm512_mask_packstorehi_ps(m+16, gmx_simd4_mask, s);
}
+static gmx_inline void gmx_simdcall
+gmx_simd4_store_d_mic(double * m, __m512d s)
+{
+ assert((size_t)m%32 == 0);
+ _mm512_mask_packstorelo_pd(m, gmx_simd4_mask, s);
+}
+
static gmx_inline __m512d gmx_simdcall
gmx_simd4_loadu_d_mic(const double * m)
{