From: Roland Schulz Date: Thu, 5 Feb 2015 07:14:36 +0000 (-0500) Subject: nbnxn utils performance improvement for Phi X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=6785f9ae4305b1f40004065ae783b122404d6937;p=alexxy%2Fgromacs.git nbnxn utils performance improvement for Phi Also remove usage of unpack to load half/quarter aligned data, because in case of misaligned data, instead of SegF it only loaded partial data. Change-Id: Ib0f7807986e6fcbe998bd6ee41ce104666446321 --- diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h index a7a11677f5..a0c6319284 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2013,2014, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -53,7 +53,7 @@ typedef __m512 gmx_mm_hpr; /* high half is ignored */ static gmx_inline void gmx_load_hpr(gmx_mm_hpr *a, const real *b) { - *a = _mm512_loadunpacklo_ps(_mm512_undefined_ps(), b); + *a = _mm512_castpd_ps(_mm512_extload_pd((const double*)b, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)); } /* Set all entries in half-width SIMD register *a to b */ @@ -76,13 +76,14 @@ gmx_load1p1_pr(gmx_simd_float_t *a, const real *b) static gmx_inline void gmx_loaddh_pr(gmx_simd_float_t *a, const real *b) { - *a = _mm512_permute4f128_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), b), PERM_LOW2HIGH); + *a = _mm512_castpd_ps(_mm512_extload_pd((const double*)b, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)); } /* Store half-width SIMD register b into half width aligned memory a */ static gmx_inline void gmx_store_hpr(real *a, gmx_mm_hpr b) { + assert((size_t)a%32 == 0); _mm512_mask_packstorelo_ps(a, mask_loh, b); } @@ -102,10 +103,15 @@ gmx_sum4_hpr(gmx_simd_float_t a, gmx_simd_float_t b) static gmx_inline __m512 gmx_mm_transpose_sum4h_pr(gmx_simd_float_t a, gmx_simd_float_t b) { - return _mm512_setr4_ps(_mm512_mask_reduce_add_ps(mask_loh, a), - _mm512_mask_reduce_add_ps(mask_hih, a), - _mm512_mask_reduce_add_ps(mask_loh, b), - _mm512_mask_reduce_add_ps(mask_hih, b)); + a = _mm512_add_ps(a, _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB)); + a = _mm512_add_ps(a, _mm512_swizzle_ps(a, _MM_SWIZ_REG_BADC)); + a = _mm512_add_ps(a, _mm512_permute4f128_ps(a, _MM_PERM_CDAB)); + + b = _mm512_add_ps(b, _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB)); + b = _mm512_add_ps(b, _mm512_swizzle_ps(b, _MM_SWIZ_REG_BADC)); + a = _mm512_mask_add_ps(a, _mm512_int2mask(0xF0F0), b, _mm512_permute4f128_ps(b, _MM_PERM_CDAB)); + + return _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_setr4_epi32(0, 8, 4, 12), _mm512_castps_si512(a))); } static gmx_inline void diff --git a/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h b/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h index 67418f98d0..1840c77275 100644 --- a/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h +++ b/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2014, by the GROMACS development team, led by + * Copyright (c) 2014,2015, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -38,6 +38,7 @@ #include "config.h" +#include #include #include @@ -206,9 +207,9 @@ Doesn't use mask other than where required. No side effect expected for operating on the (unused) upper 8. */ #define gmx_simd_dint32_t __m512i -#define gmx_simd_load_di(m) _mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), mask_loh, m) +#define gmx_simd_load_di(m) _mm512_extload_epi64(m, _MM_UPCONV_EPI64_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE) #define gmx_simd_set1_di _mm512_set1_epi32 -#define gmx_simd_store_di(m, a) _mm512_mask_packstorelo_epi32(m, mask_loh, a) +#define gmx_simd_store_di gmx_simd_store_di_mic #define gmx_simd_loadu_di gmx_simd_loadu_di_mic #define gmx_simd_storeu_di gmx_simd_storeu_di_mic #define gmx_simd_extract_di gmx_simd_extract_di_mic @@ -266,10 +267,10 @@ set the upper 12 to zero. */ #define gmx_simd4_float_t __m512 #define gmx_simd4_mask _mm512_int2mask(0xF) -#define gmx_simd4_load_f(m) _mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), gmx_simd4_mask, m) +#define gmx_simd4_load_f(m) _mm512_mask_extload_ps(_mm512_undefined_ps(), gmx_simd4_mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE) #define gmx_simd4_load1_f(m) _mm512_mask_extload_ps(_mm512_undefined_ps(), gmx_simd4_mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE) #define gmx_simd4_set1_f _mm512_set1_ps -#define gmx_simd4_store_f(m, a) _mm512_mask_packstorelo_ps(m, gmx_simd4_mask, a) +#define gmx_simd4_store_f gmx_simd4_store_f_mic #define gmx_simd4_loadu_f gmx_simd4_loadu_f_mic #define gmx_simd4_storeu_f gmx_simd4_storeu_f_mic #define gmx_simd4_setzero_f _mm512_setzero_ps @@ -309,10 +310,10 @@ ****************************************************/ #define gmx_simd4_double_t __m512d #define gmx_simd4_mask _mm512_int2mask(0xF) -#define gmx_simd4_load_d(m) _mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), gmx_simd4_mask, m) +#define gmx_simd4_load_d(m) _mm512_mask_extload_pd(_mm512_undefined_pd(), gmx_simd4_mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE) #define gmx_simd4_load1_d(m) _mm512_mask_extload_pd(_mm512_undefined_pd(), gmx_simd4_mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE) #define gmx_simd4_set1_d _mm512_set1_pd -#define gmx_simd4_store_d(m, a) _mm512_mask_packstorelo_pd(m, gmx_simd4_mask, a) +#define gmx_simd4_store_d gmx_simd4_store_d_mic #define gmx_simd4_loadu_d gmx_simd4_loadu_d_mic #define gmx_simd4_storeu_d gmx_simd4_storeu_d_mic #define gmx_simd4_setzero_d _mm512_setzero_pd @@ -396,6 +397,13 @@ gmx_simd_storeu_d_mic(double * m, __m512d s) } /* load store dint32 */ +static gmx_inline void gmx_simdcall +gmx_simd_store_di_mic(gmx_int32_t * m, __m512i s) +{ + assert((size_t)m%32 == 0); + _mm512_mask_packstorelo_epi32(m, mask_loh, s); +} + static gmx_inline __m512i gmx_simdcall gmx_simd_loadu_di_mic(const gmx_int32_t * m) { @@ -410,6 +418,13 @@ gmx_simd_storeu_di_mic(gmx_int32_t * m, __m512i s) } /* load store simd4 */ +static gmx_inline void gmx_simdcall +gmx_simd4_store_f_mic(float * m, __m512 s) +{ + assert((size_t)m%16 == 0); + _mm512_mask_packstorelo_ps(m, gmx_simd4_mask, s); +} + static gmx_inline __m512 gmx_simdcall gmx_simd4_loadu_f_mic(const float * m) { @@ -423,6 +438,13 @@ gmx_simd4_storeu_f_mic(float * m, __m512 s) _mm512_mask_packstorehi_ps(m+16, gmx_simd4_mask, s); } +static gmx_inline void gmx_simdcall +gmx_simd4_store_d_mic(double * m, __m512d s) +{ + assert((size_t)m%32 == 0); + _mm512_mask_packstorelo_pd(m, gmx_simd4_mask, s); +} + static gmx_inline __m512d gmx_simdcall gmx_simd4_loadu_d_mic(const double * m) {