From: Roland Schulz <roland@rschulz.eu>
Date: Thu, 5 Feb 2015 07:14:36 +0000 (-0500)
Subject: nbnxn utils performance improvement for Phi
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=6785f9ae4305b1f40004065ae783b122404d6937;p=alexxy%2Fgromacs.git

nbnxn utils performance improvement for Phi

Also remove usage of unpack to load half/quarter aligned data, because
in case of misaligned data, instead of SegF it only loaded partial data.

Change-Id: Ib0f7807986e6fcbe998bd6ee41ce104666446321
---

diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
index a7a11677f5..a0c6319284 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -53,7 +53,7 @@ typedef __m512 gmx_mm_hpr; /* high half is ignored */
 static gmx_inline void
 gmx_load_hpr(gmx_mm_hpr *a, const real *b)
 {
-    *a = _mm512_loadunpacklo_ps(_mm512_undefined_ps(), b);
+    *a = _mm512_castpd_ps(_mm512_extload_pd((const double*)b, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
 }
 
 /* Set all entries in half-width SIMD register *a to b */
@@ -76,13 +76,14 @@ gmx_load1p1_pr(gmx_simd_float_t *a, const real *b)
 static gmx_inline void
 gmx_loaddh_pr(gmx_simd_float_t *a, const real *b)
 {
-    *a = _mm512_permute4f128_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), b), PERM_LOW2HIGH);
+    *a = _mm512_castpd_ps(_mm512_extload_pd((const double*)b, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
 }
 
 /* Store half-width SIMD register b into half width aligned memory a */
 static gmx_inline void
 gmx_store_hpr(real *a, gmx_mm_hpr b)
 {
+    assert((size_t)a%32 == 0);
     _mm512_mask_packstorelo_ps(a, mask_loh, b);
 }
 
@@ -102,10 +103,15 @@ gmx_sum4_hpr(gmx_simd_float_t a, gmx_simd_float_t b)
 static gmx_inline __m512
 gmx_mm_transpose_sum4h_pr(gmx_simd_float_t a, gmx_simd_float_t b)
 {
-    return _mm512_setr4_ps(_mm512_mask_reduce_add_ps(mask_loh, a),
-                           _mm512_mask_reduce_add_ps(mask_hih, a),
-                           _mm512_mask_reduce_add_ps(mask_loh, b),
-                           _mm512_mask_reduce_add_ps(mask_hih, b));
+    a = _mm512_add_ps(a, _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB));
+    a = _mm512_add_ps(a, _mm512_swizzle_ps(a, _MM_SWIZ_REG_BADC));
+    a = _mm512_add_ps(a, _mm512_permute4f128_ps(a, _MM_PERM_CDAB));
+
+    b = _mm512_add_ps(b, _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB));
+    b = _mm512_add_ps(b, _mm512_swizzle_ps(b, _MM_SWIZ_REG_BADC));
+    a = _mm512_mask_add_ps(a, _mm512_int2mask(0xF0F0), b, _mm512_permute4f128_ps(b, _MM_PERM_CDAB));
+
+    return _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_setr4_epi32(0, 8, 4, 12), _mm512_castps_si512(a)));
 }
 
 static gmx_inline void
diff --git a/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h b/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h
index 67418f98d0..1840c77275 100644
--- a/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h
+++ b/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -38,6 +38,7 @@
 
 #include "config.h"
 
+#include <assert.h>
 #include <math.h>
 
 #include <immintrin.h>
@@ -206,9 +207,9 @@
    Doesn't use mask other than where required. No side effect expected for operating on the (unused) upper 8.
  */
 #define gmx_simd_dint32_t          __m512i
-#define gmx_simd_load_di(m)        _mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), mask_loh, m)
+#define gmx_simd_load_di(m)        _mm512_extload_epi64(m, _MM_UPCONV_EPI64_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
 #define gmx_simd_set1_di           _mm512_set1_epi32
-#define gmx_simd_store_di(m, a)    _mm512_mask_packstorelo_epi32(m, mask_loh, a)
+#define gmx_simd_store_di          gmx_simd_store_di_mic
 #define gmx_simd_loadu_di          gmx_simd_loadu_di_mic
 #define gmx_simd_storeu_di         gmx_simd_storeu_di_mic
 #define gmx_simd_extract_di        gmx_simd_extract_di_mic
@@ -266,10 +267,10 @@
    set the upper 12 to zero. */
 #define gmx_simd4_float_t           __m512
 #define gmx_simd4_mask              _mm512_int2mask(0xF)
-#define gmx_simd4_load_f(m)         _mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), gmx_simd4_mask, m)
+#define gmx_simd4_load_f(m)         _mm512_mask_extload_ps(_mm512_undefined_ps(), gmx_simd4_mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE)
 #define gmx_simd4_load1_f(m)        _mm512_mask_extload_ps(_mm512_undefined_ps(), gmx_simd4_mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
 #define gmx_simd4_set1_f            _mm512_set1_ps
-#define gmx_simd4_store_f(m, a)     _mm512_mask_packstorelo_ps(m, gmx_simd4_mask, a)
+#define gmx_simd4_store_f           gmx_simd4_store_f_mic
 #define gmx_simd4_loadu_f           gmx_simd4_loadu_f_mic
 #define gmx_simd4_storeu_f          gmx_simd4_storeu_f_mic
 #define gmx_simd4_setzero_f         _mm512_setzero_ps
@@ -309,10 +310,10 @@
  ****************************************************/
 #define gmx_simd4_double_t          __m512d
 #define gmx_simd4_mask              _mm512_int2mask(0xF)
-#define gmx_simd4_load_d(m)         _mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), gmx_simd4_mask, m)
+#define gmx_simd4_load_d(m)         _mm512_mask_extload_pd(_mm512_undefined_pd(), gmx_simd4_mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
 #define gmx_simd4_load1_d(m)        _mm512_mask_extload_pd(_mm512_undefined_pd(), gmx_simd4_mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE)
 #define gmx_simd4_set1_d            _mm512_set1_pd
-#define gmx_simd4_store_d(m, a)     _mm512_mask_packstorelo_pd(m, gmx_simd4_mask, a)
+#define gmx_simd4_store_d           gmx_simd4_store_d_mic
 #define gmx_simd4_loadu_d           gmx_simd4_loadu_d_mic
 #define gmx_simd4_storeu_d          gmx_simd4_storeu_d_mic
 #define gmx_simd4_setzero_d         _mm512_setzero_pd
@@ -396,6 +397,13 @@ gmx_simd_storeu_d_mic(double * m, __m512d s)
 }
 
 /* load store dint32 */
+static gmx_inline void gmx_simdcall
+gmx_simd_store_di_mic(gmx_int32_t * m, __m512i s)
+{
+    assert((size_t)m%32 == 0);
+    _mm512_mask_packstorelo_epi32(m, mask_loh, s);
+}
+
 static gmx_inline __m512i gmx_simdcall
 gmx_simd_loadu_di_mic(const gmx_int32_t * m)
 {
@@ -410,6 +418,13 @@ gmx_simd_storeu_di_mic(gmx_int32_t * m, __m512i s)
 }
 
 /* load store simd4 */
+static gmx_inline void gmx_simdcall
+gmx_simd4_store_f_mic(float * m, __m512 s)
+{
+    assert((size_t)m%16 == 0);
+    _mm512_mask_packstorelo_ps(m, gmx_simd4_mask, s);
+}
+
 static gmx_inline __m512 gmx_simdcall
 gmx_simd4_loadu_f_mic(const float * m)
 {
@@ -423,6 +438,13 @@ gmx_simd4_storeu_f_mic(float * m, __m512 s)
     _mm512_mask_packstorehi_ps(m+16, gmx_simd4_mask, s);
 }
 
+static gmx_inline void gmx_simdcall
+gmx_simd4_store_d_mic(double * m, __m512d s)
+{
+    assert((size_t)m%32 == 0);
+    _mm512_mask_packstorelo_pd(m, gmx_simd4_mask, s);
+}
+
 static gmx_inline __m512d gmx_simdcall
 gmx_simd4_loadu_d_mic(const double * m)
 {