--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2013, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef _nbnxn_kernel_simd_utils_x86_mic_h_
+#define _nbnxn_kernel_simd_utils_x86_mic_h_
+
+typedef gmx_epi32 gmx_exclfilter;
+static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
+
+#define nbfp_stride 2
+
+#define PERM_LOW2HIGH _MM_PERM_BABA
+#define PERM_HIGH2LOW _MM_PERM_DCDC
+
+#define mask_loh _mm512_int2mask(0x00FF) /* would be better a constant - but can't initialize with a function call. */
+#define mask_hih _mm512_int2mask(0xFF00)
+
+/* float/double SIMD register type */
+typedef __m512 gmx_mm_pr4;
+
+static gmx_inline gmx_mm_pr4
+gmx_load_pr4(const real *r)
+{
+ return _mm512_loadunpacklo_ps(_mm512_undefined_ps(), r);
+}
+
+static gmx_inline void
+gmx_store_pr4(real *dest, gmx_mm_pr4 src)
+{
+ _mm512_mask_packstorelo_ps(dest, _mm512_int2mask(0xF), src);
+}
+
+static gmx_inline gmx_mm_pr4
+gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b)
+{
+ return _mm512_add_ps(a, b);
+}
+
+/* Half-width SIMD real type */
+typedef __m512 gmx_mm_hpr; /* high half is ignored */
+
+/* Half-width SIMD operations */
+
+/* Load reals at half-width aligned pointer b into half-width SIMD register a */
+static gmx_inline void
+gmx_load_hpr(gmx_mm_hpr *a, const real *b)
+{
+ *a = _mm512_loadunpacklo_ps(_mm512_undefined_ps(), b);
+}
+
+/* Set all entries in half-width SIMD register *a to b */
+static gmx_inline void
+gmx_set1_hpr(gmx_mm_hpr *a, real b)
+{
+ *a = _mm512_set1_ps(b);
+}
+
+/* Load one real at b and one real at b+1 into halves of a, respectively */
+static gmx_inline void
+gmx_load1p1_pr(gmx_mm_ps *a, const real *b)
+{
+
+ *a = _mm512_mask_extload_ps(_mm512_extload_ps(b, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE), mask_hih,
+ b+1, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
+}
+
+/* Load reals at half-width aligned pointer b into two halves of a */
+static gmx_inline void
+gmx_loaddh_pr(gmx_mm_ps *a, const real *b)
+{
+ *a = _mm512_permute4f128_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), b), PERM_LOW2HIGH);
+}
+
+/* Store half-width SIMD register b into half width aligned memory a */
+static gmx_inline void
+gmx_store_hpr(real *a, gmx_mm_hpr b)
+{
+ _mm512_mask_packstorelo_ps(a, mask_loh, b);
+}
+
+#define gmx_add_hpr _mm512_add_ps
+#define gmx_sub_hpr _mm512_sub_ps
+
+/* Sum over 4 half SIMD registers */
+static gmx_inline gmx_mm_hpr
+gmx_sum4_hpr(gmx_mm_ps a, gmx_mm_ps b)
+{
+ a = _mm512_add_ps(a, b);
+ b = _mm512_permute4f128_ps(a, PERM_HIGH2LOW);
+ return _mm512_add_ps(a, b);
+}
+
+/* Sum the elements of halfs of each input register and store sums in out */
+static gmx_inline gmx_mm_pr4
+gmx_mm_transpose_sum4h_pr(gmx_mm_ps a, gmx_mm_ps b)
+{
+ return _mm512_setr4_ps(_mm512_mask_reduce_add_ps(mask_loh, a),
+ _mm512_mask_reduce_add_ps(mask_hih, a),
+ _mm512_mask_reduce_add_ps(mask_loh, b),
+ _mm512_mask_reduce_add_ps(mask_hih, b));
+}
+
+static gmx_inline void
+gmx_pr_to_2hpr(gmx_mm_ps a, gmx_mm_hpr *b, gmx_mm_hpr *c)
+{
+ *b = a;
+ *c = _mm512_permute4f128_ps(a, PERM_HIGH2LOW);
+}
+
+static gmx_inline void
+gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_mm_ps *c)
+{
+ *c = _mm512_mask_permute4f128_ps(a, mask_hih, b, PERM_LOW2HIGH);
+}
+
+/* recombine the 2 high half into c */
+static gmx_inline void
+gmx_2hpr_high_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_mm_ps *c)
+{
+ *c = _mm512_mask_permute4f128_ps(b, mask_loh, a, PERM_HIGH2LOW);
+}
+
+static gmx_inline void
+gmx_2hepi_to_epi(gmx_epi32 a, gmx_epi32 b, gmx_epi32 *c)
+{
+ *c = _mm512_mask_permute4f128_epi32(a, mask_hih, b, PERM_LOW2HIGH);
+}
+
+/* recombine the 2 high half into c */
+static gmx_inline void
+gmx_2hepi_high_to_epi(gmx_epi32 a, gmx_epi32 b, gmx_epi32 *c)
+{
+ *c = _mm512_mask_permute4f128_epi32(b, mask_loh, a, PERM_HIGH2LOW);
+}
+
+/* Align a stack-based thread-local working array. work-array (currently) not used by load_table_f*/
+static gmx_inline int *
+prepare_table_load_buffer(const int *array)
+{
+ return NULL;
+}
+
+/* Using TAB_FDV0 is slower (for non-analytical PME). For the _mm512_i32gather_ps it helps
+ to have the 16 elements in fewer cache lines. This is why it is faster to do F&D toghether
+ and low/high half after each other, then simply doing a gather for tab_coul_F and tab_coul_F+1.
+ The ording of the 16 elements doesn't matter, so it doesn't help to get FD sorted as odd/even
+ instead of low/high.
+*/
+static gmx_inline void
+load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int *ti,
+ gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S)
+{
+ __m512i idx;
+ __m512i ti1 = _mm512_add_epi32(ti_S, _mm512_set1_epi32(1)); /* incr by 1 for tab1 */
+ gmx_2hepi_to_epi(ti_S, ti1, &idx);
+ __m512 tmp1 = _mm512_i32gather_ps(idx, tab_coul_F, sizeof(float));
+ gmx_2hepi_high_to_epi(ti_S, ti1, &idx);
+ __m512 tmp2 = _mm512_i32gather_ps(idx, tab_coul_F, sizeof(float));
+
+ gmx_2hpr_to_pr(tmp1, tmp2, ctab0_S);
+ gmx_2hpr_high_to_pr(tmp1, tmp2, ctab1_S);
+ *ctab1_S = gmx_sub_pr(*ctab1_S, *ctab0_S);
+}
+
+static gmx_inline void
+load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
+ gmx_epi32 ti_S, int *ti,
+ gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S,
+ gmx_mm_ps *ctabv_S)
+{
+ load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S);
+ *ctabv_S = _mm512_i32gather_ps(ti_S, tab_coul_V, sizeof(float));
+}
+
+static gmx_inline gmx_mm_pr4
+gmx_mm_transpose_sum4_pr(gmx_mm_ps in0, gmx_mm_ps in1,
+ gmx_mm_ps in2, gmx_mm_ps in3)
+{
+ return _mm512_setr4_ps(_mm512_reduce_add_ps(in0),
+ _mm512_reduce_add_ps(in1),
+ _mm512_reduce_add_ps(in2),
+ _mm512_reduce_add_ps(in3));
+}
+
+/* TODO: Test. Untested by regressiontests (requires input with specific LJ rule, #1373) */
+/* Assumed the same optmization which helps for load_table_f is the fastest here too. */
+static gmx_inline void
+load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
+ const int *type, int aj,
+ gmx_mm_ps *c6_S, gmx_mm_ps *c12_S)
+{
+ __m512i idx0, idx1, idx;
+
+ /* load all 8 unaligned requires 2 load. */
+ idx0 = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), type+aj);
+ idx0 = _mm512_loadunpackhi_epi32(idx0, type+aj+16);
+
+ idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1)); /* incr by 1 for c12 */
+
+ gmx_2hepi_to_epi(idx0, idx1, &idx);
+ /* ICC requires nbfp_stride here to be preprocessor constant (not a "const int") */
+ __m512 tmp1 = _mm512_i32gather_ps(idx, nbfp0, sizeof(float)*nbfp_stride);
+
+ gmx_2hepi_high_to_epi(idx0, idx1, &idx);
+ __m512 tmp2 = _mm512_i32gather_ps(idx, nbfp1, sizeof(float)*nbfp_stride);
+
+ gmx_2hpr_to_pr(tmp1, tmp2, c6_S);
+ gmx_2hpr_high_to_pr(tmp1, tmp2, c12_S);
+}
+
+#define HAVE_GMX_SUM_SIMD
+static gmx_inline real
+gmx_sum_simd(gmx_mm_pr x, real* b)
+{
+ return _mm512_reduce_add_ps(x);
+}
+static gmx_inline real
+gmx_sum_simd4(gmx_mm_pr x, real* b)
+{
+ return _mm512_mask_reduce_add_ps(_mm512_int2mask(0xF), x);
+}
+
+/* Code for handling loading exclusions and converting them into
+ interactions. */
+#define gmx_load1_exclfilter _mm512_set1_epi32
+#define gmx_load_exclusion_filter _mm512_load_epi32
+#define gmx_checkbitmask_pb _mm512_test_epi32_mask
+
+#endif /* _nbnxn_kernel_simd_utils_ref_h_ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2013, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef _general_x86_mic_h_
+#define _general_x86_mic_h_
+
+/* This file contains the SIMD implmenetation for Intel MIC
+ */
+
+#include <math.h>
+#include <immintrin.h>
+
+#ifdef GMX_DOUBLE
+#error "Double precision isn't supported on Intel Phi yet"
+#endif
+
+typedef __m512 gmx_mm_ps;
+typedef __m512 gmx_mm_pr;
+/* boolean SIMD register type */
+typedef __mmask16 gmx_mm_pb;
+typedef __m512i gmx_epi32;
+
+#define GMX_HAVE_SIMD_MACROS
+#define GMX_SIMD_WIDTH_HERE 16
+#define GMX_SIMD_EPI32_WIDTH 16
+
+#define gmx_load_pr _mm512_load_ps
+
+/* Set all SIMD register elements to *r */
+static gmx_inline gmx_mm_ps
+gmx_load1_pr(const real *r)
+{
+ return _mm512_extload_ps(r, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
+}
+
+#define gmx_set1_pr _mm512_set1_ps
+/* Set all SIMD register elements to 0 */
+#define gmx_setzero_pr _mm512_setzero_ps
+#define gmx_store_pr _mm512_store_ps
+
+#define gmx_add_pr _mm512_add_ps
+#define gmx_sub_pr _mm512_sub_ps
+#define gmx_mul_pr _mm512_mul_ps
+
+#define GMX_SIMD_HAVE_FMA
+#define gmx_madd_pr _mm512_fmadd_ps
+#define gmx_nmsub_pr _mm512_fnmadd_ps
+
+#define gmx_max_pr _mm512_max_ps
+
+static gmx_inline gmx_mm_ps
+gmx_blendzero_pr(gmx_mm_ps a, gmx_mm_pb b)
+{
+ return _mm512_mask_mov_ps(_mm512_setzero_ps(), b, a);
+}
+
+#define gmx_round_pr _mm512_rint_ps
+
+#define GMX_SIMD_HAVE_FLOOR
+#define gmx_floor_pr _mm512_floor_ps
+
+/* Copy the sign of a to b, assumes b >= 0 for efficiency */
+static gmx_inline gmx_mm_ps
+gmx_cpsgn_nonneg_pr(gmx_mm_ps a, gmx_mm_ps b)
+{
+ __m512 zero = _mm512_setzero_ps();
+ __m512 neg1 = _mm512_set1_ps(-1);
+ /* TODO (only bond): Bitwise operations on floating points can be done after casting to int.
+ That allows us to do it the same way as AVX which might be faster. */
+ return _mm512_mask_mul_ps(b, _mm512_cmplt_ps_mask(a, zero), b, neg1);
+}
+
+/* Very specific operation required in the non-bonded kernels */
+static gmx_inline gmx_mm_ps
+gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_ps b, gmx_mm_ps c)
+{
+ return _mm512_mask_add_ps(b, _mm512_knot(a), b, c);
+}
+
+/* Comparison */
+#define gmx_cmplt_pr _mm512_cmplt_ps_mask
+
+/* Logical AND on SIMD booleans. */
+#define gmx_and_pb _mm512_kand
+
+/* Logical OR on SIMD booleans. */
+#define gmx_or_pb _mm512_kor
+
+/* Returns a single int (0/1) which tells if any of the booleans is True
+ It returns the full mask (not 1 for True). But given that any non-zero is True this is OK. */
+#define gmx_anytrue_pb _mm512_mask2int
+
+/* Conversions only used for PME table lookup */
+static gmx_inline gmx_epi32
+gmx_cvttpr_epi32(gmx_mm_ps a)
+{
+ return _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_ROUND_MODE_DOWN, _MM_EXPADJ_NONE);
+};
+
+/* These two function only need to be approximate, Newton-Raphson iteration
+ * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
+ */
+#define gmx_rsqrt_pr _mm512_rsqrt23_ps
+#define gmx_rcp_pr _mm512_rcp23_ps
+
+#define GMX_SIMD_HAVE_EXP
+#define gmx_exp_pr _mm512_exp_ps
+#define gmx_erfc_pr _mm512_erfc_ps
+
+#define GMX_SIMD_HAVE_TRIGONOMETRIC
+#define gmx_sqrt_pr _mm512_sqrt_ps
+
+static gmx_inline int
+gmx_sincos_pr(gmx_mm_ps a,
+ gmx_mm_ps *s, gmx_mm_ps *c)
+{
+ /* TODO (only bond): optimize that both are calculated together.
+ Or (if if that isn't fast on MIC) don't call sincos if only one is needed. */
+ *s = _mm512_sin_ps(a);
+ *c = _mm512_cos_ps(a);
+ return 0;
+}
+
+#define gmx_acos_pr _mm512_acos_ps
+#define gmx_atan2_pr _mm512_atan2_ps
+
+#endif /* _general_x86_mic_h_ */