Extended SIMD, implementation for Intel MIC
authorErik Lindahl <erik@kth.se>
Mon, 6 Jul 2015 20:10:43 +0000 (22:10 +0200)
committerGerrit Code Review <gerrit@gerrit.gromacs.org>
Sun, 27 Dec 2015 12:49:13 +0000 (13:49 +0100)
Passes both unit and regression tests in both single and double
with icc-16 and MPSS 3.5, although the unit tests result in a
bunch of warnings about undefined preprocessing identifiers in GTEST.

Change-Id: If032d760322ff397596090172871f3faba74048d

16 files changed:
docs/doxygen/suppressions.txt
src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd4_double.h [deleted file]
src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd4_float.h [deleted file]
src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd_double.h [deleted file]
src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd_float.h [deleted file]
src/gromacs/simd/impl_x86_mic/impl_x86_mic.h [new file with mode: 0644]
src/gromacs/simd/impl_x86_mic/impl_x86_mic_definitions.h [moved from src/gromacs/simd/impl_intel_mic/impl_intel_mic_common.h with 74% similarity]
src/gromacs/simd/impl_x86_mic/impl_x86_mic_general.h [moved from src/gromacs/simd/impl_intel_mic/impl_intel_mic.h with 85% similarity]
src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_double.h [new file with mode: 0644]
src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_float.h [new file with mode: 0644]
src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_double.h [new file with mode: 0644]
src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_float.h [new file with mode: 0644]
src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_double.h [new file with mode: 0644]
src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_float.h [new file with mode: 0644]
src/gromacs/simd/simd.h
src/gromacs/simd/tests/simd_floatingpoint_util.cpp

index e309b7e19e10869937b38d52e8d1a6d98d21229d..50aac3daee54a008cb7477679b2f9ecaa12f3bd1 100644 (file)
@@ -46,7 +46,6 @@ src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.cpp: warning: inc
 src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h: warning: should include "nbnxn_simd.h"
 
 # Temporary while we change the SIMD implementation
-src/gromacs/simd/impl_intel_mic/impl_intel_mic_common.h: warning: should include "simd.h"
 src/gromacs/simd/impl_sparc64_hpc_ace/impl_sparc64_hpc_ace_common.h: warning: should include "simd.h"
 src/gromacs/simd/impl_x86_avx_512f/impl_x86_avx_512f_common.h: warning: should include "simd.h"
 
diff --git a/src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd4_double.h b/src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd4_double.h
deleted file mode 100644 (file)
index 395abbb..0000000
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD4_DOUBLE_H
-#define GMX_SIMD_IMPL_INTEL_MIC_SIMD4_DOUBLE_H
-
-#include "config.h"
-
-#include <assert.h>
-#include <math.h>
-
-#include <immintrin.h>
-
-#include "impl_intel_mic_common.h"
-#include "impl_intel_mic_simd_double.h"
-
-/****************************************************
- *      DOUBLE PRECISION SIMD4 IMPLEMENTATION       *
- ****************************************************/
-#define Simd4Double          __m512d
-#define simd4Mask              _mm512_int2mask(0xF)
-#define simd4LoadD(m)         _mm512_mask_extload_pd(_mm512_undefined_pd(), simd4Mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
-#define simd4Load1D(m)        _mm512_mask_extload_pd(_mm512_undefined_pd(), simd4Mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE)
-#define simd4Set1D            _mm512_set1_pd
-#define simd4StoreD           simd4StoreD_mic
-#define simd4LoadUD           simd4LoadUD_mic
-#define simd4StoreUD          simd4StoreUD_mic
-#define simd4SetZeroD         _mm512_setzero_pd
-#define simd4AddD(a, b)       _mm512_mask_add_pd(_mm512_undefined_pd(), simd4Mask, a, b)
-#define simd4SubD(a, b)       _mm512_mask_sub_pd(_mm512_undefined_pd(), simd4Mask, a, b)
-#define simd4MulD(a, b)       _mm512_mask_mul_pd(_mm512_undefined_pd(), simd4Mask, a, b)
-#define simd4FmaddD(a, b, c)  _mm512_mask_fmadd_pd(a, simd4Mask, b, c)
-#define simd4FmsubD(a, b, c)  _mm512_mask_fmsub_pd(a, simd4Mask, b, c)
-#define simd4FnmaddD(a, b, c) _mm512_mask_fnmadd_pd(a, simd4Mask, b, c)
-#define simd4FnmsubD(a, b, c) _mm512_mask_fnmsub_pd(a, simd4Mask, b, c)
-#define simd4AndD(a, b)       _mm512_castsi512_pd(_mm512_mask_and_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simd4AndNotD(a, b)    _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simd4OrD(a, b)        _mm512_castsi512_pd(_mm512_mask_or_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simd4XorD(a, b)       _mm512_castsi512_pd(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simd4RsqrtD(a)        _mm512_mask_cvtpslo_pd(_mm512_undefined_pd(), simd4Mask, _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), simd4Mask, _mm512_mask_cvtpd_pslo(_mm512_undefined_ps(), simd4Mask, x)))
-#define simd4AbsD(x)         simd4AndNotD(_mm512_set1_pd(GMX_DOUBLE_NEGZERO), x)
-#define simd4NegD(x)         _mm512_mask_addn_pd(_mm512_undefined_pd(), simd4Mask, x, _mm512_setzero_pd())
-#define simd4MaxD(a, b)       _mm512_mask_gmax_pd(_mm512_undefined_pd(), simd4Mask, a, b)
-#define simd4MinD(a, b)       _mm512_mask_gmin_pd(_mm512_undefined_pd(), simd4Mask, a, b)
-#define simd4RoundD(a)        _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), simd4Mask, a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-#define simd4TruncD(a)        _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), simd4Mask, a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
-#define simd4DotProductD(a, b) _mm512_mask_reduce_add_pd(_mm512_int2mask(7), _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(7), a, b))
-#define Simd4DBool           __mmask16
-#define simd4CmpEqD(a, b)     _mm512_mask_cmp_pd_mask(simd4Mask, a, b, _CMP_EQ_OQ)
-#define simd4CmpLtD(a, b)     _mm512_mask_cmp_pd_mask(simd4Mask, a, b, _CMP_LT_OS)
-#define simd4CmpLeD(a, b)     _mm512_mask_cmp_pd_mask(simd4Mask, a, b, _CMP_LE_OS)
-#define simd4AndDB            _mm512_kand
-#define simd4OrDB             _mm512_kor
-#define simd4AnyTrueDB(x)     (_mm512_mask2int(x)&0xF)
-#define simd4MaskD(a, sel)    _mm512_mask_mov_pd(_mm512_setzero_pd(), sel, a)
-#define simd4MaskNotD(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(sel), a)
-#define simd4BlendD(a, b, sel)    _mm512_mask_blend_pd(sel, a, b)
-#define simd4ReduceD(x)       _mm512_mask_reduce_add_pd(_mm512_int2mask(0xF), x)
-
-/* Implementation helpers */
-static inline void gmx_simdcall
-simd4StoreD_mic(double * m, __m512d s)
-{
-    assert((size_t)m%32 == 0);
-    _mm512_mask_packstorelo_pd(m, simd4Mask, s);
-}
-
-static inline __m512d gmx_simdcall
-simd4LoadUD_mic(const double * m)
-{
-    return _mm512_mask_loadunpackhi_pd(_mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), simd4Mask, m), simd4Mask, m+8);
-}
-
-static inline void gmx_simdcall
-simd4StoreUD_mic(double * m, __m512d s)
-{
-    _mm512_mask_packstorelo_pd(m, simd4Mask, s);
-    _mm512_mask_packstorehi_pd(m+8, simd4Mask, s);
-}
-
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD4_DOUBLE_H */
diff --git a/src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd4_float.h b/src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd4_float.h
deleted file mode 100644 (file)
index 4a4fdd5..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H
-#define GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H
-
-#include "config.h"
-
-#include <assert.h>
-#include <math.h>
-
-#include <immintrin.h>
-
-#include "impl_intel_mic_common.h"
-#include "impl_intel_mic_simd_float.h"
-
-/****************************************************
- *      SINGLE PRECISION SIMD4 IMPLEMENTATION       *
- ****************************************************/
-/* Load and store are guranteed to only access the 4 floats. All arithmetic operations
-   only operate on the 4 elements (to avoid floating excpetions). But other operations
-   are not gurateed to not modify the other 12 elements. E.g. setzero or blendzero
-   set the upper 12 to zero. */
-#define Simd4Float           __m512
-#define simd4Mask              _mm512_int2mask(0xF)
-#define simd4LoadF(m)         _mm512_mask_extload_ps(_mm512_undefined_ps(), simd4Mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE)
-#define simd4Load1F(m)        _mm512_mask_extload_ps(_mm512_undefined_ps(), simd4Mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
-#define simd4Set1F            _mm512_set1_ps
-#define simd4StoreF           simd4StoreF_mic
-#define simd4LoadUF           simd4LoadUF_mic
-#define simd4StoreUF          simd4StoreUF_mic
-#define simd4SetZeroF         _mm512_setzero_ps
-#define simd4AddF(a, b)       _mm512_mask_add_ps(_mm512_undefined_ps(), simd4Mask, a, b)
-#define simd4SubF(a, b)       _mm512_mask_sub_ps(_mm512_undefined_ps(), simd4Mask, a, b)
-#define simd4MulF(a, b)       _mm512_mask_mul_ps(_mm512_undefined_ps(), simd4Mask, a, b)
-#define simd4FmaddF(a, b, c)  _mm512_mask_fmadd_ps(a, simd4Mask, b, c)
-#define simd4FmsubF(a, b, c)  _mm512_mask_fmsub_ps(a, simd4Mask, b, c)
-#define simd4FnmaddF(a, b, c) _mm512_mask_fnmadd_ps(a, simd4Mask, b, c)
-#define simd4FnmsubF(a, b, c) _mm512_mask_fnmsub_ps(a, simd4Mask, b, c)
-#define simd4AndF(a, b)       _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simd4AndNotF(a, b)    _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simd4OrF(a, b)        _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simd4XorF(a, b)       _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simd4RsqrtF(a)        _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), simd4Mask, a)
-#define simd4AbsF(x)         simd4AndNotF(_mm512_set1_ps(GMX_FLOAT_NEGZERO), x)
-#define simd4NegF(x)         _mm512_mask_addn_ps(_mm512_undefined_ps(), simd4Mask, x, _mm512_setzero_ps())
-#define simd4MaxF(a, b)       _mm512_mask_gmax_ps(_mm512_undefined_ps(), simd4Mask, a, b)
-#define simd4MinF(a, b)       _mm512_mask_gmin_ps(_mm512_undefined_ps(), simd4Mask, a, b)
-#define simd4RoundF(x)        _mm512_mask_round_ps(_mm512_undefined_ps(), simd4Mask, x, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-#define simd4TruncF(x)        _mm512_mask_round_ps(_mm512_undefined_ps(), simd4Mask, x, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
-#define simd4DotProductF(a, b) _mm512_mask_reduce_add_ps(_mm512_int2mask(7), _mm512_mask_mul_ps(_mm512_undefined_ps(), _mm512_int2mask(7), a, b))
-#define Simd4FBool           __mmask16
-#define simd4CmpEqF(a, b)     _mm512_mask_cmp_ps_mask(simd4Mask, a, b, _CMP_EQ_OQ)
-#define simd4CmpLtF(a, b)     _mm512_mask_cmp_ps_mask(simd4Mask, a, b, _CMP_LT_OS)
-#define simd4CmpLeF(a, b)     _mm512_mask_cmp_ps_mask(simd4Mask, a, b, _CMP_LE_OS)
-#define simd4AndFB            _mm512_kand
-#define simd4OrFB             _mm512_kor
-#define simd4AnyTrueFB(x)     (_mm512_mask2int(x)&0xF)
-#define simd4MaskF(a, sel)    _mm512_mask_mov_ps(_mm512_setzero_ps(), sel, a)
-#define simd4MaskNotF(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), a)
-#define simd4BlendF(a, b, sel)    _mm512_mask_blend_ps(sel, a, b)
-#define simd4ReduceF(x)       _mm512_mask_reduce_add_ps(_mm512_int2mask(0xF), x)
-
-/* Implementation helpers */
-
-/* load store simd4 */
-static inline void gmx_simdcall
-simd4StoreF_mic(float * m, __m512 s)
-{
-    assert((size_t)m%16 == 0);
-    _mm512_mask_packstorelo_ps(m, simd4Mask, s);
-}
-
-static inline __m512 gmx_simdcall
-simd4LoadUF_mic(const float * m)
-{
-    return _mm512_mask_loadunpackhi_ps(_mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), simd4Mask, m), simd4Mask, m+16);
-}
-
-static inline void gmx_simdcall
-simd4StoreUF_mic(float * m, __m512 s)
-{
-    _mm512_mask_packstorelo_ps(m, simd4Mask, s);
-    _mm512_mask_packstorehi_ps(m+16, simd4Mask, s);
-}
-
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H */
diff --git a/src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd_double.h b/src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd_double.h
deleted file mode 100644 (file)
index 3ffed2e..0000000
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD_DOUBLE_H
-#define GMX_SIMD_IMPL_INTEL_MIC_SIMD_DOUBLE_H
-
-#include "config.h"
-
-#include <assert.h>
-
-#include <cmath>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "impl_intel_mic_common.h"
-
-/****************************************************
- *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
- ****************************************************/
-#define SimdDouble          __m512d
-#define simdLoadD            _mm512_load_pd
-#define simdLoad1D(m)        _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE)
-#define simdSet1D            _mm512_set1_pd
-#define simdStoreD           _mm512_store_pd
-#define simdLoadUD           simdLoadUD_mic
-#define simdStoreUD          simdStoreUD_mic
-#define simdSetZeroD         _mm512_setzero_pd
-#define simdAddD             _mm512_add_pd
-#define simdSubD             _mm512_sub_pd
-#define simdMulD             _mm512_mul_pd
-#define simdFmaddD           _mm512_fmadd_pd
-#define simdFmsubD           _mm512_fmsub_pd
-#define simdFnmaddD          _mm512_fnmadd_pd
-#define simdFnmsubD          _mm512_fnmsub_pd
-#define simdAndD(a, b)       _mm512_castsi512_pd(_mm512_and_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simdAndNotD(a, b)    _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simdOrD(a, b)        _mm512_castsi512_pd(_mm512_or_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simdXorD(a, b)       _mm512_castsi512_pd(_mm512_xor_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simdRsqrtD(x)        _mm512_cvtpslo_pd(_mm512_rsqrt23_ps(_mm512_cvtpd_pslo(x)))
-#define simdRcpD(x)          _mm512_cvtpslo_pd(_mm512_rcp23_ps(_mm512_cvtpd_pslo(x)))
-#define simdAbsD(x)         simdAndNotD(_mm512_set1_pd(GMX_DOUBLE_NEGZERO), x)
-#define simdNegD(x)         _mm512_addn_pd(x, _mm512_setzero_pd())
-#define simdMaxD             _mm512_gmax_pd
-#define simdMinD             _mm512_gmin_pd
-#define simdRoundD(a)        _mm512_roundfxpnt_adjust_pd(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-#define simdTruncD(a)        _mm512_roundfxpnt_adjust_pd(a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
-#define simdFractionD(x)     _mm512_sub_pd(x, simdTruncD(x))
-#define simdGetExponentD(x) _mm512_getexp_pd(x)
-#define simdGetMantissaD(x) _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)
-#define simdSetExponentD(x) simdSetExponentD_mic(x)
-/* integer datatype corresponding to float: SimdFInt32
-   Doesn't use mask other than where required. No side effect expected for operating on the (unused) upper 8.
- */
-#define SimdDInt32          __m512i
-#define simdLoadDI(m)        _mm512_extload_epi64(m, _MM_UPCONV_EPI64_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
-#define simdSet1DI           _mm512_set1_epi32
-#define simdStoreDI          simdStoreDI_mic
-#define simdLoadUDI          simdLoadUDI_mic
-#define simdStoreUDI         simdStoreUDI_mic
-#define simdExtractDI        simdExtractDI_mic
-#define simdSetZeroDI        _mm512_setzero_epi32
-#define simdCvtD2I(a)        _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_NEAREST_INT)
-#define simdCvttD2I(a)       _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_ZERO)
-#define simdCvtI2D           _mm512_cvtepi32lo_pd
-/* Integer logical ops on SimdFInt32 */
-#define simdSlliDI           _mm512_slli_epi32
-#define simdSrliDI           _mm512_srli_epi32
-#define simdAndDI            _mm512_and_epi32
-#define simdAndNotDI         _mm512_andnot_epi32
-#define simdOrDI             _mm512_or_epi32
-#define simdXorDI            _mm512_xor_epi32
-/* Integer arithmetic ops on SimdFInt32 */
-#define simdAddDI            _mm512_add_epi32
-#define simdSubDI            _mm512_sub_epi32
-#define simdMulDI            _mm512_mullo_epi32
-/* Boolean & comparison operations on SimdFloat */
-#define SimdDBool           __mmask8
-#define simdCmpEqD(a, b)     _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ)
-#define simdCmpLtD(a, b)     _mm512_cmp_pd_mask(a, b, _CMP_LT_OS)
-#define simdCmpLeD(a, b)     _mm512_cmp_pd_mask(a, b, _CMP_LE_OS)
-#define simdAndDB            _mm512_kand
-#define simdOrDB             _mm512_kor
-#define simdAnyTrueDB(x)     _mm512_mask2int(x)
-#define simdMaskD(a, sel)    _mm512_mask_mov_pd(_mm512_setzero_pd(), sel, a)
-#define simdMaskNotD(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(sel), a)
-#define simdBlendD(a, b, sel)    _mm512_mask_blend_pd(sel, a, b)
-#define simdReduceD(a)       _mm512_reduce_add_pd(a)
-/* Boolean & comparison operations on SimdFInt32 */
-#define SimdDIBool          __mmask16
-#define simdCmpEqDI(a, b)    _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ)
-#define simdCmpLtDI(a, b)    _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT)
-#define simdAndDIB           _mm512_kand
-#define simdOrDIB            _mm512_kor
-#define simdAnyTrueDIB(x)    (_mm512_mask2int(x)&0xFF)
-#define simdMaskDI(a, sel)    _mm512_mask_mov_epi32(_mm512_setzero_epi32(), sel, a)
-#define simdMaskNotDI(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), _mm512_knot(sel), a)
-#define simdBlendDI(a, b, sel)    _mm512_mask_blend_epi32(sel, a, b)
-/* Conversions between booleans. Double & dint stuff is stored in low bits */
-#define simdCvtDB2DIB(x)     (x)
-#define simdCvtDIB2DB(x)     (x)
-
-/* Float/double conversion */
-#define simdCvtF2DD          simdCvtF2DD_mic
-#define simdCvtDD2F          simdCvtDD2F_mic
-
-
-#define PERM_LOW2HIGH _MM_PERM_BABA
-#define PERM_HIGH2LOW _MM_PERM_DCDC
-
-#define mask_loh _mm512_int2mask(0x00FF) /* would be better a constant - but can't initialize with a function call. */
-#define mask_hih _mm512_int2mask(0xFF00)
-
-/* load store double */
-static inline __m512d gmx_simdcall
-simdLoadUD_mic(const double * m)
-{
-    return _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m+8);
-}
-
-static inline void gmx_simdcall
-simdStoreUD_mic(double * m, __m512d s)
-{
-    _mm512_packstorelo_pd(m, s);
-    _mm512_packstorehi_pd(m+8, s);
-}
-
-/* load store dint32 */
-static inline void gmx_simdcall
-simdStoreDI_mic(std::int32_t * m, __m512i s)
-{
-    assert((size_t)m%32 == 0);
-    _mm512_mask_packstorelo_epi32(m, mask_loh, s);
-}
-
-static inline __m512i gmx_simdcall
-simdLoadUDI_mic(const std::int32_t * m)
-{
-    return _mm512_mask_loadunpackhi_epi32(_mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), mask_loh, m), mask_loh, m+16);
-}
-
-static inline void gmx_simdcall
-simdStoreUDI_mic(std::int32_t * m, __m512i s)
-{
-    _mm512_mask_packstorelo_epi32(m, mask_loh, s);
-    _mm512_mask_packstorehi_epi32(m+16, mask_loh, s);
-}
-
-static inline std::int32_t gmx_simdcall
-simdExtractDI_mic(SimdDInt32 a, int index)
-{
-    int r;
-    _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a);
-    return r;
-}
-
-/* This is likely faster than the built in scale operation (lat 8, t-put 3)
- * since we only work on the integer part and use shifts. TODO: check. given that scale also only does integer
- */
-static inline __m512d gmx_simdcall
-simdSetExponentD_mic(__m512d a)
-{
-    const __m512i expbias      = _mm512_set1_epi32(1023);
-    __m512i       iexp         = _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_NEAREST_INT);
-    iexp = _mm512_permutevar_epi32(_mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), iexp);
-    iexp = _mm512_mask_slli_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0xAAAA), _mm512_add_epi32(iexp, expbias), 20);
-    return _mm512_castsi512_pd(iexp);
-}
-
-static inline void gmx_simdcall
-simdCvtF2DD_mic(__m512 f, __m512d * d0, __m512d * d1)
-{
-    __m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f), PERM_HIGH2LOW);
-
-    *d0 = _mm512_cvtpslo_pd(f);
-    *d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
-}
-
-static inline __m512 gmx_simdcall
-simdCvtDD2F_mic(__m512d d0, __m512d d1)
-{
-    __m512 f0 = _mm512_cvtpd_pslo(d0);
-    __m512 f1 = _mm512_cvtpd_pslo(d1);
-    return _mm512_mask_permute4f128_ps(f0, mask_hih, f1, PERM_LOW2HIGH);
-}
-
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD_DOUBLE_H */
diff --git a/src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd_float.h b/src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd_float.h
deleted file mode 100644 (file)
index c2b0815..0000000
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H
-#define GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H
-
-#include "config.h"
-
-#include <cmath>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "impl_intel_mic_common.h"
-
-/****************************************************
- *      SINGLE PRECISION SIMD IMPLEMENTATION        *
- ****************************************************/
-#define SimdFloat           __m512
-#define simdLoadF            _mm512_load_ps
-#define simdLoad1F(m)        _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
-#define simdSet1F            _mm512_set1_ps
-#define simdStoreF           _mm512_store_ps
-#define simdLoadUF           simdLoadUF_mic
-#define simdStoreUF          simdStoreUF_mic
-#define simdSetZeroF         _mm512_setzero_ps
-#define simdAddF             _mm512_add_ps
-#define simdSubF             _mm512_sub_ps
-#define simdMulF             _mm512_mul_ps
-#define simdFmaddF           _mm512_fmadd_ps
-#define simdFmsubF           _mm512_fmsub_ps
-#define simdFnmaddF          _mm512_fnmadd_ps
-#define simdFnmsubF          _mm512_fnmsub_ps
-#define simdAndF(a, b)        _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simdAndNotF(a, b)     _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simdOrF(a, b)         _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simdXorF(a, b)        _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simdRsqrtF           _mm512_rsqrt23_ps
-#define simdRcpF             _mm512_rcp23_ps
-#define simdAbsF(x)         simdAndNotF(_mm512_set1_ps(GMX_FLOAT_NEGZERO), x)
-#define simdNegF(x)         _mm512_addn_ps(x, _mm512_setzero_ps())
-#define simdMaxF             _mm512_gmax_ps
-#define simdMinF             _mm512_gmin_ps
-#define simdRoundF(x)        _mm512_round_ps(x, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-#define simdTruncF(x)        _mm512_round_ps(x, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
-#define simdFractionF(x)     _mm512_sub_ps(x, simdTruncF(x))
-#define simdGetExponentF(x) _mm512_getexp_ps(x)
-#define simdGetMantissaF(x) _mm512_getmant_ps(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)
-#define simdSetExponentF(x) simdSetExponentF_mic(x)
-/* integer datatype corresponding to float: SimdFInt32 */
-#define SimdFInt32          __m512i
-#define simdLoadFI           _mm512_load_epi32
-#define simdSet1FI           _mm512_set1_epi32
-#define simdStoreFI          _mm512_store_epi32
-#define simdLoadUFI          simdLoadUFI_mic
-#define simdStoreUFI         simdStoreUFI_mic
-#define simdExtractFI        simdExtractFI_mic
-#define simdSetZeroFI        _mm512_setzero_epi32
-#define simdCvtF2I(a)        _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-#define simdCvttF2I(a)       _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
-#define simdCvtI2F(a)        _mm512_cvtfxpnt_round_adjustepi32_ps(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-/* Integer logical ops on SimdFInt32 */
-#define simdSlliFI           _mm512_slli_epi32
-#define simdSrliFI           _mm512_srli_epi32
-#define simdAndFI            _mm512_and_epi32
-#define simdAndNotFI         _mm512_andnot_epi32
-#define simdOrFI             _mm512_or_epi32
-#define simdXorFI            _mm512_xor_epi32
-/* Integer arithmetic ops on SimdFInt32 */
-#define simdAddFI            _mm512_add_epi32
-#define simdSubFI            _mm512_sub_epi32
-#define simdMulFI            _mm512_mullo_epi32
-/* Boolean & comparison operations on SimdFloat */
-#define SimdFBool           __mmask16
-#define simdCmpEqF(a, b)     _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ)
-#define simdCmpLtF(a, b)     _mm512_cmp_ps_mask(a, b, _CMP_LT_OS)
-#define simdCmpLeF(a, b)     _mm512_cmp_ps_mask(a, b, _CMP_LE_OS)
-#define simdAndFB            _mm512_kand
-#define simdAndNotFB(a, b)   _mm512_knot(_mm512_kor(a, b))
-#define simdOrFB             _mm512_kor
-#define simdAnyTrueFB        _mm512_mask2int
-#define simdMaskF(a, sel)    _mm512_mask_mov_ps(_mm512_setzero_ps(), sel, a)
-#define simdMaskNotF(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), a)
-#define simdBlendF(a, b, sel)    _mm512_mask_blend_ps(sel, a, b)
-#define simdReduceF(a)       _mm512_reduce_add_ps(a)
-/* Boolean & comparison operations on SimdFInt32 */
-#define SimdFIBool          __mmask16
-#define simdCmpEqFI(a, b)    _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ)
-#define simdCmpLtFI(a, b)    _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT)
-#define simdAndFIB           _mm512_kand
-#define simdOrFIB            _mm512_kor
-#define simdAnyTrueFIB       _mm512_mask2int
-#define simdMaskFI(a, sel)    _mm512_mask_mov_epi32(_mm512_setzero_epi32(), sel, a)
-#define simdMaskNotFI(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), _mm512_knot(sel), a)
-#define simdBlendFI(a, b, sel)    _mm512_mask_blend_epi32(sel, a, b)
-/* Conversions between different booleans */
-#define simdCvtFB2FIB(x)     (x)
-#define simdCvtFIB2FB(x)     (x)
-
-/* MIC provides full single precision of some neat functions: */
-/* 1/sqrt(x) and 1/x work fine in simd_math.h, and won't use extra iterations */
-#define simdExp2F            simdExp2F_mic
-#define simdExpF             simdExpF_mic
-#define simdLogF             simdLogF_mic
-
-/* load store float */
-static inline __m512 gmx_simdcall
-simdLoadUF_mic(const float * m)
-{
-    return _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m+16);
-}
-
-static inline void gmx_simdcall
-simdStoreUF_mic(float * m, __m512 s)
-{
-    _mm512_packstorelo_ps(m, s);
-    _mm512_packstorehi_ps(m+16, s);
-}
-
-/* load store fint32 */
-static inline __m512i gmx_simdcall
-simdLoadUFI_mic(const std::int32_t * m)
-{
-    return _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m+16);
-}
-
-static inline void gmx_simdcall
-simdStoreUFI_mic(std::int32_t * m, __m512i s)
-{
-    _mm512_packstorelo_epi32(m, s);
-    _mm512_packstorehi_epi32(m+16, s);
-}
-
-/* extract */
-static inline std::int32_t gmx_simdcall
-simdExtractFI_mic(SimdFInt32 a, int index)
-{
-    int r;
-    _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a);
-    return r;
-}
-
-/* This is likely faster than the built in scale operation (lat 8, t-put 3)
- * since we only work on the integer part and use shifts. TODO: check. given that scale also only does integer
- */
-static inline __m512 gmx_simdcall
-simdSetExponentF_mic(__m512 a)
-{
-    __m512i       iexp         = simdCvtF2I(a);
-
-    const __m512i expbias      = _mm512_set1_epi32(127);
-    iexp = _mm512_slli_epi32(_mm512_add_epi32(iexp, expbias), 23);
-    return _mm512_castsi512_ps(iexp);
-
-    /* scale alternative:
-       return _mm512_scale_ps(_mm512_set1_ps(1), iexp);
-     */
-}
-
-static inline __m512 gmx_simdcall
-simdExp2F_mic(__m512 x)
-{
-    return _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(x, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
-}
-
-static inline __m512 gmx_simdcall
-simdExpF_mic(__m512 x)
-{
-    const SimdFloat         argscale    = simdSet1F(1.44269504088896341f);
-    const SimdFloat         invargscale = simdSet1F(-0.69314718055994528623f);
-    __m512                  xscaled     = _mm512_mul_ps(x, argscale);
-    __m512                  r           = simdExp2F_mic(xscaled);
-
-    /* simdExp2F_mic() provides 23 bits of accuracy, but we ruin some of that
-     * with the argument scaling due to single-precision rounding, where the
-     * rounding error is amplified exponentially. To correct this, we find the
-     * difference between the scaled argument and the true one (extended precision
-     * arithmetics does not appear to be necessary to fulfill our accuracy requirements)
-     * and then multiply by the exponent of this correction since exp(a+b)=exp(a)*exp(b).
-     * Note that this only adds two instructions (and maybe some constant loads).
-     */
-    x         = simdFmaddF(invargscale, xscaled, x);
-    /* x will now be a _very_ small number, so approximate exp(x)=1+x.
-     * We should thus apply the correction as r'=r*(1+x)=r+r*x
-     */
-    r         = simdFmaddF(r, x, r);
-    return r;
-}
-
-static inline __m512 gmx_simdcall
-simdLogF_mic(__m512 x)
-{
-    return _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764), _mm512_log2ae23_ps(x));
-}
-
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H */
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic.h
new file mode 100644 (file)
index 0000000..5cf004b
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_H
+#define GMX_SIMD_IMPL_X86_MIC_H
+
+#include "impl_x86_mic_definitions.h"
+#include "impl_x86_mic_general.h"
+#include "impl_x86_mic_simd4_double.h"
+#include "impl_x86_mic_simd4_float.h"
+#include "impl_x86_mic_simd_double.h"
+#include "impl_x86_mic_simd_float.h"
+#include "impl_x86_mic_util_double.h"
+#include "impl_x86_mic_util_float.h"
+
+#endif // GMX_SIMD_IMPL_X86_MIC_H
similarity index 74%
rename from src/gromacs/simd/impl_intel_mic/impl_intel_mic_common.h
rename to src/gromacs/simd/impl_x86_mic/impl_x86_mic_definitions.h
index 4a72703ea0138f607ad18052659438cdebee5525..61123eb1d9e56e243124c2affeb182a2016ca7f0 100644 (file)
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_COMMON_H
-#define GMX_SIMD_IMPL_INTEL_MIC_COMMON_H
+#ifndef GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
+#define GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
 
-/* Intel Xeon Phi, or
- * the-artist-formerly-known-as-Knight's-corner, or
- * the-artist-formerly-formerly-known-as-MIC, or
- * the artist formerly-formerly-formerly-known-as-Larrabee
- * 512-bit SIMD instruction wrappers.
- */
-
-/* Capability definitions for Xeon Phi SIMD */
 #define GMX_SIMD                                   1
 #define GMX_SIMD_HAVE_FLOAT                        1
 #define GMX_SIMD_HAVE_DOUBLE                       1
 #define GMX_SIMD_HAVE_DINT32_EXTRACT               1
 #define GMX_SIMD_HAVE_DINT32_LOGICAL               1
 #define GMX_SIMD_HAVE_DINT32_ARITHMETICS           1
+#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT        0
+#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT      0
+#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT        0
+#define GMX_SIMD_HAVE_NATIVE_LOG_FLOAT             1
+#define GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT            1
+#define GMX_SIMD_HAVE_NATIVE_EXP_FLOAT             1
+#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE       0
+#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE     0
+#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE       0
+#define GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE            0
+#define GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE           0
+#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE            0
+#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT    1
+#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE   1
+#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT             1
+#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE            1
+
 #define GMX_SIMD4_HAVE_FLOAT                       1
 #define GMX_SIMD4_HAVE_DOUBLE                      1
 
-/* Implementation details */
+// Implementation details
 #define GMX_SIMD_FLOAT_WIDTH                      16
 #define GMX_SIMD_DOUBLE_WIDTH                      8
 #define GMX_SIMD_FINT32_WIDTH                     16
@@ -72,4 +81,4 @@
 #define GMX_SIMD_RSQRT_BITS                       23
 #define GMX_SIMD_RCP_BITS                         23
 
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_COMMON_H */
+#endif // GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
similarity index 85%
rename from src/gromacs/simd/impl_intel_mic/impl_intel_mic.h
rename to src/gromacs/simd/impl_x86_mic/impl_x86_mic_general.h
index c4c9f5a1e0cd9c4eb2f6f4efd38fb7b33876eedb..42b9c8cfd1179af4b8365f2fedb2d4f4f51da7c3 100644 (file)
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_H
-#define GMX_SIMD_IMPL_INTEL_MIC_H
+#ifndef GMX_SIMD_IMPL_X86_MIC_GENERAL_H
+#define GMX_SIMD_IMPL_X86_MIC_GENERAL_H
 
-#include "impl_intel_mic_simd4_double.h"
-#include "impl_intel_mic_simd4_float.h"
-#include "impl_intel_mic_simd_double.h"
-#include "impl_intel_mic_simd_float.h"
+#include <immintrin.h>
 
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_H */
+namespace gmx
+{
+
+static inline void
+simdPrefetch(const void * m)
+{
+    _mm_prefetch((const char *)m, _MM_HINT_T0);
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_OTHER_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_double.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_double.h
new file mode 100644 (file)
index 0000000..4ed2602
--- /dev/null
@@ -0,0 +1,393 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
+#define GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
+
+#include "config.h"
+
+#include <cassert>
+
+#include <immintrin.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_x86_mic_simd_double.h"
+
+namespace gmx
+{
+
+class Simd4Double
+{
+    public:
+        Simd4Double() {}
+
+        Simd4Double(double d) : simdInternal_(_mm512_set1_pd(d)) {}
+
+        // Internal utility constructor to simplify return statements
+        Simd4Double(__m512d simd) : simdInternal_(simd) {}
+
+        __m512d  simdInternal_;
+};
+
+class Simd4DBool
+{
+    public:
+        Simd4DBool() {}
+
+        // Internal utility constructor to simplify return statements
+        Simd4DBool(__mmask16 simd) : simdInternal_(simd) {}
+
+        __mmask16  simdInternal_;
+};
+
+static inline Simd4Double gmx_simdcall
+load4(const double *m)
+{
+    assert(size_t(m) % 32 == 0);
+    return {
+               _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
+    };
+}
+
+static inline void gmx_simdcall
+store4(double *m, Simd4Double a)
+{
+    assert(size_t(m) % 32 == 0);
+    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), a.simdInternal_);
+}
+
+static inline Simd4Double gmx_simdcall
+load4U(const double *m)
+{
+    return {
+               _mm512_mask_loadunpackhi_pd(_mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m), _mm512_int2mask(0xF), m+8)
+    };
+}
+
+static inline void gmx_simdcall
+store4U(double *m, Simd4Double a)
+{
+    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), a.simdInternal_);
+    _mm512_mask_packstorehi_pd(m+8, _mm512_int2mask(0xF), a.simdInternal_);
+}
+
+static inline Simd4Double gmx_simdcall
+simd4SetZeroD()
+{
+    return {
+               _mm512_setzero_pd()
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+operator&(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_castsi512_pd(_mm512_mask_and_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
+                                                         _mm512_castpd_si512(b.simdInternal_)))
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+andNot(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
+                                                            _mm512_castpd_si512(b.simdInternal_)))
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+operator|(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_castsi512_pd(_mm512_mask_or_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
+                                                        _mm512_castpd_si512(b.simdInternal_)))
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+operator^(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_castsi512_pd(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
+                                                         _mm512_castpd_si512(b.simdInternal_)))
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+operator+(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_mask_add_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+operator-(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_mask_sub_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+operator-(Simd4Double x)
+{
+    return {
+               _mm512_mask_addn_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _mm512_setzero_pd())
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+operator*(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+fma(Simd4Double a, Simd4Double b, Simd4Double c)
+{
+    return {
+               _mm512_mask_fmadd_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+fms(Simd4Double a, Simd4Double b, Simd4Double c)
+{
+    return {
+               _mm512_mask_fmsub_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+fnma(Simd4Double a, Simd4Double b, Simd4Double c)
+{
+    return {
+               _mm512_mask_fnmadd_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+fnms(Simd4Double a, Simd4Double b, Simd4Double c)
+{
+    return {
+               _mm512_mask_fnmsub_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+rsqrt(Simd4Double x)
+{
+    return {
+               _mm512_mask_cvtpslo_pd(_mm512_undefined_pd(),
+                                      _mm512_int2mask(0xF),
+                                      _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(),
+                                                             _mm512_int2mask(0xF),
+                                                             _mm512_mask_cvtpd_pslo(_mm512_undefined_ps(),
+                                                                                    _mm512_int2mask(0xF), x.simdInternal_)))
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+abs(Simd4Double x)
+{
+    return {
+               _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF),
+                                                            _mm512_castpd_si512(_mm512_set1_pd(GMX_DOUBLE_NEGZERO)),
+                                                            _mm512_castpd_si512(x.simdInternal_)))
+
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+max(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_mask_gmax_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+min(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_mask_gmin_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+round(Simd4Double x)
+{
+    return {
+               _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+trunc(Simd4Double x)
+{
+    return {
+               _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+    };
+}
+
+static inline float gmx_simdcall
+dotProduct(Simd4Double a, Simd4Double b)
+{
+    return _mm512_mask_reduce_add_pd(_mm512_int2mask(7),
+                                     _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(7),
+                                                        a.simdInternal_, b.simdInternal_));
+}
+
+static inline void gmx_simdcall
+transpose(Simd4Double * v0, Simd4Double * v1,
+          Simd4Double * v2, Simd4Double * v3)
+{
+    __m512i t0 = _mm512_mask_permute4f128_epi32(_mm512_castpd_si512(v0->simdInternal_), 0xFF00,
+                                                _mm512_castpd_si512(v1->simdInternal_), _MM_PERM_BABA);
+    __m512i t1 = _mm512_mask_permute4f128_epi32(_mm512_castpd_si512(v2->simdInternal_), 0xFF00,
+                                                _mm512_castpd_si512(v3->simdInternal_), _MM_PERM_BABA);
+
+    t0 = _mm512_permutevar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), t0);
+    t1 = _mm512_permutevar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), t1);
+
+    v0->simdInternal_ = _mm512_mask_swizzle_pd(_mm512_castsi512_pd(t0), _mm512_int2mask(0xCC),
+                                               _mm512_castsi512_pd(t1), _MM_SWIZ_REG_BADC);
+    v1->simdInternal_ = _mm512_mask_swizzle_pd(_mm512_castsi512_pd(t1), _mm512_int2mask(0x33),
+                                               _mm512_castsi512_pd(t0), _MM_SWIZ_REG_BADC);
+
+    v2->simdInternal_ = _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(v0->simdInternal_), _MM_PERM_DCDC));
+    v3->simdInternal_ = _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(v1->simdInternal_), _MM_PERM_DCDC));
+}
+
+// Picky, picky, picky:
+// icc-16 complains about "Illegal value of immediate argument to intrinsic"
+// unless we use
+// 1) Ordered-quiet for ==
+// 2) Unordered-quiet for !=
+// 3) Ordered-signaling for < and <=
+
+static inline Simd4DBool gmx_simdcall
+operator==(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
+    };
+}
+
+static inline Simd4DBool gmx_simdcall
+operator!=(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ)
+    };
+}
+
+static inline Simd4DBool gmx_simdcall
+operator<(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LT_OS)
+    };
+}
+
+static inline Simd4DBool gmx_simdcall
+operator<=(Simd4Double a, Simd4Double b)
+{
+    return {
+               _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LE_OS)
+    };
+}
+
+static inline Simd4DBool gmx_simdcall
+operator&&(Simd4DBool a, Simd4DBool b)
+{
+    return {
+               _mm512_kand(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4DBool gmx_simdcall
+operator||(Simd4DBool a, Simd4DBool b)
+{
+    return {
+               _mm512_kor(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline bool gmx_simdcall
+anyTrue(Simd4DBool a)
+{
+    return (_mm512_mask2int(a.simdInternal_) & 0xF) != 0;
+}
+
+static inline Simd4Double gmx_simdcall
+selectByMask(Simd4Double a, Simd4DBool m)
+{
+    return {
+               _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+selectByNotMask(Simd4Double a, Simd4DBool m)
+{
+    return {
+               _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(m.simdInternal_), a.simdInternal_)
+    };
+}
+
+static inline Simd4Double gmx_simdcall
+blend(Simd4Double a, Simd4Double b, Simd4DBool sel)
+{
+    return {
+               _mm512_mask_blend_pd(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline float gmx_simdcall
+reduce(Simd4Double a)
+{
+    return _mm512_mask_reduce_add_pd(_mm512_int2mask(0xF), a.simdInternal_);
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_float.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_float.h
new file mode 100644 (file)
index 0000000..2b72a39
--- /dev/null
@@ -0,0 +1,390 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
+#define GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
+
+#include "config.h"
+
+#include <cassert>
+
+#include <immintrin.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_x86_mic_simd_float.h"
+
+namespace gmx
+{
+
+class Simd4Float
+{
+    public:
+        Simd4Float() {}
+
+        Simd4Float(float f) : simdInternal_(_mm512_set1_ps(f)) {}
+
+        // Internal utility constructor to simplify return statements
+        Simd4Float(__m512 simd) : simdInternal_(simd) {}
+
+        __m512  simdInternal_;
+};
+
+class Simd4FBool
+{
+    public:
+        Simd4FBool() {}
+
+        // Internal utility constructor to simplify return statements
+        Simd4FBool(__mmask16 simd) : simdInternal_(simd) {}
+
+        __mmask16  simdInternal_;
+};
+
+static inline Simd4Float gmx_simdcall
+load4(const float *m)
+{
+    assert(size_t(m) % 16 == 0);
+    return {
+               _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE)
+    };
+}
+
+static inline void gmx_simdcall
+store4(float *m, Simd4Float a)
+{
+    assert(size_t(m) % 16 == 0);
+    _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), a.simdInternal_);
+}
+
+static inline Simd4Float gmx_simdcall
+load4U(const float *m)
+{
+    return {
+               _mm512_mask_loadunpackhi_ps(_mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m), _mm512_int2mask(0xF), m+16)
+    };
+}
+
+static inline void gmx_simdcall
+store4U(float *m, Simd4Float a)
+{
+    _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), a.simdInternal_);
+    _mm512_mask_packstorehi_ps(m+16, _mm512_int2mask(0xF), a.simdInternal_);
+}
+
+static inline Simd4Float gmx_simdcall
+simd4SetZeroF()
+{
+    return {
+               _mm512_setzero_ps()
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator&(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+                                                         _mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+andNot(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+                                                            _mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator|(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+                                                        _mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator^(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+                                                         _mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator+(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_mask_add_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator-(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_mask_sub_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator-(Simd4Float x)
+{
+    return {
+               _mm512_mask_addn_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_, _mm512_setzero_ps())
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator*(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_mask_mul_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+fma(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+               _mm512_mask_fmadd_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+fms(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+               _mm512_mask_fmsub_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+fnma(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+               _mm512_mask_fnmadd_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+fnms(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+               _mm512_mask_fnmsub_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+rsqrt(Simd4Float x)
+{
+    return {
+               _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+abs(Simd4Float x)
+{
+    return {
+               _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+                                                            _mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)),
+                                                            _mm512_castps_si512(x.simdInternal_)))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+max(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_mask_gmax_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+min(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_mask_gmin_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+round(Simd4Float x)
+{
+    return {
+               _mm512_mask_round_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF),
+                                    x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+trunc(Simd4Float x)
+{
+    return {
+               _mm512_mask_round_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF),
+                                    x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+    };
+}
+
+static inline float gmx_simdcall
+dotProduct(Simd4Float a, Simd4Float b)
+{
+    __m512 x = _mm512_mask_mul_ps(_mm512_setzero_ps(), _mm512_int2mask(0x7), a.simdInternal_, b.simdInternal_);
+    x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_BADC));
+    x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
+    float f;
+    _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), x);
+    return f;
+}
+
+static inline void gmx_simdcall
+transpose(Simd4Float * v0, Simd4Float * v1,
+          Simd4Float * v2, Simd4Float * v3)
+{
+    v0->simdInternal_ = _mm512_mask_permute4f128_ps(v0->simdInternal_, _mm512_int2mask(0x00F0), v1->simdInternal_, _MM_PERM_AAAA);
+    v2->simdInternal_ = _mm512_mask_permute4f128_ps(v2->simdInternal_, _mm512_int2mask(0x00F0), v3->simdInternal_, _MM_PERM_AAAA);
+    v0->simdInternal_ = _mm512_mask_permute4f128_ps(v0->simdInternal_, _mm512_int2mask(0xFF00), v2->simdInternal_, _MM_PERM_BABA);
+    v0->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0),
+                                                                    _mm512_castps_si512(v0->simdInternal_)));
+    v1->simdInternal_ = _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_BBBB);
+    v2->simdInternal_ = _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_CCCC);
+    v3->simdInternal_ = _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_DDDD);
+}
+
+// Picky, picky, picky:
+// icc-16 complains about "Illegal value of immediate argument to intrinsic"
+// unless we use
+// 1) Ordered-quiet for ==
+// 2) Unordered-quiet for !=
+// 3) Ordered-signaling for < and <=
+
+static inline Simd4FBool gmx_simdcall
+operator==(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
+    };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator!=(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ)
+    };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator<(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LT_OS)
+    };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator<=(Simd4Float a, Simd4Float b)
+{
+    return {
+               _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LE_OS)
+    };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator&&(Simd4FBool a, Simd4FBool b)
+{
+    return {
+               _mm512_kand(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator||(Simd4FBool a, Simd4FBool b)
+{
+    return {
+               _mm512_kor(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline bool gmx_simdcall
+anyTrue(Simd4FBool a)
+{
+    return ( _mm512_mask2int(a.simdInternal_) & 0xF) != 0;
+}
+
+static inline Simd4Float gmx_simdcall
+selectByMask(Simd4Float a, Simd4FBool m)
+{
+    return {
+               _mm512_mask_mov_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+selectByNotMask(Simd4Float a, Simd4FBool m)
+{
+    return {
+               _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(m.simdInternal_), a.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
+{
+    return {
+               _mm512_mask_blend_ps(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline float gmx_simdcall
+reduce(Simd4Float a)
+{
+    __m512 x = a.simdInternal_;
+    x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_BADC));
+    x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
+    float f;
+    _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), x);
+    return f;
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_double.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_double.h
new file mode 100644 (file)
index 0000000..c0868ee
--- /dev/null
@@ -0,0 +1,723 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
+#define GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
+
+#include "config.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include <immintrin.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_x86_mic_simd_float.h"
+
+namespace gmx
+{
+
+class SimdDouble
+{
+    public:
+        SimdDouble() {}
+
+        SimdDouble(double d) : simdInternal_(_mm512_set1_pd(d)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdDouble(__m512d simd) : simdInternal_(simd) {}
+
+        __m512d  simdInternal_;
+};
+
+class SimdDInt32
+{
+    public:
+        SimdDInt32() {}
+
+        SimdDInt32(std::int32_t i) : simdInternal_(_mm512_set1_epi32(i)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdDInt32(__m512i simd) : simdInternal_(simd) {}
+
+        __m512i  simdInternal_;
+};
+
+class SimdDBool
+{
+    public:
+        SimdDBool() {}
+
+        // Internal utility constructor to simplify return statements
+        SimdDBool(__mmask8 simd) : simdInternal_(simd) {}
+
+        __mmask8  simdInternal_;
+};
+
+class SimdDIBool
+{
+    public:
+        SimdDIBool() {}
+
+        // Internal utility constructor to simplify return statements
+        SimdDIBool(__mmask16 simd) : simdInternal_(simd) {}
+
+        __mmask16  simdInternal_;
+};
+
+static inline SimdDouble gmx_simdcall
+load(const double *m)
+{
+    assert(std::size_t(m) % 64 == 0);
+    return {
+               _mm512_load_pd(m)
+    };
+}
+
+static inline void gmx_simdcall
+store(double *m, SimdDouble a)
+{
+    assert(std::size_t(m) % 64 == 0);
+    _mm512_store_pd(m, a.simdInternal_);
+}
+
+static inline SimdDouble gmx_simdcall
+loadU(const double *m)
+{
+    return {
+               _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m+8)
+    };
+}
+
+static inline void gmx_simdcall
+storeU(double *m, SimdDouble a)
+{
+    _mm512_packstorelo_pd(m, a.simdInternal_);
+    _mm512_packstorehi_pd(m+8, a.simdInternal_);
+
+}
+
+static inline SimdDouble gmx_simdcall
+setZeroD()
+{
+    return {
+               _mm512_setzero_pd()
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+loadDI(const std::int32_t * m)
+{
+    assert(std::size_t(m) % 32 == 0);
+    return {
+               _mm512_extload_epi64(m, _MM_UPCONV_EPI64_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
+    };
+}
+
+static inline void gmx_simdcall
+store(std::int32_t * m, SimdDInt32 a)
+{
+    assert(std::size_t(m) % 32 == 0);
+    _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
+}
+
+static inline SimdDInt32 gmx_simdcall
+loadUDI(const std::int32_t *m)
+{
+    return {
+               _mm512_mask_loadunpackhi_epi32(_mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), m),
+                                              _mm512_int2mask(0x00FF), m+16)
+    };
+}
+
+static inline void gmx_simdcall
+storeU(std::int32_t * m, SimdDInt32 a)
+{
+    _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
+    _mm512_mask_packstorehi_epi32(m+16, _mm512_int2mask(0x00FF), a.simdInternal_);
+}
+
+static inline SimdDInt32 gmx_simdcall
+setZeroDI()
+{
+    return {
+               _mm512_setzero_epi32()
+    };
+}
+
+template<int index>
+static inline std::int32_t gmx_simdcall
+extract(SimdDInt32 a)
+{
+    int r;
+    _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a.simdInternal_);
+    return r;
+}
+
+static inline SimdDouble gmx_simdcall
+operator&(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_castsi512_pd(_mm512_and_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+andNot(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator|(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_castsi512_pd(_mm512_or_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator^(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_castsi512_pd(_mm512_xor_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator+(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_add_pd(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator-(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_sub_pd(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator-(SimdDouble x)
+{
+    return {
+               _mm512_addn_pd(x.simdInternal_, _mm512_setzero_pd())
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator*(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_mul_pd(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+fma(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+    return {
+               _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+fms(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+    return {
+               _mm512_fmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+fnma(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+    return {
+               _mm512_fnmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+fnms(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+    return {
+               _mm512_fnmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+rsqrt(SimdDouble x)
+{
+    return {
+               _mm512_cvtpslo_pd(_mm512_rsqrt23_ps(_mm512_cvtpd_pslo(x.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+rcp(SimdDouble x)
+{
+    return {
+               _mm512_cvtpslo_pd(_mm512_rcp23_ps(_mm512_cvtpd_pslo(x.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
+{
+    return {
+               _mm512_mask_add_pd(a.simdInternal_, m.simdInternal_, a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
+{
+    return {
+               _mm512_mask_mul_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
+{
+    return {
+               _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzRsqrt(SimdDouble x, SimdDBool m)
+{
+    return {
+               _mm512_cvtpslo_pd(_mm512_mask_rsqrt23_ps(_mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzRcp(SimdDouble x, SimdDBool m)
+{
+    return {
+               _mm512_cvtpslo_pd(_mm512_mask_rcp23_ps(_mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+abs(SimdDouble x)
+{
+    return {
+               _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(_mm512_set1_pd(GMX_DOUBLE_NEGZERO)), _mm512_castpd_si512(x.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+max(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_gmax_pd(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+min(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_gmin_pd(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+round(SimdDouble x)
+{
+    return {
+               _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+trunc(SimdDouble x)
+{
+    return {
+               _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+    };
+}
+
+static inline SimdDouble
+frexp(SimdDouble value, SimdDInt32 * exponent)
+{
+    __m512d rExponent = _mm512_getexp_pd(value.simdInternal_);
+    __m512i iExponent = _mm512_cvtfxpnt_roundpd_epi32lo(rExponent, _MM_FROUND_TO_NEAREST_INT);
+
+    exponent->simdInternal_ = _mm512_add_epi32(iExponent, _mm512_set1_epi32(1));
+
+    return {
+               _mm512_getmant_pd(value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src)
+    };
+}
+
+static inline SimdDouble
+ldexp(SimdDouble value, SimdDInt32 exponent)
+{
+    const __m512i exponentBias = _mm512_set1_epi32(1023);
+    __m512i       iExponent;
+
+    iExponent = _mm512_permutevar_epi32(_mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), exponent.simdInternal_);
+    iExponent = _mm512_mask_slli_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0xAAAA), _mm512_add_epi32(iExponent, exponentBias), 20);
+    return _mm512_mul_pd(_mm512_castsi512_pd(iExponent), value.simdInternal_);
+}
+
+static inline double gmx_simdcall
+reduce(SimdDouble a)
+{
+    return _mm512_reduce_add_pd(a.simdInternal_);
+}
+
+// Picky, picky, picky:
+// icc-16 complains about "Illegal value of immediate argument to intrinsic"
+// unless we use
+// 1) Ordered-quiet for ==
+// 2) Unordered-quiet for !=
+// 3) Ordered-signaling for < and <=
+
+static inline SimdDBool gmx_simdcall
+operator==(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+operator!=(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ)
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+operator<(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LT_OS)
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+operator<=(SimdDouble a, SimdDouble b)
+{
+    return {
+               _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LE_OS)
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+testBits(SimdDouble a)
+{
+    // This is a bit problematic since Knight's corner does not have any 64-bit integer comparisons,
+    // and we cannot use floating-point since values with just a single bit set can evaluate to 0.0.
+    // Instead, we do it as
+    // 1) Do a logical or of the high/low 32 bits
+    // 2) Do a permute so we have the low 32 bits of each value in the low 8 32-bit elements
+    // 3) Do an integer comparison, and cast so we just keep the low 8 bits of the mask.
+    //
+    // By default we will use integers for the masks in the nonbonded kernels, so this shouldn't
+    // have any significant performance drawbacks.
+
+    __m512i ia = _mm512_castpd_si512(a.simdInternal_);
+
+    ia = _mm512_or_epi32(ia, _mm512_swizzle_epi32(ia, _MM_SWIZ_REG_CDAB));
+    ia = _mm512_permutevar_epi32( _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), ia);
+
+    return {
+               static_cast<__mmask8>(_mm512_cmp_epi32_mask(ia, _mm512_setzero_si512(), _MM_CMPINT_NE))
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+operator&&(SimdDBool a, SimdDBool b)
+{
+    return {
+               static_cast<__mmask8>(_mm512_kand(a.simdInternal_, b.simdInternal_))
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+operator||(SimdDBool a, SimdDBool b)
+{
+    return {
+               static_cast<__mmask8>(_mm512_kor(a.simdInternal_, b.simdInternal_))
+    };
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdDBool a)
+{
+    return _mm512_mask2int(a.simdInternal_) != 0;
+}
+
+static inline SimdDouble gmx_simdcall
+selectByMask(SimdDouble a, SimdDBool m)
+{
+    return {
+               _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+selectByNotMask(SimdDouble a, SimdDBool m)
+{
+    return {
+               _mm512_mask_mov_pd(a.simdInternal_, m.simdInternal_, _mm512_setzero_pd())
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+blend(SimdDouble a, SimdDouble b, SimdDBool sel)
+{
+    return {
+               _mm512_mask_blend_pd(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator<<(SimdDInt32 a, int n)
+{
+    return {
+               _mm512_slli_epi32(a.simdInternal_, n)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator>>(SimdDInt32 a, int n)
+{
+    return {
+               _mm512_srli_epi32(a.simdInternal_, n)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator&(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+               _mm512_and_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+andNot(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+               _mm512_andnot_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator|(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+               _mm512_or_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator^(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+               _mm512_xor_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator+(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+               _mm512_add_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator-(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+               _mm512_sub_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator*(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+               _mm512_mullo_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator==(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+               _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_EQ)
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+testBits(SimdDInt32 a)
+{
+    return {
+               _mm512_cmp_epi32_mask(a.simdInternal_, _mm512_setzero_si512(), _MM_CMPINT_NE)
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator<(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+               _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_LT)
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator&&(SimdDIBool a, SimdDIBool b)
+{
+    return {
+               _mm512_kand(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator||(SimdDIBool a, SimdDIBool b)
+{
+    return {
+               _mm512_kor(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdDIBool a)
+{
+    return ( _mm512_mask2int(a.simdInternal_) & 0xFF) != 0;
+}
+
+static inline SimdDInt32 gmx_simdcall
+selectByMask(SimdDInt32 a, SimdDIBool m)
+{
+    return {
+               _mm512_mask_mov_epi32(_mm512_setzero_epi32(), m.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+selectByNotMask(SimdDInt32 a, SimdDIBool m)
+{
+    return {
+               _mm512_mask_mov_epi32(a.simdInternal_, m.simdInternal_, _mm512_setzero_epi32())
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
+{
+    return {
+               _mm512_mask_blend_epi32(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+cvtR2I(SimdDouble a)
+{
+    return {
+               _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_NEAREST_INT)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+cvttR2I(SimdDouble a)
+{
+    return {
+               _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_ZERO)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+cvtI2R(SimdDInt32 a)
+{
+    return {
+               _mm512_cvtepi32lo_pd(a.simdInternal_)
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+cvtB2IB(SimdDBool a)
+{
+    return {
+               a.simdInternal_
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+cvtIB2B(SimdDIBool a)
+{
+    return {
+               static_cast<__mmask8>(a.simdInternal_)
+    };
+}
+
+static inline void gmx_simdcall
+cvtF2DD(SimdFloat f, SimdDouble *d0, SimdDouble *d1)
+{
+    __m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f.simdInternal_), _MM_PERM_DCDC);
+
+    *d0 = _mm512_cvtpslo_pd(f.simdInternal_);
+    *d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
+}
+
+static inline SimdFloat gmx_simdcall
+cvtDD2F(SimdDouble d0, SimdDouble d1)
+{
+    __m512 f0 = _mm512_cvtpd_pslo(d0.simdInternal_);
+    __m512 f1 = _mm512_cvtpd_pslo(d1.simdInternal_);
+    return {
+               _mm512_mask_permute4f128_ps(f0, _mm512_int2mask(0xFF00), f1, _MM_PERM_BABA)
+    };
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_float.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_float.h
new file mode 100644 (file)
index 0000000..3497b82
--- /dev/null
@@ -0,0 +1,732 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
+#define GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
+
+#include "config.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include <immintrin.h>
+
+namespace gmx
+{
+
+class SimdFloat
+{
+    public:
+        SimdFloat() {}
+
+        SimdFloat(float f) : simdInternal_(_mm512_set1_ps(f)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdFloat(__m512 simd) : simdInternal_(simd) {}
+
+        __m512  simdInternal_;
+};
+
+class SimdFInt32
+{
+    public:
+        SimdFInt32() {}
+
+        SimdFInt32(std::int32_t i) : simdInternal_(_mm512_set1_epi32(i)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdFInt32(__m512i simd) : simdInternal_(simd) {}
+
+        __m512i  simdInternal_;
+};
+
+class SimdFBool
+{
+    public:
+        SimdFBool() {}
+
+        SimdFBool(bool b) : simdInternal_(_mm512_int2mask( b ? 0xFFFF : 0)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdFBool(__mmask16 simd) : simdInternal_(simd) {}
+
+        __mmask16  simdInternal_;
+};
+
+class SimdFIBool
+{
+    public:
+        SimdFIBool() {}
+
+        SimdFIBool(bool b) : simdInternal_(_mm512_int2mask( b ? 0xFFFF : 0)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdFIBool(__mmask16 simd) : simdInternal_(simd) {}
+
+        __mmask16  simdInternal_;
+};
+
+static inline SimdFloat gmx_simdcall
+load(const float *m)
+{
+    assert(std::size_t(m) % 64 == 0);
+    return {
+               _mm512_load_ps(m)
+    };
+}
+
+static inline void gmx_simdcall
+store(float *m, SimdFloat a)
+{
+    assert(std::size_t(m) % 64 == 0);
+    _mm512_store_ps(m, a.simdInternal_);
+}
+
+static inline SimdFloat gmx_simdcall
+loadU(const float *m)
+{
+    return {
+               _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m+16)
+    };
+}
+
+static inline void gmx_simdcall
+storeU(float *m, SimdFloat a)
+{
+    _mm512_packstorelo_ps(m, a.simdInternal_);
+    _mm512_packstorehi_ps(m+16, a.simdInternal_);
+}
+
+static inline SimdFloat gmx_simdcall
+setZeroF()
+{
+    return {
+               _mm512_setzero_ps()
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+loadFI(const std::int32_t * m)
+{
+    assert(std::size_t(m) % 64 == 0);
+    return {
+               _mm512_load_epi32(m)
+    };
+}
+
+static inline void gmx_simdcall
+store(std::int32_t * m, SimdFInt32 a)
+{
+    assert(std::size_t(m) % 64 == 0);
+    _mm512_store_epi32(m, a.simdInternal_);
+}
+
+static inline SimdFInt32 gmx_simdcall
+loadUFI(const std::int32_t *m)
+{
+    return {
+               _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m+16)
+    };
+}
+
+static inline void gmx_simdcall
+storeU(std::int32_t * m, SimdFInt32 a)
+{
+    _mm512_packstorelo_epi32(m, a.simdInternal_);
+    _mm512_packstorehi_epi32(m+16, a.simdInternal_);
+}
+
+static inline SimdFInt32 gmx_simdcall
+setZeroFI()
+{
+    return {
+               _mm512_setzero_si512()
+    };
+}
+
+
+template<int index>
+static inline std::int32_t gmx_simdcall
+extract(SimdFInt32 a)
+{
+    int r;
+    _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a.simdInternal_);
+    return r;
+}
+
+static inline SimdFloat gmx_simdcall
+operator&(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+andNot(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator|(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator^(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator+(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_add_ps(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator-(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_sub_ps(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator-(SimdFloat x)
+{
+    return {
+               _mm512_addn_ps(x.simdInternal_, _mm512_setzero_ps())
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator*(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_mul_ps(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+fma(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+    return {
+               _mm512_fmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+fms(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+    return {
+               _mm512_fmsub_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+fnma(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+    return {
+               _mm512_fnmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+fnms(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+    return {
+               _mm512_fnmsub_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+rsqrt(SimdFloat x)
+{
+    return {
+               _mm512_rsqrt23_ps(x.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+rcp(SimdFloat x)
+{
+    return {
+               _mm512_rcp23_ps(x.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+maskAdd(SimdFloat a, SimdFloat b, SimdFBool m)
+{
+    return {
+               _mm512_mask_add_ps(a.simdInternal_, m.simdInternal_, a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzMul(SimdFloat a, SimdFloat b, SimdFBool m)
+{
+    return {
+               _mm512_mask_mul_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzFma(SimdFloat a, SimdFloat b, SimdFloat c, SimdFBool m)
+{
+    return {
+               _mm512_mask_mov_ps(_mm512_setzero_ps(), m.simdInternal_, _mm512_fmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzRsqrt(SimdFloat x, SimdFBool m)
+{
+    return {
+               _mm512_mask_rsqrt23_ps(_mm512_setzero_ps(), m.simdInternal_, x.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzRcp(SimdFloat x, SimdFBool m)
+{
+    return {
+               _mm512_mask_rcp23_ps(_mm512_setzero_ps(), m.simdInternal_, x.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+abs(SimdFloat x)
+{
+    return {
+               _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)), _mm512_castps_si512(x.simdInternal_)))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+max(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_gmax_ps(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+min(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_gmin_ps(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+round(SimdFloat x)
+{
+    return {
+               _mm512_round_ps(x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+trunc(SimdFloat x)
+{
+    return {
+               _mm512_round_ps(x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+frexp(SimdFloat value, SimdFInt32 * exponent)
+{
+    __m512  rExponent = _mm512_getexp_ps(value.simdInternal_);
+    __m512i iExponent =  _mm512_cvtfxpnt_round_adjustps_epi32(rExponent, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
+
+    exponent->simdInternal_ = _mm512_add_epi32(iExponent, _mm512_set1_epi32(1));
+
+    return {
+               _mm512_getmant_ps(value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+ldexp(SimdFloat value, SimdFInt32 exponent)
+{
+    const __m512i exponentBias = _mm512_set1_epi32(127);
+    __m512i       iExponent;
+
+    iExponent = _mm512_slli_epi32( _mm512_add_epi32(exponent.simdInternal_, exponentBias), 23);
+
+    return {
+               _mm512_mul_ps(value.simdInternal_, _mm512_castsi512_ps(iExponent))
+    };
+}
+
+static inline float gmx_simdcall
+reduce(SimdFloat a)
+{
+    return _mm512_reduce_add_ps(a.simdInternal_);
+}
+
+// Picky, picky, picky:
+// icc-16 complains about "Illegal value of immediate argument to intrinsic"
+// unless we use
+// 1) Ordered-quiet for ==
+// 2) Unordered-quiet for !=
+// 3) Ordered-signaling for < and <=
+
+static inline SimdFBool gmx_simdcall
+operator==(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+operator!=(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ)
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+operator<(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_LT_OS)
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+operator<=(SimdFloat a, SimdFloat b)
+{
+    return {
+               _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_LE_OS)
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+testBits(SimdFloat a)
+{
+    return {
+               _mm512_test_epi32_mask( _mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(a.simdInternal_) )
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+operator&&(SimdFBool a, SimdFBool b)
+{
+    return {
+               _mm512_kand(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+operator||(SimdFBool a, SimdFBool b)
+{
+    return {
+               _mm512_kor(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdFBool a)
+{
+    return _mm512_mask2int(a.simdInternal_) != 0;
+}
+
+static inline SimdFloat gmx_simdcall
+selectByMask(SimdFloat a, SimdFBool m)
+{
+    return {
+               _mm512_mask_mov_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+selectByNotMask(SimdFloat a, SimdFBool m)
+{
+    return {
+               _mm512_mask_mov_ps(a.simdInternal_, m.simdInternal_, _mm512_setzero_ps())
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+blend(SimdFloat a, SimdFloat b, SimdFBool sel)
+{
+    return {
+               _mm512_mask_blend_ps(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator<<(SimdFInt32 a, int n)
+{
+    return {
+               _mm512_slli_epi32(a.simdInternal_, n)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator>>(SimdFInt32 a, int n)
+{
+    return {
+               _mm512_srli_epi32(a.simdInternal_, n)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator&(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               _mm512_and_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+andNot(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               _mm512_andnot_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator|(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               _mm512_or_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator^(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               _mm512_xor_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator+(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               _mm512_add_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator-(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               _mm512_sub_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator*(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               _mm512_mullo_epi32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator==(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_EQ)
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+testBits(SimdFInt32 a)
+{
+    return {
+               _mm512_test_epi32_mask( a.simdInternal_, a.simdInternal_ )
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator<(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_LT)
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator&&(SimdFIBool a, SimdFIBool b)
+{
+    return {
+               _mm512_kand(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator||(SimdFIBool a, SimdFIBool b)
+{
+    return {
+               _mm512_kor(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdFIBool a)
+{
+    return _mm512_mask2int(a.simdInternal_) != 0;
+}
+
+static inline SimdFInt32 gmx_simdcall
+selectByMask(SimdFInt32 a, SimdFIBool m)
+{
+    return {
+               _mm512_mask_mov_epi32(_mm512_setzero_epi32(), m.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+selectByNotMask(SimdFInt32 a, SimdFIBool m)
+{
+    return {
+               _mm512_mask_mov_epi32(a.simdInternal_, m.simdInternal_, _mm512_setzero_epi32())
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+blend(SimdFInt32 a, SimdFInt32 b, SimdFIBool sel)
+{
+    return {
+               _mm512_mask_blend_epi32(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+cvtR2I(SimdFloat a)
+{
+    return {
+               _mm512_cvtfxpnt_round_adjustps_epi32(a.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+cvttR2I(SimdFloat a)
+{
+    return {
+               _mm512_cvtfxpnt_round_adjustps_epi32(a.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+cvtI2R(SimdFInt32 a)
+{
+    return {
+               _mm512_cvtfxpnt_round_adjustepi32_ps(a.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+cvtB2IB(SimdFBool a)
+{
+    return {
+               a.simdInternal_
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+cvtIB2B(SimdFIBool a)
+{
+    return {
+               a.simdInternal_
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+exp2(SimdFloat x)
+{
+    return {
+               _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(x.simdInternal_, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+exp(SimdFloat x)
+{
+    const __m512     argscale    = _mm512_set1_ps(1.44269504088896341f);
+    const __m512     invargscale = _mm512_set1_ps(-0.69314718055994528623f);
+
+    __m512           xscaled     = _mm512_mul_ps(x.simdInternal_, argscale);
+    __m512           r           = _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(xscaled, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
+
+    // exp2a23_ps provides 23 bits of accuracy, but we ruin some of that with our argument
+    // scaling. To correct this, we find the difference between the scaled argument and
+    // the true one (extended precision arithmetics does not appear to be necessary to
+    // fulfill our accuracy requirements) and then multiply by the exponent of this
+    // correction since exp(a+b)=exp(a)*exp(b).
+    // Note that this only adds two instructions (and maybe some constant loads).
+
+    // find the difference
+    x         = _mm512_fmadd_ps(invargscale, xscaled, x.simdInternal_);
+    // x will now be a _very_ small number, so approximate exp(x)=1+x.
+    // We should thus apply the correction as r'=r*(1+x)=r+r*x
+    r         = _mm512_fmadd_ps(r, x.simdInternal_, r);
+    return {
+               r
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+log(SimdFloat x)
+{
+    return {
+               _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764f), _mm512_log2ae23_ps(x.simdInternal_))
+    };
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_double.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_double.h
new file mode 100644 (file)
index 0000000..c971d27
--- /dev/null
@@ -0,0 +1,463 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
+#define GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
+
+#include "config.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include <immintrin.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_x86_mic_simd_double.h"
+
+namespace gmx
+{
+
+// On MIC it is better to use scatter operations, so we define the load routines
+// that use a SIMD offset variable first.
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const double *  base,
+                             SimdDInt32      simdoffset,
+                             SimdDouble *    v0,
+                             SimdDouble *    v1,
+                             SimdDouble *    v2,
+                             SimdDouble *    v3)
+{
+    assert((size_t)base % 32 == 0);
+    assert(align % 4 == 0);
+
+    // All instructions might be latency ~4 on MIC, so we use shifts where we
+    // only need a single instruction (since the shift parameter is an immediate),
+    // but multiplication otherwise.
+    if (align == 4)
+    {
+        simdoffset = simdoffset << 2;
+    }
+    else if (align == 8)
+    {
+        simdoffset = simdoffset << 3;
+    }
+    else
+    {
+        simdoffset = simdoffset * SimdDInt32(align);
+    }
+
+    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
+    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
+    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+2, sizeof(double));
+    v3->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+3, sizeof(double));
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUBySimdIntTranspose(const double *  base,
+                              SimdDInt32      simdoffset,
+                              SimdDouble *    v0,
+                              SimdDouble *    v1)
+{
+    // All instructions might be latency ~4 on MIC, so we use shifts where we
+    // only need a single instruction (since the shift parameter is an immediate),
+    // but multiplication otherwise.
+    if (align == 2)
+    {
+        simdoffset = simdoffset << 1;
+    }
+    else if (align == 4)
+    {
+        simdoffset = simdoffset << 2;
+    }
+    else if (align == 8)
+    {
+        simdoffset = simdoffset << 3;
+    }
+    else
+    {
+        simdoffset = simdoffset * SimdDInt32(align);
+    }
+
+    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
+    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const double *    base,
+                             SimdDInt32        simdoffset,
+                             SimdDouble *      v0,
+                             SimdDouble *      v1)
+{
+    assert(std::size_t(base) % 16 == 0);
+    assert(align % 2 == 0);
+    gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
+}
+
+
+
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTranspose(const double *        base,
+                    const std::int32_t    offset[],
+                    SimdDouble *          v0,
+                    SimdDouble *          v1,
+                    SimdDouble *          v2,
+                    SimdDouble *          v3)
+{
+    gatherLoadBySimdIntTranspose<align>(base, loadDI(offset), v0, v1, v2, v3);
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTranspose(const double *        base,
+                    const std::int32_t    offset[],
+                    SimdDouble *          v0,
+                    SimdDouble *          v1)
+{
+    gatherLoadBySimdIntTranspose<align>(base, loadDI(offset), v0, v1);
+}
+
+static const int c_simdBestPairAlignmentDouble = 2;
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTranspose(const double *        base,
+                     const std::int32_t    offset[],
+                     SimdDouble *          v0,
+                     SimdDouble *          v1,
+                     SimdDouble *          v2)
+{
+    SimdDInt32 simdoffset;
+
+    assert(std::size_t(offset) % 32 == 0);
+
+    simdoffset = loadDI(offset);
+
+    // All instructions might be latency ~4 on MIC, so we use shifts where we
+    // only need a single instruction (since the shift parameter is an immediate),
+    // but multiplication otherwise.
+    if (align == 4)
+    {
+        simdoffset = simdoffset << 2;
+    }
+    else if (align == 8)
+    {
+        simdoffset = simdoffset << 3;
+    }
+    else
+    {
+        simdoffset = simdoffset * SimdDInt32(align);
+    }
+
+    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base,   sizeof(double));
+    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
+    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+2, sizeof(double));
+}
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterStoreU(double *            base,
+                       const std::int32_t  offset[],
+                       SimdDouble          v0,
+                       SimdDouble          v1,
+                       SimdDouble          v2)
+{
+    SimdDInt32 simdoffset;
+
+    assert(std::size_t(offset) % 32 == 0);
+
+    simdoffset = loadDI(offset);
+
+    // All instructions might be latency ~4 on MIC, so we use shifts where we
+    // only need a single instruction (since the shift parameter is an immediate),
+    // but multiplication otherwise.
+    if (align == 4)
+    {
+        simdoffset = simdoffset << 2;
+    }
+    else if (align == 8)
+    {
+        simdoffset = simdoffset << 3;
+    }
+    else
+    {
+        simdoffset = simdoffset * SimdDInt32(align);
+    }
+
+    _mm512_i32loscatter_pd(base,   simdoffset.simdInternal_, v0.simdInternal_, sizeof(double));
+    _mm512_i32loscatter_pd(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(double));
+    _mm512_i32loscatter_pd(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(double));
+}
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterIncrU(double *            base,
+                      const std::int32_t  offset[],
+                      SimdDouble          v0,
+                      SimdDouble          v1,
+                      SimdDouble          v2)
+{
+    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata0[GMX_SIMD_DOUBLE_WIDTH];
+    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata1[GMX_SIMD_DOUBLE_WIDTH];
+    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata2[GMX_SIMD_DOUBLE_WIDTH];
+
+    store(rdata0, v0);
+    store(rdata1, v1);
+    store(rdata2, v2);
+
+    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+        base[ align * offset[i] + 0] += rdata0[i];
+        base[ align * offset[i] + 1] += rdata1[i];
+        base[ align * offset[i] + 2] += rdata2[i];
+    }
+}
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterDecrU(double *            base,
+                      const std::int32_t  offset[],
+                      SimdDouble          v0,
+                      SimdDouble          v1,
+                      SimdDouble          v2)
+{
+    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata0[GMX_SIMD_DOUBLE_WIDTH];
+    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata1[GMX_SIMD_DOUBLE_WIDTH];
+    GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH)  rdata2[GMX_SIMD_DOUBLE_WIDTH];
+
+    store(rdata0, v0);
+    store(rdata1, v1);
+    store(rdata2, v2);
+
+    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+        base[ align * offset[i] + 0] -= rdata0[i];
+        base[ align * offset[i] + 1] -= rdata1[i];
+        base[ align * offset[i] + 2] -= rdata2[i];
+    }
+}
+
+static inline void gmx_simdcall
+expandScalarsToTriplets(SimdDouble    scalar,
+                        SimdDouble *  triplets0,
+                        SimdDouble *  triplets1,
+                        SimdDouble *  triplets2)
+{
+    triplets0->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0),
+                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
+    triplets1->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(11, 10, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 5, 4),
+                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
+    triplets2->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 11, 10, 11, 10),
+                                                                           _mm512_castpd_si512(scalar.simdInternal_)));
+}
+
+
+static inline double gmx_simdcall
+reduceIncr4ReturnSum(double *    m,
+                     SimdDouble  v0,
+                     SimdDouble  v1,
+                     SimdDouble  v2,
+                     SimdDouble  v3)
+{
+    double  d;
+    __m512d t0, t1, t2, t3;
+
+    assert(std::size_t(m) % 32 == 0);
+
+    t0 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v0.simdInternal_, v2.simdInternal_), _MM_SWIZ_REG_BADC);
+    t2 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v2.simdInternal_, v0.simdInternal_);
+    t1 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v1.simdInternal_, v3.simdInternal_), _MM_SWIZ_REG_BADC);
+    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v3.simdInternal_, v1.simdInternal_);
+    t0 = _mm512_add_pd(t0, t2);
+    t1 = _mm512_add_pd(t1, t3);
+
+    t2 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0b01010101), t0, t1), _MM_SWIZ_REG_CDAB);
+    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0b01010101), t1, t0);
+    t2 = _mm512_add_pd(t2, t3);
+
+    t2 = _mm512_add_pd(t2, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(t2), _MM_PERM_BADC)));
+
+    t0 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+    t0 = _mm512_add_pd(t0, t2);
+    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t0);
+
+    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_BADC));
+    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_CDAB));
+
+    _mm512_mask_packstorelo_pd(&d, _mm512_mask2int(0x01), t2);
+    return d;
+}
+
+static inline SimdDouble gmx_simdcall
+loadDualHsimd(const double * m0,
+              const double * m1)
+{
+    assert(std::size_t(m0) % 32 == 0);
+    assert(std::size_t(m1) % 32 == 0);
+
+    return _mm512_mask_extload_pd(_mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE), _mm512_int2mask(0xF0),
+                                  m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+}
+
+static inline SimdDouble gmx_simdcall
+loadDuplicateHsimd(const double * m)
+{
+    assert(std::size_t(m) % 32 == 0);
+
+    return _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+}
+
+static inline SimdDouble gmx_simdcall
+load1DualHsimd(const double * m)
+{
+    return _mm512_mask_extload_pd(_mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE), _mm512_int2mask(0xF0),
+                                  m+1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+}
+
+
+static inline void gmx_simdcall
+storeDualHsimd(double *     m0,
+               double *     m1,
+               SimdDouble   a)
+{
+    assert(std::size_t(m0) % 32 == 0);
+    assert(std::size_t(m1) % 32 == 0);
+
+    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), a.simdInternal_);
+    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), a.simdInternal_);
+}
+
+static inline void gmx_simdcall
+incrDualHsimd(double *     m0,
+              double *     m1,
+              SimdDouble   a)
+{
+    assert(std::size_t(m0) % 32 == 0);
+    assert(std::size_t(m1) % 32 == 0);
+
+    __m512d x;
+
+    // Update lower half
+    x = _mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+    x = _mm512_add_pd(x, a.simdInternal_);
+    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), x);
+
+    // Update upper half
+    x = _mm512_extload_pd(m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+    x = _mm512_add_pd(x, a.simdInternal_);
+    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), x);
+}
+
+static inline void gmx_simdcall
+decrHsimd(double *    m,
+          SimdDouble  a)
+{
+    __m512d t;
+
+    assert(std::size_t(m) % 32 == 0);
+
+    t               = _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+    a.simdInternal_ = _mm512_add_pd(a.simdInternal_, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(a.simdInternal_), _MM_PERM_BADC)));
+    t               = _mm512_sub_pd(t, a.simdInternal_);
+    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0x0F), t);
+}
+
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTransposeHsimd(const double *       base0,
+                         const double *       base1,
+                         const std::int32_t   offset[],
+                         SimdDouble *         v0,
+                         SimdDouble *         v1)
+{
+    __m512i  idx0, idx1, idx;
+    __m512d  tmp1, tmp2;
+
+    assert(std::size_t(offset) % 16 == 0);
+    assert(std::size_t(base0) % 16 == 0);
+    assert(std::size_t(base1) % 16 == 0);
+    assert(std::size_t(align) % 2  == 0);
+
+    idx0 = _mm512_extload_epi32(offset, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
+
+    idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
+    idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));
+
+    idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0x00F0), idx1, _MM_PERM_AAAA);
+
+    tmp1 = _mm512_i32logather_pd(idx, base0, sizeof(double));
+    tmp2 = _mm512_i32logather_pd(idx, base1, sizeof(double));
+
+    v0->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(tmp1), _mm512_int2mask(0xFF00), _mm512_castpd_ps(tmp2), _MM_PERM_BABA));
+    v1->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(tmp2), _mm512_int2mask(0x00FF), _mm512_castpd_ps(tmp1), _MM_PERM_DCDC));
+}
+
+static inline double gmx_simdcall
+reduceIncr4ReturnSumHsimd(double *     m,
+                          SimdDouble   v0,
+                          SimdDouble   v1)
+{
+    double   d;
+    __m512d  t0, t1;
+
+    assert(std::size_t(m) % 32 == 0);
+
+    t0 = _mm512_add_pd(v0.simdInternal_, _mm512_swizzle_pd(v0.simdInternal_, _MM_SWIZ_REG_BADC));
+    t0 = _mm512_mask_add_pd(t0, _mm512_int2mask(0xCC), v1.simdInternal_, _mm512_swizzle_pd(v1.simdInternal_, _MM_SWIZ_REG_BADC));
+    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
+    t0 = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(t0), _mm512_int2mask(0xCCCC),
+                                                      _mm512_castpd_ps(t0), _MM_PERM_DCDC));
+
+    t1 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+    t1 = _mm512_add_pd(t1, t0);
+    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t1);
+
+    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_BADC));
+    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
+
+    _mm512_mask_packstorelo_pd(&d, _mm512_mask2int(0x03), t0);
+    return d;
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_float.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_float.h
new file mode 100644 (file)
index 0000000..ad50659
--- /dev/null
@@ -0,0 +1,465 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
+#define GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
+
+#include "config.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include <immintrin.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_x86_mic_simd_float.h"
+
+namespace gmx
+{
+
+// On MIC it is better to use scatter operations, so we define the load routines
+// that use a SIMD offset variable first.
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const float *  base,
+                             SimdFInt32     simdoffset,
+                             SimdFloat *    v0,
+                             SimdFloat *    v1,
+                             SimdFloat *    v2,
+                             SimdFloat *    v3)
+{
+    assert(std::size_t(base) % 16 == 0);
+    assert(align % 4 == 0);
+
+    // All instructions might be latency ~4 on MIC, so we use shifts where we
+    // only need a single instruction (since the shift parameter is an immediate),
+    // but multiplication otherwise.
+    if (align == 4)
+    {
+        simdoffset = simdoffset << 2;
+    }
+    else if (align == 8)
+    {
+        simdoffset = simdoffset << 3;
+    }
+    else
+    {
+        simdoffset = simdoffset * SimdFInt32(align);
+    }
+
+    v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
+    v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
+    v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+2, sizeof(float));
+    v3->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+3, sizeof(float));
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUBySimdIntTranspose(const float *  base,
+                              SimdFInt32     simdoffset,
+                              SimdFloat *    v0,
+                              SimdFloat *    v1)
+{
+    // All instructions might be latency ~4 on MIC, so we use shifts where we
+    // only need a single instruction (since the shift parameter is an immediate),
+    // but multiplication otherwise.
+    // For align == 2 we can merge the constant into the scale parameter,
+    // which can take constants up to 8 in total.
+    if (align == 2)
+    {
+        v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   align * sizeof(float));
+        v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, align * sizeof(float));
+    }
+    else
+    {
+        if (align == 4)
+        {
+            simdoffset = simdoffset << 2;
+        }
+        else if (align == 8)
+        {
+            simdoffset = simdoffset << 3;
+        }
+        else
+        {
+            simdoffset = simdoffset * SimdFInt32(align);
+        }
+        v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
+        v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
+    }
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const float *   base,
+                             SimdFInt32      simdoffset,
+                             SimdFloat *     v0,
+                             SimdFloat *     v1)
+{
+    assert(std::size_t(base) % 8 == 0);
+    assert(align % 2 == 0);
+    gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTranspose(const float *        base,
+                    const std::int32_t   offset[],
+                    SimdFloat *          v0,
+                    SimdFloat *          v1,
+                    SimdFloat *          v2,
+                    SimdFloat *          v3)
+{
+    gatherLoadBySimdIntTranspose<align>(base, loadFI(offset), v0, v1, v2, v3);
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTranspose(const float *        base,
+                    const std::int32_t   offset[],
+                    SimdFloat *          v0,
+                    SimdFloat *          v1)
+{
+    gatherLoadBySimdIntTranspose<align>(base, loadFI(offset), v0, v1);
+}
+
+static const int c_simdBestPairAlignmentFloat = 2;
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTranspose(const float *        base,
+                     const std::int32_t   offset[],
+                     SimdFloat *          v0,
+                     SimdFloat *          v1,
+                     SimdFloat *          v2)
+{
+    SimdFInt32 simdoffset;
+
+    assert(std::size_t(offset) % 64 == 0);
+
+    simdoffset = loadFI(offset);
+
+    // All instructions might be latency ~4 on MIC, so we use shifts where we
+    // only need a single instruction (since the shift parameter is an immediate),
+    // but multiplication otherwise.
+    if (align == 4)
+    {
+        simdoffset = simdoffset << 2;
+    }
+    else if (align == 8)
+    {
+        simdoffset = simdoffset << 3;
+    }
+    else
+    {
+        simdoffset = simdoffset * SimdFInt32(align);
+    }
+
+    v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base,   sizeof(float));
+    v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
+    v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+2, sizeof(float));
+}
+
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterStoreU(float *              base,
+                       const std::int32_t   offset[],
+                       SimdFloat            v0,
+                       SimdFloat            v1,
+                       SimdFloat            v2)
+{
+    SimdFInt32 simdoffset;
+
+    assert(sdt::size_t(offset) % 64 == 0);
+
+    simdoffset = loadFI(offset);
+
+    // All instructions might be latency ~4 on MIC, so we use shifts where we
+    // only need a single instruction (since the shift parameter is an immediate),
+    // but multiplication otherwise.
+    if (align == 4)
+    {
+        simdoffset = simdoffset << 2;
+    }
+    else if (align == 8)
+    {
+        simdoffset = simdoffset << 3;
+    }
+    else
+    {
+        simdoffset = simdoffset * SimdFInt32(align);
+    }
+
+    _mm512_i32scatter_ps(base,   simdoffset.simdInternal_, v0.simdInternal_, sizeof(float));
+    _mm512_i32scatter_ps(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(float));
+    _mm512_i32scatter_ps(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(float));
+}
+
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterIncrU(float *              base,
+                      const std::int32_t   offset[],
+                      SimdFloat            v0,
+                      SimdFloat            v1,
+                      SimdFloat            v2)
+{
+    GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata0[GMX_SIMD_FLOAT_WIDTH];
+    GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata1[GMX_SIMD_FLOAT_WIDTH];
+    GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata2[GMX_SIMD_FLOAT_WIDTH];
+
+    store(rdata0, v0);
+    store(rdata1, v1);
+    store(rdata2, v2);
+
+    for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        base[ align * offset[i] + 0] += rdata0[i];
+        base[ align * offset[i] + 1] += rdata1[i];
+        base[ align * offset[i] + 2] += rdata2[i];
+    }
+}
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterDecrU(float *              base,
+                      const std::int32_t   offset[],
+                      SimdFloat            v0,
+                      SimdFloat            v1,
+                      SimdFloat            v2)
+{
+    GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata0[GMX_SIMD_FLOAT_WIDTH];
+    GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata1[GMX_SIMD_FLOAT_WIDTH];
+    GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH)  rdata2[GMX_SIMD_FLOAT_WIDTH];
+
+    store(rdata0, v0);
+    store(rdata1, v1);
+    store(rdata2, v2);
+
+    for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        base[ align * offset[i] + 0] -= rdata0[i];
+        base[ align * offset[i] + 1] -= rdata1[i];
+        base[ align * offset[i] + 2] -= rdata2[i];
+    }
+}
+
+static inline void gmx_simdcall
+expandScalarsToTriplets(SimdFloat    scalar,
+                        SimdFloat *  triplets0,
+                        SimdFloat *  triplets1,
+                        SimdFloat *  triplets2)
+{
+    triplets0->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0),
+                                                                           _mm512_castps_si512(scalar.simdInternal_)));
+    triplets1->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5),
+                                                                           _mm512_castps_si512(scalar.simdInternal_)));
+    triplets2->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10),
+                                                                           _mm512_castps_si512(scalar.simdInternal_)));
+}
+
+
+static inline float gmx_simdcall
+reduceIncr4ReturnSum(float *    m,
+                     SimdFloat  v0,
+                     SimdFloat  v1,
+                     SimdFloat  v2,
+                     SimdFloat  v3)
+{
+    float  f;
+    __m512 t0, t1, t2, t3;
+
+    assert(std::size_t(m) % 16 == 0);
+
+    t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
+    t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v2.simdInternal_, _mm512_swizzle_ps(v2.simdInternal_, _MM_SWIZ_REG_BADC));
+    t1 = _mm512_add_ps(v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
+    t1 = _mm512_mask_add_ps(t1, _mm512_int2mask(0xCCCC), v3.simdInternal_, _mm512_swizzle_ps(v3.simdInternal_, _MM_SWIZ_REG_BADC));
+    t2 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
+    t2 = _mm512_mask_add_ps(t2, _mm512_int2mask(0xAAAA), t1, _mm512_swizzle_ps(t1, _MM_SWIZ_REG_CDAB));
+
+    t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_BADC));
+    t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_CDAB));
+
+    t0 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
+    t0 = _mm512_add_ps(t0, t2);
+    _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t0);
+
+    t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_BADC));
+    t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_CDAB));
+
+    _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t2);
+    return f;
+}
+
+static inline SimdFloat gmx_simdcall
+loadDualHsimd(const float * m0,
+              const float * m1)
+{
+    assert(std::size_t(m0) % 32 == 0);
+    assert(std::size_t(m1) % 32 == 0);
+
+    return _mm512_castpd_ps(_mm512_mask_extload_pd(_mm512_extload_pd(reinterpret_cast<const double *>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE),
+                                                   _mm512_int2mask(0xF0), reinterpret_cast<const double *>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+}
+
+static inline SimdFloat gmx_simdcall
+loadDuplicateHsimd(const float * m)
+{
+    assert(std::size_t(m) % 32 == 0);
+
+    return _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+}
+
+static inline SimdFloat gmx_simdcall
+load1DualHsimd(const float * m)
+{
+    return _mm512_mask_extload_ps(_mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE), _mm512_int2mask(0xFF00),
+                                  m+1, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
+}
+
+
+static inline void gmx_simdcall
+storeDualHsimd(float *     m0,
+               float *     m1,
+               SimdFloat   a)
+{
+    __m512 t0;
+
+    assert(std::size_t(m0) % 32 == 0);
+    assert(std::size_t(m1) % 32 == 0);
+
+    _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), a.simdInternal_);
+    _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), a.simdInternal_);
+}
+
+static inline void gmx_simdcall
+incrDualHsimd(float *     m0,
+              float *     m1,
+              SimdFloat   a)
+{
+    assert(std::size_t(m0) % 32 == 0);
+    assert(std::size_t(m1) % 32 == 0);
+
+    __m512 x;
+
+    // Update lower half
+    x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+    x = _mm512_add_ps(x, a.simdInternal_);
+    _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), x);
+
+    // Update upper half
+    x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+    x = _mm512_add_ps(x, a.simdInternal_);
+    _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), x);
+}
+
+static inline void gmx_simdcall
+decrHsimd(float *    m,
+          SimdFloat  a)
+{
+    __m512 t;
+
+    assert(std::size_t(m) % 32 == 0);
+
+    t = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+    a = _mm512_add_ps(a.simdInternal_, _mm512_permute4f128_ps(a.simdInternal_, _MM_PERM_BADC));
+    t = _mm512_sub_ps(t, a.simdInternal_);
+    _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0x00FF), t);
+}
+
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTransposeHsimd(const float *        base0,
+                         const float *        base1,
+                         const std::int32_t   offset[],
+                         SimdFloat *          v0,
+                         SimdFloat *          v1)
+{
+    __m512i idx0, idx1, idx;
+    __m512  tmp1, tmp2;
+
+    assert(std::size_t(offset) % 32 == 0);
+    assert(std::size_t(base0) % 8 == 0);
+    assert(std::size_t(base1) % 8 == 0);
+    assert(std::size_t(align) % 2 == 0);
+
+    idx0 = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), offset);
+
+    idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
+    idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));
+
+    idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0xFF00), idx1, _MM_PERM_BABA);
+
+    tmp1 = _mm512_i32gather_ps(idx, base0, sizeof(float));
+    tmp2 = _mm512_i32gather_ps(idx, base1, sizeof(float));
+
+    v0->simdInternal_ = _mm512_mask_permute4f128_ps(tmp1, _mm512_int2mask(0xFF00), tmp2, _MM_PERM_BABA);
+    v1->simdInternal_ = _mm512_mask_permute4f128_ps(tmp2, _mm512_int2mask(0x00FF), tmp1, _MM_PERM_DCDC);
+}
+
+static inline float gmx_simdcall
+reduceIncr4ReturnSumHsimd(float *     m,
+                          SimdFloat   v0,
+                          SimdFloat   v1)
+{
+    float  f;
+    __m512 t0, t1;
+
+    assert(std::size_t(m) % 32 == 0);
+
+    t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
+    t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
+    t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
+    t0 = _mm512_add_ps(t0, _mm512_castpd_ps(_mm512_swizzle_pd(_mm512_castps_pd(t0), _MM_SWIZ_REG_BADC)));
+    t0 = _mm512_mask_permute4f128_ps(t0, _mm512_int2mask(0xAAAA), t0, _MM_PERM_BADC);
+    t1 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
+    t1 = _mm512_add_ps(t1, t0);
+    _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t1);
+
+    t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_BADC));
+    t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
+
+    _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t0);
+    return f;
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
index 06cb29d570d7f7cdd7ccbc6712380873d5a7a1da..4dfd70e6da1c75e121243a8a742eea357ecec356 100644 (file)
  *  \{
  */
 
-// Many SIMD architectures other than reference are temporarily disabled in this commit
-#if GMX_SIMD_X86_AVX2_256
-#    include "impl_x86_avx2_256/impl_x86_avx2_256.h"
-#elif GMX_SIMD_X86_AVX_256
-#    include "impl_x86_avx_256/impl_x86_avx_256.h"
-#elif GMX_SIMD_X86_AVX_128_FMA
-#    include "impl_x86_avx_128_fma/impl_x86_avx_128_fma.h"
+#if GMX_SIMD_X86_SSE2
+#    include "impl_x86_sse2/impl_x86_sse2.h"
 #elif GMX_SIMD_X86_SSE4_1
 #    include "impl_x86_sse4_1/impl_x86_sse4_1.h"
-#elif GMX_SIMD_X86_SSE2
-#    include "impl_x86_sse2/impl_x86_sse2.h"
-#elif GMX_SIMD_ARM_NEON_ASIMD
-#    include "impl_arm_neon_asimd/impl_arm_neon_asimd.h"
+#elif GMX_SIMD_X86_AVX_128_FMA
+#    include "impl_x86_avx_128_fma/impl_x86_avx_128_fma.h"
+#elif GMX_SIMD_X86_AVX_256
+#    include "impl_x86_avx_256/impl_x86_avx_256.h"
+#elif GMX_SIMD_X86_AVX2_256
+#    include "impl_x86_avx2_256/impl_x86_avx2_256.h"
+#elif GMX_SIMD_X86_MIC
+#    include "impl_x86_mic/impl_x86_mic.h"
 #elif GMX_SIMD_ARM_NEON
 #    include "impl_arm_neon/impl_arm_neon.h"
+#elif GMX_SIMD_ARM_NEON_ASIMD
+#    include "impl_arm_neon_asimd/impl_arm_neon_asimd.h"
 #elif GMX_SIMD_IBM_QPX
 #    include "impl_ibm_qpx/impl_ibm_qpx.h"
 #elif GMX_SIMD_IBM_VMX
index de50bf1fda253696fe377df3d8efa55fc053f5d4..6eef2e2a95fd7b5b69fada291d053f7bb5433cd9 100644 (file)
@@ -756,6 +756,49 @@ TEST_F(SimdFloatingpointUtilTest, storeDualHsimd)
     }
 }
 
+TEST_F(SimdFloatingpointUtilTest, incrDualHsimd)
+{
+    real            reference[GMX_SIMD_REAL_WIDTH];
+    SimdReal        v0;
+
+    // Create reference values
+    for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        reference[i] = val0_[i] + val2_[i];
+    }
+
+    // Point p to the upper half of val0_
+    real * p = val0_ + GMX_SIMD_REAL_WIDTH / 2;
+
+    v0 = load(val2_);
+    incrDualHsimd(val0_, p, v0);
+
+    for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        EXPECT_EQ(reference[i], val0_[i]);
+    }
+}
+
+TEST_F(SimdFloatingpointUtilTest, incrDualHsimdOverlapping)
+{
+    real            reference[GMX_SIMD_REAL_WIDTH/2];
+    SimdReal        v0;
+
+    // Create reference values
+    for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
+    {
+        reference[i] = val0_[i] + val2_[i] + val2_[GMX_SIMD_REAL_WIDTH/2+i];
+    }
+
+    v0 = load(val2_);
+    incrDualHsimd(val0_, val0_, v0);
+
+    for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
+    {
+        EXPECT_EQ(reference[i], val0_[i]);
+    }
+}
+
 TEST_F(SimdFloatingpointUtilTest, decrHsimd)
 {
     SimdReal                          v0;