src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h: warning: should include "nbnxn_simd.h"
# Temporary while we change the SIMD implementation
-src/gromacs/simd/impl_intel_mic/impl_intel_mic_common.h: warning: should include "simd.h"
src/gromacs/simd/impl_sparc64_hpc_ace/impl_sparc64_hpc_ace_common.h: warning: should include "simd.h"
src/gromacs/simd/impl_x86_avx_512f/impl_x86_avx_512f_common.h: warning: should include "simd.h"
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD4_DOUBLE_H
-#define GMX_SIMD_IMPL_INTEL_MIC_SIMD4_DOUBLE_H
-
-#include "config.h"
-
-#include <assert.h>
-#include <math.h>
-
-#include <immintrin.h>
-
-#include "impl_intel_mic_common.h"
-#include "impl_intel_mic_simd_double.h"
-
-/****************************************************
- * DOUBLE PRECISION SIMD4 IMPLEMENTATION *
- ****************************************************/
-#define Simd4Double __m512d
-#define simd4Mask _mm512_int2mask(0xF)
-#define simd4LoadD(m) _mm512_mask_extload_pd(_mm512_undefined_pd(), simd4Mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
-#define simd4Load1D(m) _mm512_mask_extload_pd(_mm512_undefined_pd(), simd4Mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE)
-#define simd4Set1D _mm512_set1_pd
-#define simd4StoreD simd4StoreD_mic
-#define simd4LoadUD simd4LoadUD_mic
-#define simd4StoreUD simd4StoreUD_mic
-#define simd4SetZeroD _mm512_setzero_pd
-#define simd4AddD(a, b) _mm512_mask_add_pd(_mm512_undefined_pd(), simd4Mask, a, b)
-#define simd4SubD(a, b) _mm512_mask_sub_pd(_mm512_undefined_pd(), simd4Mask, a, b)
-#define simd4MulD(a, b) _mm512_mask_mul_pd(_mm512_undefined_pd(), simd4Mask, a, b)
-#define simd4FmaddD(a, b, c) _mm512_mask_fmadd_pd(a, simd4Mask, b, c)
-#define simd4FmsubD(a, b, c) _mm512_mask_fmsub_pd(a, simd4Mask, b, c)
-#define simd4FnmaddD(a, b, c) _mm512_mask_fnmadd_pd(a, simd4Mask, b, c)
-#define simd4FnmsubD(a, b, c) _mm512_mask_fnmsub_pd(a, simd4Mask, b, c)
-#define simd4AndD(a, b) _mm512_castsi512_pd(_mm512_mask_and_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simd4AndNotD(a, b) _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simd4OrD(a, b) _mm512_castsi512_pd(_mm512_mask_or_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simd4XorD(a, b) _mm512_castsi512_pd(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simd4RsqrtD(a) _mm512_mask_cvtpslo_pd(_mm512_undefined_pd(), simd4Mask, _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), simd4Mask, _mm512_mask_cvtpd_pslo(_mm512_undefined_ps(), simd4Mask, x)))
-#define simd4AbsD(x) simd4AndNotD(_mm512_set1_pd(GMX_DOUBLE_NEGZERO), x)
-#define simd4NegD(x) _mm512_mask_addn_pd(_mm512_undefined_pd(), simd4Mask, x, _mm512_setzero_pd())
-#define simd4MaxD(a, b) _mm512_mask_gmax_pd(_mm512_undefined_pd(), simd4Mask, a, b)
-#define simd4MinD(a, b) _mm512_mask_gmin_pd(_mm512_undefined_pd(), simd4Mask, a, b)
-#define simd4RoundD(a) _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), simd4Mask, a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-#define simd4TruncD(a) _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), simd4Mask, a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
-#define simd4DotProductD(a, b) _mm512_mask_reduce_add_pd(_mm512_int2mask(7), _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(7), a, b))
-#define Simd4DBool __mmask16
-#define simd4CmpEqD(a, b) _mm512_mask_cmp_pd_mask(simd4Mask, a, b, _CMP_EQ_OQ)
-#define simd4CmpLtD(a, b) _mm512_mask_cmp_pd_mask(simd4Mask, a, b, _CMP_LT_OS)
-#define simd4CmpLeD(a, b) _mm512_mask_cmp_pd_mask(simd4Mask, a, b, _CMP_LE_OS)
-#define simd4AndDB _mm512_kand
-#define simd4OrDB _mm512_kor
-#define simd4AnyTrueDB(x) (_mm512_mask2int(x)&0xF)
-#define simd4MaskD(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), sel, a)
-#define simd4MaskNotD(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(sel), a)
-#define simd4BlendD(a, b, sel) _mm512_mask_blend_pd(sel, a, b)
-#define simd4ReduceD(x) _mm512_mask_reduce_add_pd(_mm512_int2mask(0xF), x)
-
-/* Implementation helpers */
-static inline void gmx_simdcall
-simd4StoreD_mic(double * m, __m512d s)
-{
- assert((size_t)m%32 == 0);
- _mm512_mask_packstorelo_pd(m, simd4Mask, s);
-}
-
-static inline __m512d gmx_simdcall
-simd4LoadUD_mic(const double * m)
-{
- return _mm512_mask_loadunpackhi_pd(_mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), simd4Mask, m), simd4Mask, m+8);
-}
-
-static inline void gmx_simdcall
-simd4StoreUD_mic(double * m, __m512d s)
-{
- _mm512_mask_packstorelo_pd(m, simd4Mask, s);
- _mm512_mask_packstorehi_pd(m+8, simd4Mask, s);
-}
-
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD4_DOUBLE_H */
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H
-#define GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H
-
-#include "config.h"
-
-#include <assert.h>
-#include <math.h>
-
-#include <immintrin.h>
-
-#include "impl_intel_mic_common.h"
-#include "impl_intel_mic_simd_float.h"
-
-/****************************************************
- * SINGLE PRECISION SIMD4 IMPLEMENTATION *
- ****************************************************/
-/* Load and store are guranteed to only access the 4 floats. All arithmetic operations
- only operate on the 4 elements (to avoid floating excpetions). But other operations
- are not gurateed to not modify the other 12 elements. E.g. setzero or blendzero
- set the upper 12 to zero. */
-#define Simd4Float __m512
-#define simd4Mask _mm512_int2mask(0xF)
-#define simd4LoadF(m) _mm512_mask_extload_ps(_mm512_undefined_ps(), simd4Mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE)
-#define simd4Load1F(m) _mm512_mask_extload_ps(_mm512_undefined_ps(), simd4Mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
-#define simd4Set1F _mm512_set1_ps
-#define simd4StoreF simd4StoreF_mic
-#define simd4LoadUF simd4LoadUF_mic
-#define simd4StoreUF simd4StoreUF_mic
-#define simd4SetZeroF _mm512_setzero_ps
-#define simd4AddF(a, b) _mm512_mask_add_ps(_mm512_undefined_ps(), simd4Mask, a, b)
-#define simd4SubF(a, b) _mm512_mask_sub_ps(_mm512_undefined_ps(), simd4Mask, a, b)
-#define simd4MulF(a, b) _mm512_mask_mul_ps(_mm512_undefined_ps(), simd4Mask, a, b)
-#define simd4FmaddF(a, b, c) _mm512_mask_fmadd_ps(a, simd4Mask, b, c)
-#define simd4FmsubF(a, b, c) _mm512_mask_fmsub_ps(a, simd4Mask, b, c)
-#define simd4FnmaddF(a, b, c) _mm512_mask_fnmadd_ps(a, simd4Mask, b, c)
-#define simd4FnmsubF(a, b, c) _mm512_mask_fnmsub_ps(a, simd4Mask, b, c)
-#define simd4AndF(a, b) _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simd4AndNotF(a, b) _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simd4OrF(a, b) _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simd4XorF(a, b) _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simd4RsqrtF(a) _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), simd4Mask, a)
-#define simd4AbsF(x) simd4AndNotF(_mm512_set1_ps(GMX_FLOAT_NEGZERO), x)
-#define simd4NegF(x) _mm512_mask_addn_ps(_mm512_undefined_ps(), simd4Mask, x, _mm512_setzero_ps())
-#define simd4MaxF(a, b) _mm512_mask_gmax_ps(_mm512_undefined_ps(), simd4Mask, a, b)
-#define simd4MinF(a, b) _mm512_mask_gmin_ps(_mm512_undefined_ps(), simd4Mask, a, b)
-#define simd4RoundF(x) _mm512_mask_round_ps(_mm512_undefined_ps(), simd4Mask, x, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-#define simd4TruncF(x) _mm512_mask_round_ps(_mm512_undefined_ps(), simd4Mask, x, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
-#define simd4DotProductF(a, b) _mm512_mask_reduce_add_ps(_mm512_int2mask(7), _mm512_mask_mul_ps(_mm512_undefined_ps(), _mm512_int2mask(7), a, b))
-#define Simd4FBool __mmask16
-#define simd4CmpEqF(a, b) _mm512_mask_cmp_ps_mask(simd4Mask, a, b, _CMP_EQ_OQ)
-#define simd4CmpLtF(a, b) _mm512_mask_cmp_ps_mask(simd4Mask, a, b, _CMP_LT_OS)
-#define simd4CmpLeF(a, b) _mm512_mask_cmp_ps_mask(simd4Mask, a, b, _CMP_LE_OS)
-#define simd4AndFB _mm512_kand
-#define simd4OrFB _mm512_kor
-#define simd4AnyTrueFB(x) (_mm512_mask2int(x)&0xF)
-#define simd4MaskF(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), sel, a)
-#define simd4MaskNotF(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), a)
-#define simd4BlendF(a, b, sel) _mm512_mask_blend_ps(sel, a, b)
-#define simd4ReduceF(x) _mm512_mask_reduce_add_ps(_mm512_int2mask(0xF), x)
-
-/* Implementation helpers */
-
-/* load store simd4 */
-static inline void gmx_simdcall
-simd4StoreF_mic(float * m, __m512 s)
-{
- assert((size_t)m%16 == 0);
- _mm512_mask_packstorelo_ps(m, simd4Mask, s);
-}
-
-static inline __m512 gmx_simdcall
-simd4LoadUF_mic(const float * m)
-{
- return _mm512_mask_loadunpackhi_ps(_mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), simd4Mask, m), simd4Mask, m+16);
-}
-
-static inline void gmx_simdcall
-simd4StoreUF_mic(float * m, __m512 s)
-{
- _mm512_mask_packstorelo_ps(m, simd4Mask, s);
- _mm512_mask_packstorehi_ps(m+16, simd4Mask, s);
-}
-
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H */
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD_DOUBLE_H
-#define GMX_SIMD_IMPL_INTEL_MIC_SIMD_DOUBLE_H
-
-#include "config.h"
-
-#include <assert.h>
-
-#include <cmath>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "impl_intel_mic_common.h"
-
-/****************************************************
- * DOUBLE PRECISION SIMD IMPLEMENTATION *
- ****************************************************/
-#define SimdDouble __m512d
-#define simdLoadD _mm512_load_pd
-#define simdLoad1D(m) _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE)
-#define simdSet1D _mm512_set1_pd
-#define simdStoreD _mm512_store_pd
-#define simdLoadUD simdLoadUD_mic
-#define simdStoreUD simdStoreUD_mic
-#define simdSetZeroD _mm512_setzero_pd
-#define simdAddD _mm512_add_pd
-#define simdSubD _mm512_sub_pd
-#define simdMulD _mm512_mul_pd
-#define simdFmaddD _mm512_fmadd_pd
-#define simdFmsubD _mm512_fmsub_pd
-#define simdFnmaddD _mm512_fnmadd_pd
-#define simdFnmsubD _mm512_fnmsub_pd
-#define simdAndD(a, b) _mm512_castsi512_pd(_mm512_and_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simdAndNotD(a, b) _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simdOrD(a, b) _mm512_castsi512_pd(_mm512_or_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simdXorD(a, b) _mm512_castsi512_pd(_mm512_xor_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
-#define simdRsqrtD(x) _mm512_cvtpslo_pd(_mm512_rsqrt23_ps(_mm512_cvtpd_pslo(x)))
-#define simdRcpD(x) _mm512_cvtpslo_pd(_mm512_rcp23_ps(_mm512_cvtpd_pslo(x)))
-#define simdAbsD(x) simdAndNotD(_mm512_set1_pd(GMX_DOUBLE_NEGZERO), x)
-#define simdNegD(x) _mm512_addn_pd(x, _mm512_setzero_pd())
-#define simdMaxD _mm512_gmax_pd
-#define simdMinD _mm512_gmin_pd
-#define simdRoundD(a) _mm512_roundfxpnt_adjust_pd(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-#define simdTruncD(a) _mm512_roundfxpnt_adjust_pd(a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
-#define simdFractionD(x) _mm512_sub_pd(x, simdTruncD(x))
-#define simdGetExponentD(x) _mm512_getexp_pd(x)
-#define simdGetMantissaD(x) _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)
-#define simdSetExponentD(x) simdSetExponentD_mic(x)
-/* integer datatype corresponding to float: SimdFInt32
- Doesn't use mask other than where required. No side effect expected for operating on the (unused) upper 8.
- */
-#define SimdDInt32 __m512i
-#define simdLoadDI(m) _mm512_extload_epi64(m, _MM_UPCONV_EPI64_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
-#define simdSet1DI _mm512_set1_epi32
-#define simdStoreDI simdStoreDI_mic
-#define simdLoadUDI simdLoadUDI_mic
-#define simdStoreUDI simdStoreUDI_mic
-#define simdExtractDI simdExtractDI_mic
-#define simdSetZeroDI _mm512_setzero_epi32
-#define simdCvtD2I(a) _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_NEAREST_INT)
-#define simdCvttD2I(a) _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_ZERO)
-#define simdCvtI2D _mm512_cvtepi32lo_pd
-/* Integer logical ops on SimdFInt32 */
-#define simdSlliDI _mm512_slli_epi32
-#define simdSrliDI _mm512_srli_epi32
-#define simdAndDI _mm512_and_epi32
-#define simdAndNotDI _mm512_andnot_epi32
-#define simdOrDI _mm512_or_epi32
-#define simdXorDI _mm512_xor_epi32
-/* Integer arithmetic ops on SimdFInt32 */
-#define simdAddDI _mm512_add_epi32
-#define simdSubDI _mm512_sub_epi32
-#define simdMulDI _mm512_mullo_epi32
-/* Boolean & comparison operations on SimdFloat */
-#define SimdDBool __mmask8
-#define simdCmpEqD(a, b) _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ)
-#define simdCmpLtD(a, b) _mm512_cmp_pd_mask(a, b, _CMP_LT_OS)
-#define simdCmpLeD(a, b) _mm512_cmp_pd_mask(a, b, _CMP_LE_OS)
-#define simdAndDB _mm512_kand
-#define simdOrDB _mm512_kor
-#define simdAnyTrueDB(x) _mm512_mask2int(x)
-#define simdMaskD(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), sel, a)
-#define simdMaskNotD(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(sel), a)
-#define simdBlendD(a, b, sel) _mm512_mask_blend_pd(sel, a, b)
-#define simdReduceD(a) _mm512_reduce_add_pd(a)
-/* Boolean & comparison operations on SimdFInt32 */
-#define SimdDIBool __mmask16
-#define simdCmpEqDI(a, b) _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ)
-#define simdCmpLtDI(a, b) _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT)
-#define simdAndDIB _mm512_kand
-#define simdOrDIB _mm512_kor
-#define simdAnyTrueDIB(x) (_mm512_mask2int(x)&0xFF)
-#define simdMaskDI(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), sel, a)
-#define simdMaskNotDI(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), _mm512_knot(sel), a)
-#define simdBlendDI(a, b, sel) _mm512_mask_blend_epi32(sel, a, b)
-/* Conversions between booleans. Double & dint stuff is stored in low bits */
-#define simdCvtDB2DIB(x) (x)
-#define simdCvtDIB2DB(x) (x)
-
-/* Float/double conversion */
-#define simdCvtF2DD simdCvtF2DD_mic
-#define simdCvtDD2F simdCvtDD2F_mic
-
-
-#define PERM_LOW2HIGH _MM_PERM_BABA
-#define PERM_HIGH2LOW _MM_PERM_DCDC
-
-#define mask_loh _mm512_int2mask(0x00FF) /* would be better a constant - but can't initialize with a function call. */
-#define mask_hih _mm512_int2mask(0xFF00)
-
-/* load store double */
-static inline __m512d gmx_simdcall
-simdLoadUD_mic(const double * m)
-{
- return _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m+8);
-}
-
-static inline void gmx_simdcall
-simdStoreUD_mic(double * m, __m512d s)
-{
- _mm512_packstorelo_pd(m, s);
- _mm512_packstorehi_pd(m+8, s);
-}
-
-/* load store dint32 */
-static inline void gmx_simdcall
-simdStoreDI_mic(std::int32_t * m, __m512i s)
-{
- assert((size_t)m%32 == 0);
- _mm512_mask_packstorelo_epi32(m, mask_loh, s);
-}
-
-static inline __m512i gmx_simdcall
-simdLoadUDI_mic(const std::int32_t * m)
-{
- return _mm512_mask_loadunpackhi_epi32(_mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), mask_loh, m), mask_loh, m+16);
-}
-
-static inline void gmx_simdcall
-simdStoreUDI_mic(std::int32_t * m, __m512i s)
-{
- _mm512_mask_packstorelo_epi32(m, mask_loh, s);
- _mm512_mask_packstorehi_epi32(m+16, mask_loh, s);
-}
-
-static inline std::int32_t gmx_simdcall
-simdExtractDI_mic(SimdDInt32 a, int index)
-{
- int r;
- _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a);
- return r;
-}
-
-/* This is likely faster than the built in scale operation (lat 8, t-put 3)
- * since we only work on the integer part and use shifts. TODO: check. given that scale also only does integer
- */
-static inline __m512d gmx_simdcall
-simdSetExponentD_mic(__m512d a)
-{
- const __m512i expbias = _mm512_set1_epi32(1023);
- __m512i iexp = _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_NEAREST_INT);
- iexp = _mm512_permutevar_epi32(_mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), iexp);
- iexp = _mm512_mask_slli_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0xAAAA), _mm512_add_epi32(iexp, expbias), 20);
- return _mm512_castsi512_pd(iexp);
-}
-
-static inline void gmx_simdcall
-simdCvtF2DD_mic(__m512 f, __m512d * d0, __m512d * d1)
-{
- __m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f), PERM_HIGH2LOW);
-
- *d0 = _mm512_cvtpslo_pd(f);
- *d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
-}
-
-static inline __m512 gmx_simdcall
-simdCvtDD2F_mic(__m512d d0, __m512d d1)
-{
- __m512 f0 = _mm512_cvtpd_pslo(d0);
- __m512 f1 = _mm512_cvtpd_pslo(d1);
- return _mm512_mask_permute4f128_ps(f0, mask_hih, f1, PERM_LOW2HIGH);
-}
-
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD_DOUBLE_H */
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H
-#define GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H
-
-#include "config.h"
-
-#include <cmath>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "impl_intel_mic_common.h"
-
-/****************************************************
- * SINGLE PRECISION SIMD IMPLEMENTATION *
- ****************************************************/
-#define SimdFloat __m512
-#define simdLoadF _mm512_load_ps
-#define simdLoad1F(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
-#define simdSet1F _mm512_set1_ps
-#define simdStoreF _mm512_store_ps
-#define simdLoadUF simdLoadUF_mic
-#define simdStoreUF simdStoreUF_mic
-#define simdSetZeroF _mm512_setzero_ps
-#define simdAddF _mm512_add_ps
-#define simdSubF _mm512_sub_ps
-#define simdMulF _mm512_mul_ps
-#define simdFmaddF _mm512_fmadd_ps
-#define simdFmsubF _mm512_fmsub_ps
-#define simdFnmaddF _mm512_fnmadd_ps
-#define simdFnmsubF _mm512_fnmsub_ps
-#define simdAndF(a, b) _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simdAndNotF(a, b) _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simdOrF(a, b) _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simdXorF(a, b) _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
-#define simdRsqrtF _mm512_rsqrt23_ps
-#define simdRcpF _mm512_rcp23_ps
-#define simdAbsF(x) simdAndNotF(_mm512_set1_ps(GMX_FLOAT_NEGZERO), x)
-#define simdNegF(x) _mm512_addn_ps(x, _mm512_setzero_ps())
-#define simdMaxF _mm512_gmax_ps
-#define simdMinF _mm512_gmin_ps
-#define simdRoundF(x) _mm512_round_ps(x, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-#define simdTruncF(x) _mm512_round_ps(x, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
-#define simdFractionF(x) _mm512_sub_ps(x, simdTruncF(x))
-#define simdGetExponentF(x) _mm512_getexp_ps(x)
-#define simdGetMantissaF(x) _mm512_getmant_ps(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)
-#define simdSetExponentF(x) simdSetExponentF_mic(x)
-/* integer datatype corresponding to float: SimdFInt32 */
-#define SimdFInt32 __m512i
-#define simdLoadFI _mm512_load_epi32
-#define simdSet1FI _mm512_set1_epi32
-#define simdStoreFI _mm512_store_epi32
-#define simdLoadUFI simdLoadUFI_mic
-#define simdStoreUFI simdStoreUFI_mic
-#define simdExtractFI simdExtractFI_mic
-#define simdSetZeroFI _mm512_setzero_epi32
-#define simdCvtF2I(a) _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-#define simdCvttF2I(a) _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
-#define simdCvtI2F(a) _mm512_cvtfxpnt_round_adjustepi32_ps(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
-/* Integer logical ops on SimdFInt32 */
-#define simdSlliFI _mm512_slli_epi32
-#define simdSrliFI _mm512_srli_epi32
-#define simdAndFI _mm512_and_epi32
-#define simdAndNotFI _mm512_andnot_epi32
-#define simdOrFI _mm512_or_epi32
-#define simdXorFI _mm512_xor_epi32
-/* Integer arithmetic ops on SimdFInt32 */
-#define simdAddFI _mm512_add_epi32
-#define simdSubFI _mm512_sub_epi32
-#define simdMulFI _mm512_mullo_epi32
-/* Boolean & comparison operations on SimdFloat */
-#define SimdFBool __mmask16
-#define simdCmpEqF(a, b) _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ)
-#define simdCmpLtF(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OS)
-#define simdCmpLeF(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OS)
-#define simdAndFB _mm512_kand
-#define simdAndNotFB(a, b) _mm512_knot(_mm512_kor(a, b))
-#define simdOrFB _mm512_kor
-#define simdAnyTrueFB _mm512_mask2int
-#define simdMaskF(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), sel, a)
-#define simdMaskNotF(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), a)
-#define simdBlendF(a, b, sel) _mm512_mask_blend_ps(sel, a, b)
-#define simdReduceF(a) _mm512_reduce_add_ps(a)
-/* Boolean & comparison operations on SimdFInt32 */
-#define SimdFIBool __mmask16
-#define simdCmpEqFI(a, b) _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ)
-#define simdCmpLtFI(a, b) _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT)
-#define simdAndFIB _mm512_kand
-#define simdOrFIB _mm512_kor
-#define simdAnyTrueFIB _mm512_mask2int
-#define simdMaskFI(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), sel, a)
-#define simdMaskNotFI(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), _mm512_knot(sel), a)
-#define simdBlendFI(a, b, sel) _mm512_mask_blend_epi32(sel, a, b)
-/* Conversions between different booleans */
-#define simdCvtFB2FIB(x) (x)
-#define simdCvtFIB2FB(x) (x)
-
-/* MIC provides full single precision of some neat functions: */
-/* 1/sqrt(x) and 1/x work fine in simd_math.h, and won't use extra iterations */
-#define simdExp2F simdExp2F_mic
-#define simdExpF simdExpF_mic
-#define simdLogF simdLogF_mic
-
-/* load store float */
-static inline __m512 gmx_simdcall
-simdLoadUF_mic(const float * m)
-{
- return _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m+16);
-}
-
-static inline void gmx_simdcall
-simdStoreUF_mic(float * m, __m512 s)
-{
- _mm512_packstorelo_ps(m, s);
- _mm512_packstorehi_ps(m+16, s);
-}
-
-/* load store fint32 */
-static inline __m512i gmx_simdcall
-simdLoadUFI_mic(const std::int32_t * m)
-{
- return _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m+16);
-}
-
-static inline void gmx_simdcall
-simdStoreUFI_mic(std::int32_t * m, __m512i s)
-{
- _mm512_packstorelo_epi32(m, s);
- _mm512_packstorehi_epi32(m+16, s);
-}
-
-/* extract */
-static inline std::int32_t gmx_simdcall
-simdExtractFI_mic(SimdFInt32 a, int index)
-{
- int r;
- _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a);
- return r;
-}
-
-/* This is likely faster than the built in scale operation (lat 8, t-put 3)
- * since we only work on the integer part and use shifts. TODO: check. given that scale also only does integer
- */
-static inline __m512 gmx_simdcall
-simdSetExponentF_mic(__m512 a)
-{
- __m512i iexp = simdCvtF2I(a);
-
- const __m512i expbias = _mm512_set1_epi32(127);
- iexp = _mm512_slli_epi32(_mm512_add_epi32(iexp, expbias), 23);
- return _mm512_castsi512_ps(iexp);
-
- /* scale alternative:
- return _mm512_scale_ps(_mm512_set1_ps(1), iexp);
- */
-}
-
-static inline __m512 gmx_simdcall
-simdExp2F_mic(__m512 x)
-{
- return _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(x, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
-}
-
-static inline __m512 gmx_simdcall
-simdExpF_mic(__m512 x)
-{
- const SimdFloat argscale = simdSet1F(1.44269504088896341f);
- const SimdFloat invargscale = simdSet1F(-0.69314718055994528623f);
- __m512 xscaled = _mm512_mul_ps(x, argscale);
- __m512 r = simdExp2F_mic(xscaled);
-
- /* simdExp2F_mic() provides 23 bits of accuracy, but we ruin some of that
- * with the argument scaling due to single-precision rounding, where the
- * rounding error is amplified exponentially. To correct this, we find the
- * difference between the scaled argument and the true one (extended precision
- * arithmetics does not appear to be necessary to fulfill our accuracy requirements)
- * and then multiply by the exponent of this correction since exp(a+b)=exp(a)*exp(b).
- * Note that this only adds two instructions (and maybe some constant loads).
- */
- x = simdFmaddF(invargscale, xscaled, x);
- /* x will now be a _very_ small number, so approximate exp(x)=1+x.
- * We should thus apply the correction as r'=r*(1+x)=r+r*x
- */
- r = simdFmaddF(r, x, r);
- return r;
-}
-
-static inline __m512 gmx_simdcall
-simdLogF_mic(__m512 x)
-{
- return _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764), _mm512_log2ae23_ps(x));
-}
-
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_H
+#define GMX_SIMD_IMPL_X86_MIC_H
+
+#include "impl_x86_mic_definitions.h"
+#include "impl_x86_mic_general.h"
+#include "impl_x86_mic_simd4_double.h"
+#include "impl_x86_mic_simd4_float.h"
+#include "impl_x86_mic_simd_double.h"
+#include "impl_x86_mic_simd_float.h"
+#include "impl_x86_mic_util_double.h"
+#include "impl_x86_mic_util_float.h"
+
+#endif // GMX_SIMD_IMPL_X86_MIC_H
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_COMMON_H
-#define GMX_SIMD_IMPL_INTEL_MIC_COMMON_H
+#ifndef GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
+#define GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
-/* Intel Xeon Phi, or
- * the-artist-formerly-known-as-Knight's-corner, or
- * the-artist-formerly-formerly-known-as-MIC, or
- * the artist formerly-formerly-formerly-known-as-Larrabee
- * 512-bit SIMD instruction wrappers.
- */
-
-/* Capability definitions for Xeon Phi SIMD */
#define GMX_SIMD 1
#define GMX_SIMD_HAVE_FLOAT 1
#define GMX_SIMD_HAVE_DOUBLE 1
#define GMX_SIMD_HAVE_DINT32_EXTRACT 1
#define GMX_SIMD_HAVE_DINT32_LOGICAL 1
#define GMX_SIMD_HAVE_DINT32_ARITHMETICS 1
+#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT 0
+#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT 0
+#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT 0
+#define GMX_SIMD_HAVE_NATIVE_LOG_FLOAT 1
+#define GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT 1
+#define GMX_SIMD_HAVE_NATIVE_EXP_FLOAT 1
+#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE 0
+#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE 0
+#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE 0
+#define GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE 0
+#define GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE 0
+#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE 0
+#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
+#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
+#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
+#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 1
+
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 1
-/* Implementation details */
+// Implementation details
#define GMX_SIMD_FLOAT_WIDTH 16
#define GMX_SIMD_DOUBLE_WIDTH 8
#define GMX_SIMD_FINT32_WIDTH 16
#define GMX_SIMD_RSQRT_BITS 23
#define GMX_SIMD_RCP_BITS 23
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_COMMON_H */
+#endif // GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifndef GMX_SIMD_IMPL_INTEL_MIC_H
-#define GMX_SIMD_IMPL_INTEL_MIC_H
+#ifndef GMX_SIMD_IMPL_X86_MIC_GENERAL_H
+#define GMX_SIMD_IMPL_X86_MIC_GENERAL_H
-#include "impl_intel_mic_simd4_double.h"
-#include "impl_intel_mic_simd4_float.h"
-#include "impl_intel_mic_simd_double.h"
-#include "impl_intel_mic_simd_float.h"
+#include <immintrin.h>
-#endif /* GMX_SIMD_IMPL_INTEL_MIC_H */
+namespace gmx
+{
+
+static inline void
+simdPrefetch(const void * m)
+{
+ _mm_prefetch((const char *)m, _MM_HINT_T0);
+}
+
+} // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_OTHER_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
+#define GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
+
+#include "config.h"
+
+#include <cassert>
+
+#include <immintrin.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_x86_mic_simd_double.h"
+
+namespace gmx
+{
+
+class Simd4Double
+{
+ public:
+ Simd4Double() {}
+
+ Simd4Double(double d) : simdInternal_(_mm512_set1_pd(d)) {}
+
+ // Internal utility constructor to simplify return statements
+ Simd4Double(__m512d simd) : simdInternal_(simd) {}
+
+ __m512d simdInternal_;
+};
+
+class Simd4DBool
+{
+ public:
+ Simd4DBool() {}
+
+ // Internal utility constructor to simplify return statements
+ Simd4DBool(__mmask16 simd) : simdInternal_(simd) {}
+
+ __mmask16 simdInternal_;
+};
+
+static inline Simd4Double gmx_simdcall
+load4(const double *m)
+{
+ assert(size_t(m) % 32 == 0);
+ return {
+ _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
+ };
+}
+
+static inline void gmx_simdcall
+store4(double *m, Simd4Double a)
+{
+ assert(size_t(m) % 32 == 0);
+ _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), a.simdInternal_);
+}
+
+static inline Simd4Double gmx_simdcall
+load4U(const double *m)
+{
+ return {
+ _mm512_mask_loadunpackhi_pd(_mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m), _mm512_int2mask(0xF), m+8)
+ };
+}
+
+static inline void gmx_simdcall
+store4U(double *m, Simd4Double a)
+{
+ _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), a.simdInternal_);
+ _mm512_mask_packstorehi_pd(m+8, _mm512_int2mask(0xF), a.simdInternal_);
+}
+
+static inline Simd4Double gmx_simdcall
+simd4SetZeroD()
+{
+ return {
+ _mm512_setzero_pd()
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+operator&(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_castsi512_pd(_mm512_mask_and_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
+ _mm512_castpd_si512(b.simdInternal_)))
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+andNot(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
+ _mm512_castpd_si512(b.simdInternal_)))
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+operator|(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_castsi512_pd(_mm512_mask_or_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
+ _mm512_castpd_si512(b.simdInternal_)))
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+operator^(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_castsi512_pd(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
+ _mm512_castpd_si512(b.simdInternal_)))
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+operator+(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_mask_add_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+operator-(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_mask_sub_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+operator-(Simd4Double x)
+{
+ return {
+ _mm512_mask_addn_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _mm512_setzero_pd())
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+operator*(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+fma(Simd4Double a, Simd4Double b, Simd4Double c)
+{
+ return {
+ _mm512_mask_fmadd_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+fms(Simd4Double a, Simd4Double b, Simd4Double c)
+{
+ return {
+ _mm512_mask_fmsub_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+fnma(Simd4Double a, Simd4Double b, Simd4Double c)
+{
+ return {
+ _mm512_mask_fnmadd_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+fnms(Simd4Double a, Simd4Double b, Simd4Double c)
+{
+ return {
+ _mm512_mask_fnmsub_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+rsqrt(Simd4Double x)
+{
+ return {
+ _mm512_mask_cvtpslo_pd(_mm512_undefined_pd(),
+ _mm512_int2mask(0xF),
+ _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(),
+ _mm512_int2mask(0xF),
+ _mm512_mask_cvtpd_pslo(_mm512_undefined_ps(),
+ _mm512_int2mask(0xF), x.simdInternal_)))
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+abs(Simd4Double x)
+{
+ return {
+ _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF),
+ _mm512_castpd_si512(_mm512_set1_pd(GMX_DOUBLE_NEGZERO)),
+ _mm512_castpd_si512(x.simdInternal_)))
+
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+max(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_mask_gmax_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+min(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_mask_gmin_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+round(Simd4Double x)
+{
+ return {
+ _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+trunc(Simd4Double x)
+{
+ return {
+ _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+ };
+}
+
+static inline float gmx_simdcall
+dotProduct(Simd4Double a, Simd4Double b)
+{
+ return _mm512_mask_reduce_add_pd(_mm512_int2mask(7),
+ _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(7),
+ a.simdInternal_, b.simdInternal_));
+}
+
+static inline void gmx_simdcall
+transpose(Simd4Double * v0, Simd4Double * v1,
+ Simd4Double * v2, Simd4Double * v3)
+{
+ __m512i t0 = _mm512_mask_permute4f128_epi32(_mm512_castpd_si512(v0->simdInternal_), 0xFF00,
+ _mm512_castpd_si512(v1->simdInternal_), _MM_PERM_BABA);
+ __m512i t1 = _mm512_mask_permute4f128_epi32(_mm512_castpd_si512(v2->simdInternal_), 0xFF00,
+ _mm512_castpd_si512(v3->simdInternal_), _MM_PERM_BABA);
+
+ t0 = _mm512_permutevar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), t0);
+ t1 = _mm512_permutevar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), t1);
+
+ v0->simdInternal_ = _mm512_mask_swizzle_pd(_mm512_castsi512_pd(t0), _mm512_int2mask(0xCC),
+ _mm512_castsi512_pd(t1), _MM_SWIZ_REG_BADC);
+ v1->simdInternal_ = _mm512_mask_swizzle_pd(_mm512_castsi512_pd(t1), _mm512_int2mask(0x33),
+ _mm512_castsi512_pd(t0), _MM_SWIZ_REG_BADC);
+
+ v2->simdInternal_ = _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(v0->simdInternal_), _MM_PERM_DCDC));
+ v3->simdInternal_ = _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(v1->simdInternal_), _MM_PERM_DCDC));
+}
+
+// Picky, picky, picky:
+// icc-16 complains about "Illegal value of immediate argument to intrinsic"
+// unless we use
+// 1) Ordered-quiet for ==
+// 2) Unordered-quiet for !=
+// 3) Ordered-signaling for < and <=
+
+static inline Simd4DBool gmx_simdcall
+operator==(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
+ };
+}
+
+static inline Simd4DBool gmx_simdcall
+operator!=(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ)
+ };
+}
+
+static inline Simd4DBool gmx_simdcall
+operator<(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LT_OS)
+ };
+}
+
+static inline Simd4DBool gmx_simdcall
+operator<=(Simd4Double a, Simd4Double b)
+{
+ return {
+ _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LE_OS)
+ };
+}
+
+static inline Simd4DBool gmx_simdcall
+operator&&(Simd4DBool a, Simd4DBool b)
+{
+ return {
+ _mm512_kand(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4DBool gmx_simdcall
+operator||(Simd4DBool a, Simd4DBool b)
+{
+ return {
+ _mm512_kor(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline bool gmx_simdcall
+anyTrue(Simd4DBool a)
+{
+ return (_mm512_mask2int(a.simdInternal_) & 0xF) != 0;
+}
+
+static inline Simd4Double gmx_simdcall
+selectByMask(Simd4Double a, Simd4DBool m)
+{
+ return {
+ _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+selectByNotMask(Simd4Double a, Simd4DBool m)
+{
+ return {
+ _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(m.simdInternal_), a.simdInternal_)
+ };
+}
+
+static inline Simd4Double gmx_simdcall
+blend(Simd4Double a, Simd4Double b, Simd4DBool sel)
+{
+ return {
+ _mm512_mask_blend_pd(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline float gmx_simdcall
+reduce(Simd4Double a)
+{
+ return _mm512_mask_reduce_add_pd(_mm512_int2mask(0xF), a.simdInternal_);
+}
+
+} // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
+#define GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
+
+#include "config.h"
+
+#include <cassert>
+
+#include <immintrin.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_x86_mic_simd_float.h"
+
+namespace gmx
+{
+
+class Simd4Float
+{
+ public:
+ Simd4Float() {}
+
+ Simd4Float(float f) : simdInternal_(_mm512_set1_ps(f)) {}
+
+ // Internal utility constructor to simplify return statements
+ Simd4Float(__m512 simd) : simdInternal_(simd) {}
+
+ __m512 simdInternal_;
+};
+
+class Simd4FBool
+{
+ public:
+ Simd4FBool() {}
+
+ // Internal utility constructor to simplify return statements
+ Simd4FBool(__mmask16 simd) : simdInternal_(simd) {}
+
+ __mmask16 simdInternal_;
+};
+
+static inline Simd4Float gmx_simdcall
+load4(const float *m)
+{
+ assert(size_t(m) % 16 == 0);
+ return {
+ _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE)
+ };
+}
+
+static inline void gmx_simdcall
+store4(float *m, Simd4Float a)
+{
+ assert(size_t(m) % 16 == 0);
+ _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), a.simdInternal_);
+}
+
+static inline Simd4Float gmx_simdcall
+load4U(const float *m)
+{
+ return {
+ _mm512_mask_loadunpackhi_ps(_mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m), _mm512_int2mask(0xF), m+16)
+ };
+}
+
+static inline void gmx_simdcall
+store4U(float *m, Simd4Float a)
+{
+ _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), a.simdInternal_);
+ _mm512_mask_packstorehi_ps(m+16, _mm512_int2mask(0xF), a.simdInternal_);
+}
+
+static inline Simd4Float gmx_simdcall
+simd4SetZeroF()
+{
+ return {
+ _mm512_setzero_ps()
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+operator&(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+ _mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+andNot(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+ _mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+operator|(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+ _mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+operator^(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+ _mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+operator+(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_mask_add_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+operator-(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_mask_sub_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+operator-(Simd4Float x)
+{
+ return {
+ _mm512_mask_addn_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_, _mm512_setzero_ps())
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+operator*(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_mask_mul_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+fma(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+ return {
+ _mm512_mask_fmadd_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+fms(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+ return {
+ _mm512_mask_fmsub_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+fnma(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+ return {
+ _mm512_mask_fnmadd_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+fnms(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+ return {
+ _mm512_mask_fnmsub_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+rsqrt(Simd4Float x)
+{
+ return {
+ _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+abs(Simd4Float x)
+{
+ return {
+ _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
+ _mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)),
+ _mm512_castps_si512(x.simdInternal_)))
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+max(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_mask_gmax_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+min(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_mask_gmin_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+round(Simd4Float x)
+{
+ return {
+ _mm512_mask_round_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF),
+ x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+trunc(Simd4Float x)
+{
+ return {
+ _mm512_mask_round_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF),
+ x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+ };
+}
+
+static inline float gmx_simdcall
+dotProduct(Simd4Float a, Simd4Float b)
+{
+ __m512 x = _mm512_mask_mul_ps(_mm512_setzero_ps(), _mm512_int2mask(0x7), a.simdInternal_, b.simdInternal_);
+ x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_BADC));
+ x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
+ float f;
+ _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), x);
+ return f;
+}
+
+static inline void gmx_simdcall
+transpose(Simd4Float * v0, Simd4Float * v1,
+ Simd4Float * v2, Simd4Float * v3)
+{
+ v0->simdInternal_ = _mm512_mask_permute4f128_ps(v0->simdInternal_, _mm512_int2mask(0x00F0), v1->simdInternal_, _MM_PERM_AAAA);
+ v2->simdInternal_ = _mm512_mask_permute4f128_ps(v2->simdInternal_, _mm512_int2mask(0x00F0), v3->simdInternal_, _MM_PERM_AAAA);
+ v0->simdInternal_ = _mm512_mask_permute4f128_ps(v0->simdInternal_, _mm512_int2mask(0xFF00), v2->simdInternal_, _MM_PERM_BABA);
+ v0->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0),
+ _mm512_castps_si512(v0->simdInternal_)));
+ v1->simdInternal_ = _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_BBBB);
+ v2->simdInternal_ = _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_CCCC);
+ v3->simdInternal_ = _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_DDDD);
+}
+
+// Picky, picky, picky:
+// icc-16 complains about "Illegal value of immediate argument to intrinsic"
+// unless we use
+// 1) Ordered-quiet for ==
+// 2) Unordered-quiet for !=
+// 3) Ordered-signaling for < and <=
+
+static inline Simd4FBool gmx_simdcall
+operator==(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
+ };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator!=(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ)
+ };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator<(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LT_OS)
+ };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator<=(Simd4Float a, Simd4Float b)
+{
+ return {
+ _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LE_OS)
+ };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator&&(Simd4FBool a, Simd4FBool b)
+{
+ return {
+ _mm512_kand(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator||(Simd4FBool a, Simd4FBool b)
+{
+ return {
+ _mm512_kor(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline bool gmx_simdcall
+anyTrue(Simd4FBool a)
+{
+ return ( _mm512_mask2int(a.simdInternal_) & 0xF) != 0;
+}
+
+static inline Simd4Float gmx_simdcall
+selectByMask(Simd4Float a, Simd4FBool m)
+{
+ return {
+ _mm512_mask_mov_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+selectByNotMask(Simd4Float a, Simd4FBool m)
+{
+ return {
+ _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(m.simdInternal_), a.simdInternal_)
+ };
+}
+
+static inline Simd4Float gmx_simdcall
+blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
+{
+ return {
+ _mm512_mask_blend_ps(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline float gmx_simdcall
+reduce(Simd4Float a)
+{
+ __m512 x = a.simdInternal_;
+ x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_BADC));
+ x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
+ float f;
+ _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), x);
+ return f;
+}
+
+} // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
+#define GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
+
+#include "config.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include <immintrin.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_x86_mic_simd_float.h"
+
+namespace gmx
+{
+
+class SimdDouble
+{
+ public:
+ SimdDouble() {}
+
+ SimdDouble(double d) : simdInternal_(_mm512_set1_pd(d)) {}
+
+ // Internal utility constructor to simplify return statements
+ SimdDouble(__m512d simd) : simdInternal_(simd) {}
+
+ __m512d simdInternal_;
+};
+
+class SimdDInt32
+{
+ public:
+ SimdDInt32() {}
+
+ SimdDInt32(std::int32_t i) : simdInternal_(_mm512_set1_epi32(i)) {}
+
+ // Internal utility constructor to simplify return statements
+ SimdDInt32(__m512i simd) : simdInternal_(simd) {}
+
+ __m512i simdInternal_;
+};
+
+class SimdDBool
+{
+ public:
+ SimdDBool() {}
+
+ // Internal utility constructor to simplify return statements
+ SimdDBool(__mmask8 simd) : simdInternal_(simd) {}
+
+ __mmask8 simdInternal_;
+};
+
+class SimdDIBool
+{
+ public:
+ SimdDIBool() {}
+
+ // Internal utility constructor to simplify return statements
+ SimdDIBool(__mmask16 simd) : simdInternal_(simd) {}
+
+ __mmask16 simdInternal_;
+};
+
+static inline SimdDouble gmx_simdcall
+load(const double *m)
+{
+ assert(std::size_t(m) % 64 == 0);
+ return {
+ _mm512_load_pd(m)
+ };
+}
+
+static inline void gmx_simdcall
+store(double *m, SimdDouble a)
+{
+ assert(std::size_t(m) % 64 == 0);
+ _mm512_store_pd(m, a.simdInternal_);
+}
+
+static inline SimdDouble gmx_simdcall
+loadU(const double *m)
+{
+ return {
+ _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m+8)
+ };
+}
+
+static inline void gmx_simdcall
+storeU(double *m, SimdDouble a)
+{
+ _mm512_packstorelo_pd(m, a.simdInternal_);
+ _mm512_packstorehi_pd(m+8, a.simdInternal_);
+
+}
+
+static inline SimdDouble gmx_simdcall
+setZeroD()
+{
+ return {
+ _mm512_setzero_pd()
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+loadDI(const std::int32_t * m)
+{
+ assert(std::size_t(m) % 32 == 0);
+ return {
+ _mm512_extload_epi64(m, _MM_UPCONV_EPI64_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
+ };
+}
+
+static inline void gmx_simdcall
+store(std::int32_t * m, SimdDInt32 a)
+{
+ assert(std::size_t(m) % 32 == 0);
+ _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
+}
+
+static inline SimdDInt32 gmx_simdcall
+loadUDI(const std::int32_t *m)
+{
+ return {
+ _mm512_mask_loadunpackhi_epi32(_mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), m),
+ _mm512_int2mask(0x00FF), m+16)
+ };
+}
+
+static inline void gmx_simdcall
+storeU(std::int32_t * m, SimdDInt32 a)
+{
+ _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
+ _mm512_mask_packstorehi_epi32(m+16, _mm512_int2mask(0x00FF), a.simdInternal_);
+}
+
+static inline SimdDInt32 gmx_simdcall
+setZeroDI()
+{
+ return {
+ _mm512_setzero_epi32()
+ };
+}
+
+template<int index>
+static inline std::int32_t gmx_simdcall
+extract(SimdDInt32 a)
+{
+ int r;
+ _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a.simdInternal_);
+ return r;
+}
+
+static inline SimdDouble gmx_simdcall
+operator&(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_castsi512_pd(_mm512_and_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+andNot(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+operator|(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_castsi512_pd(_mm512_or_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+operator^(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_castsi512_pd(_mm512_xor_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+operator+(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_add_pd(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+operator-(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_sub_pd(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+operator-(SimdDouble x)
+{
+ return {
+ _mm512_addn_pd(x.simdInternal_, _mm512_setzero_pd())
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+operator*(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_mul_pd(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+fma(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+ return {
+ _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+fms(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+ return {
+ _mm512_fmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+fnma(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+ return {
+ _mm512_fnmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+fnms(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+ return {
+ _mm512_fnmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+rsqrt(SimdDouble x)
+{
+ return {
+ _mm512_cvtpslo_pd(_mm512_rsqrt23_ps(_mm512_cvtpd_pslo(x.simdInternal_)))
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+rcp(SimdDouble x)
+{
+ return {
+ _mm512_cvtpslo_pd(_mm512_rcp23_ps(_mm512_cvtpd_pslo(x.simdInternal_)))
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
+{
+ return {
+ _mm512_mask_add_pd(a.simdInternal_, m.simdInternal_, a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
+{
+ return {
+ _mm512_mask_mul_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
+{
+ return {
+ _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_))
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzRsqrt(SimdDouble x, SimdDBool m)
+{
+ return {
+ _mm512_cvtpslo_pd(_mm512_mask_rsqrt23_ps(_mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_)))
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzRcp(SimdDouble x, SimdDBool m)
+{
+ return {
+ _mm512_cvtpslo_pd(_mm512_mask_rcp23_ps(_mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_)))
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+abs(SimdDouble x)
+{
+ return {
+ _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(_mm512_set1_pd(GMX_DOUBLE_NEGZERO)), _mm512_castpd_si512(x.simdInternal_)))
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+max(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_gmax_pd(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+min(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_gmin_pd(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+round(SimdDouble x)
+{
+ return {
+ _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+trunc(SimdDouble x)
+{
+ return {
+ _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+ };
+}
+
+static inline SimdDouble
+frexp(SimdDouble value, SimdDInt32 * exponent)
+{
+ __m512d rExponent = _mm512_getexp_pd(value.simdInternal_);
+ __m512i iExponent = _mm512_cvtfxpnt_roundpd_epi32lo(rExponent, _MM_FROUND_TO_NEAREST_INT);
+
+ exponent->simdInternal_ = _mm512_add_epi32(iExponent, _mm512_set1_epi32(1));
+
+ return {
+ _mm512_getmant_pd(value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src)
+ };
+}
+
+static inline SimdDouble
+ldexp(SimdDouble value, SimdDInt32 exponent)
+{
+ const __m512i exponentBias = _mm512_set1_epi32(1023);
+ __m512i iExponent;
+
+ iExponent = _mm512_permutevar_epi32(_mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), exponent.simdInternal_);
+ iExponent = _mm512_mask_slli_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0xAAAA), _mm512_add_epi32(iExponent, exponentBias), 20);
+ return _mm512_mul_pd(_mm512_castsi512_pd(iExponent), value.simdInternal_);
+}
+
+static inline double gmx_simdcall
+reduce(SimdDouble a)
+{
+ return _mm512_reduce_add_pd(a.simdInternal_);
+}
+
+// Picky, picky, picky:
+// icc-16 complains about "Illegal value of immediate argument to intrinsic"
+// unless we use
+// 1) Ordered-quiet for ==
+// 2) Unordered-quiet for !=
+// 3) Ordered-signaling for < and <=
+
+static inline SimdDBool gmx_simdcall
+operator==(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
+ };
+}
+
+static inline SimdDBool gmx_simdcall
+operator!=(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ)
+ };
+}
+
+static inline SimdDBool gmx_simdcall
+operator<(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LT_OS)
+ };
+}
+
+static inline SimdDBool gmx_simdcall
+operator<=(SimdDouble a, SimdDouble b)
+{
+ return {
+ _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LE_OS)
+ };
+}
+
+static inline SimdDBool gmx_simdcall
+testBits(SimdDouble a)
+{
+ // This is a bit problematic since Knight's corner does not have any 64-bit integer comparisons,
+ // and we cannot use floating-point since values with just a single bit set can evaluate to 0.0.
+ // Instead, we do it as
+ // 1) Do a logical or of the high/low 32 bits
+ // 2) Do a permute so we have the low 32 bits of each value in the low 8 32-bit elements
+ // 3) Do an integer comparison, and cast so we just keep the low 8 bits of the mask.
+ //
+ // By default we will use integers for the masks in the nonbonded kernels, so this shouldn't
+ // have any significant performance drawbacks.
+
+ __m512i ia = _mm512_castpd_si512(a.simdInternal_);
+
+ ia = _mm512_or_epi32(ia, _mm512_swizzle_epi32(ia, _MM_SWIZ_REG_CDAB));
+ ia = _mm512_permutevar_epi32( _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), ia);
+
+ return {
+ static_cast<__mmask8>(_mm512_cmp_epi32_mask(ia, _mm512_setzero_si512(), _MM_CMPINT_NE))
+ };
+}
+
+static inline SimdDBool gmx_simdcall
+operator&&(SimdDBool a, SimdDBool b)
+{
+ return {
+ static_cast<__mmask8>(_mm512_kand(a.simdInternal_, b.simdInternal_))
+ };
+}
+
+static inline SimdDBool gmx_simdcall
+operator||(SimdDBool a, SimdDBool b)
+{
+ return {
+ static_cast<__mmask8>(_mm512_kor(a.simdInternal_, b.simdInternal_))
+ };
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdDBool a)
+{
+ return _mm512_mask2int(a.simdInternal_) != 0;
+}
+
+static inline SimdDouble gmx_simdcall
+selectByMask(SimdDouble a, SimdDBool m)
+{
+ return {
+ _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+selectByNotMask(SimdDouble a, SimdDBool m)
+{
+ return {
+ _mm512_mask_mov_pd(a.simdInternal_, m.simdInternal_, _mm512_setzero_pd())
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+blend(SimdDouble a, SimdDouble b, SimdDBool sel)
+{
+ return {
+ _mm512_mask_blend_pd(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator<<(SimdDInt32 a, int n)
+{
+ return {
+ _mm512_slli_epi32(a.simdInternal_, n)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator>>(SimdDInt32 a, int n)
+{
+ return {
+ _mm512_srli_epi32(a.simdInternal_, n)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator&(SimdDInt32 a, SimdDInt32 b)
+{
+ return {
+ _mm512_and_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+andNot(SimdDInt32 a, SimdDInt32 b)
+{
+ return {
+ _mm512_andnot_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator|(SimdDInt32 a, SimdDInt32 b)
+{
+ return {
+ _mm512_or_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator^(SimdDInt32 a, SimdDInt32 b)
+{
+ return {
+ _mm512_xor_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator+(SimdDInt32 a, SimdDInt32 b)
+{
+ return {
+ _mm512_add_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator-(SimdDInt32 a, SimdDInt32 b)
+{
+ return {
+ _mm512_sub_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator*(SimdDInt32 a, SimdDInt32 b)
+{
+ return {
+ _mm512_mullo_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator==(SimdDInt32 a, SimdDInt32 b)
+{
+ return {
+ _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_EQ)
+ };
+}
+
+static inline SimdDIBool gmx_simdcall
+testBits(SimdDInt32 a)
+{
+ return {
+ _mm512_cmp_epi32_mask(a.simdInternal_, _mm512_setzero_si512(), _MM_CMPINT_NE)
+ };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator<(SimdDInt32 a, SimdDInt32 b)
+{
+ return {
+ _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_LT)
+ };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator&&(SimdDIBool a, SimdDIBool b)
+{
+ return {
+ _mm512_kand(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator||(SimdDIBool a, SimdDIBool b)
+{
+ return {
+ _mm512_kor(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdDIBool a)
+{
+ return ( _mm512_mask2int(a.simdInternal_) & 0xFF) != 0;
+}
+
+static inline SimdDInt32 gmx_simdcall
+selectByMask(SimdDInt32 a, SimdDIBool m)
+{
+ return {
+ _mm512_mask_mov_epi32(_mm512_setzero_epi32(), m.simdInternal_, a.simdInternal_)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+selectByNotMask(SimdDInt32 a, SimdDIBool m)
+{
+ return {
+ _mm512_mask_mov_epi32(a.simdInternal_, m.simdInternal_, _mm512_setzero_epi32())
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
+{
+ return {
+ _mm512_mask_blend_epi32(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+cvtR2I(SimdDouble a)
+{
+ return {
+ _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_NEAREST_INT)
+ };
+}
+
+static inline SimdDInt32 gmx_simdcall
+cvttR2I(SimdDouble a)
+{
+ return {
+ _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_ZERO)
+ };
+}
+
+static inline SimdDouble gmx_simdcall
+cvtI2R(SimdDInt32 a)
+{
+ return {
+ _mm512_cvtepi32lo_pd(a.simdInternal_)
+ };
+}
+
+static inline SimdDIBool gmx_simdcall
+cvtB2IB(SimdDBool a)
+{
+ return {
+ a.simdInternal_
+ };
+}
+
+static inline SimdDBool gmx_simdcall
+cvtIB2B(SimdDIBool a)
+{
+ return {
+ static_cast<__mmask8>(a.simdInternal_)
+ };
+}
+
+static inline void gmx_simdcall
+cvtF2DD(SimdFloat f, SimdDouble *d0, SimdDouble *d1)
+{
+ __m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f.simdInternal_), _MM_PERM_DCDC);
+
+ *d0 = _mm512_cvtpslo_pd(f.simdInternal_);
+ *d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
+}
+
+static inline SimdFloat gmx_simdcall
+cvtDD2F(SimdDouble d0, SimdDouble d1)
+{
+ __m512 f0 = _mm512_cvtpd_pslo(d0.simdInternal_);
+ __m512 f1 = _mm512_cvtpd_pslo(d1.simdInternal_);
+ return {
+ _mm512_mask_permute4f128_ps(f0, _mm512_int2mask(0xFF00), f1, _MM_PERM_BABA)
+ };
+}
+
+} // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
+#define GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
+
+#include "config.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include <immintrin.h>
+
+namespace gmx
+{
+
+class SimdFloat
+{
+ public:
+ SimdFloat() {}
+
+ SimdFloat(float f) : simdInternal_(_mm512_set1_ps(f)) {}
+
+ // Internal utility constructor to simplify return statements
+ SimdFloat(__m512 simd) : simdInternal_(simd) {}
+
+ __m512 simdInternal_;
+};
+
+class SimdFInt32
+{
+ public:
+ SimdFInt32() {}
+
+ SimdFInt32(std::int32_t i) : simdInternal_(_mm512_set1_epi32(i)) {}
+
+ // Internal utility constructor to simplify return statements
+ SimdFInt32(__m512i simd) : simdInternal_(simd) {}
+
+ __m512i simdInternal_;
+};
+
+class SimdFBool
+{
+ public:
+ SimdFBool() {}
+
+ SimdFBool(bool b) : simdInternal_(_mm512_int2mask( b ? 0xFFFF : 0)) {}
+
+ // Internal utility constructor to simplify return statements
+ SimdFBool(__mmask16 simd) : simdInternal_(simd) {}
+
+ __mmask16 simdInternal_;
+};
+
+class SimdFIBool
+{
+ public:
+ SimdFIBool() {}
+
+ SimdFIBool(bool b) : simdInternal_(_mm512_int2mask( b ? 0xFFFF : 0)) {}
+
+ // Internal utility constructor to simplify return statements
+ SimdFIBool(__mmask16 simd) : simdInternal_(simd) {}
+
+ __mmask16 simdInternal_;
+};
+
+static inline SimdFloat gmx_simdcall
+load(const float *m)
+{
+ assert(std::size_t(m) % 64 == 0);
+ return {
+ _mm512_load_ps(m)
+ };
+}
+
+static inline void gmx_simdcall
+store(float *m, SimdFloat a)
+{
+ assert(std::size_t(m) % 64 == 0);
+ _mm512_store_ps(m, a.simdInternal_);
+}
+
+static inline SimdFloat gmx_simdcall
+loadU(const float *m)
+{
+ return {
+ _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m+16)
+ };
+}
+
+static inline void gmx_simdcall
+storeU(float *m, SimdFloat a)
+{
+ _mm512_packstorelo_ps(m, a.simdInternal_);
+ _mm512_packstorehi_ps(m+16, a.simdInternal_);
+}
+
+static inline SimdFloat gmx_simdcall
+setZeroF()
+{
+ return {
+ _mm512_setzero_ps()
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+loadFI(const std::int32_t * m)
+{
+ assert(std::size_t(m) % 64 == 0);
+ return {
+ _mm512_load_epi32(m)
+ };
+}
+
+static inline void gmx_simdcall
+store(std::int32_t * m, SimdFInt32 a)
+{
+ assert(std::size_t(m) % 64 == 0);
+ _mm512_store_epi32(m, a.simdInternal_);
+}
+
+static inline SimdFInt32 gmx_simdcall
+loadUFI(const std::int32_t *m)
+{
+ return {
+ _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m+16)
+ };
+}
+
+static inline void gmx_simdcall
+storeU(std::int32_t * m, SimdFInt32 a)
+{
+ _mm512_packstorelo_epi32(m, a.simdInternal_);
+ _mm512_packstorehi_epi32(m+16, a.simdInternal_);
+}
+
+static inline SimdFInt32 gmx_simdcall
+setZeroFI()
+{
+ return {
+ _mm512_setzero_si512()
+ };
+}
+
+
+template<int index>
+static inline std::int32_t gmx_simdcall
+extract(SimdFInt32 a)
+{
+ int r;
+ _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a.simdInternal_);
+ return r;
+}
+
+static inline SimdFloat gmx_simdcall
+operator&(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+andNot(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+operator|(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+operator^(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(b.simdInternal_)))
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+operator+(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_add_ps(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+operator-(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_sub_ps(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+operator-(SimdFloat x)
+{
+ return {
+ _mm512_addn_ps(x.simdInternal_, _mm512_setzero_ps())
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+operator*(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_mul_ps(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+fma(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+ return {
+ _mm512_fmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+fms(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+ return {
+ _mm512_fmsub_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+fnma(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+ return {
+ _mm512_fnmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+fnms(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+ return {
+ _mm512_fnmsub_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+rsqrt(SimdFloat x)
+{
+ return {
+ _mm512_rsqrt23_ps(x.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+rcp(SimdFloat x)
+{
+ return {
+ _mm512_rcp23_ps(x.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+maskAdd(SimdFloat a, SimdFloat b, SimdFBool m)
+{
+ return {
+ _mm512_mask_add_ps(a.simdInternal_, m.simdInternal_, a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzMul(SimdFloat a, SimdFloat b, SimdFBool m)
+{
+ return {
+ _mm512_mask_mul_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzFma(SimdFloat a, SimdFloat b, SimdFloat c, SimdFBool m)
+{
+ return {
+ _mm512_mask_mov_ps(_mm512_setzero_ps(), m.simdInternal_, _mm512_fmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_))
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzRsqrt(SimdFloat x, SimdFBool m)
+{
+ return {
+ _mm512_mask_rsqrt23_ps(_mm512_setzero_ps(), m.simdInternal_, x.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzRcp(SimdFloat x, SimdFBool m)
+{
+ return {
+ _mm512_mask_rcp23_ps(_mm512_setzero_ps(), m.simdInternal_, x.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+abs(SimdFloat x)
+{
+ return {
+ _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)), _mm512_castps_si512(x.simdInternal_)))
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+max(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_gmax_ps(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+min(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_gmin_ps(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+round(SimdFloat x)
+{
+ return {
+ _mm512_round_ps(x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+trunc(SimdFloat x)
+{
+ return {
+ _mm512_round_ps(x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+frexp(SimdFloat value, SimdFInt32 * exponent)
+{
+ __m512 rExponent = _mm512_getexp_ps(value.simdInternal_);
+ __m512i iExponent = _mm512_cvtfxpnt_round_adjustps_epi32(rExponent, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
+
+ exponent->simdInternal_ = _mm512_add_epi32(iExponent, _mm512_set1_epi32(1));
+
+ return {
+ _mm512_getmant_ps(value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+ldexp(SimdFloat value, SimdFInt32 exponent)
+{
+ const __m512i exponentBias = _mm512_set1_epi32(127);
+ __m512i iExponent;
+
+ iExponent = _mm512_slli_epi32( _mm512_add_epi32(exponent.simdInternal_, exponentBias), 23);
+
+ return {
+ _mm512_mul_ps(value.simdInternal_, _mm512_castsi512_ps(iExponent))
+ };
+}
+
+static inline float gmx_simdcall
+reduce(SimdFloat a)
+{
+ return _mm512_reduce_add_ps(a.simdInternal_);
+}
+
+// Picky, picky, picky:
+// icc-16 complains about "Illegal value of immediate argument to intrinsic"
+// unless we use
+// 1) Ordered-quiet for ==
+// 2) Unordered-quiet for !=
+// 3) Ordered-signaling for < and <=
+
+static inline SimdFBool gmx_simdcall
+operator==(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
+ };
+}
+
+static inline SimdFBool gmx_simdcall
+operator!=(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ)
+ };
+}
+
+static inline SimdFBool gmx_simdcall
+operator<(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_LT_OS)
+ };
+}
+
+static inline SimdFBool gmx_simdcall
+operator<=(SimdFloat a, SimdFloat b)
+{
+ return {
+ _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_LE_OS)
+ };
+}
+
+static inline SimdFBool gmx_simdcall
+testBits(SimdFloat a)
+{
+ return {
+ _mm512_test_epi32_mask( _mm512_castps_si512(a.simdInternal_), _mm512_castps_si512(a.simdInternal_) )
+ };
+}
+
+static inline SimdFBool gmx_simdcall
+operator&&(SimdFBool a, SimdFBool b)
+{
+ return {
+ _mm512_kand(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFBool gmx_simdcall
+operator||(SimdFBool a, SimdFBool b)
+{
+ return {
+ _mm512_kor(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdFBool a)
+{
+ return _mm512_mask2int(a.simdInternal_) != 0;
+}
+
+static inline SimdFloat gmx_simdcall
+selectByMask(SimdFloat a, SimdFBool m)
+{
+ return {
+ _mm512_mask_mov_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+selectByNotMask(SimdFloat a, SimdFBool m)
+{
+ return {
+ _mm512_mask_mov_ps(a.simdInternal_, m.simdInternal_, _mm512_setzero_ps())
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+blend(SimdFloat a, SimdFloat b, SimdFBool sel)
+{
+ return {
+ _mm512_mask_blend_ps(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator<<(SimdFInt32 a, int n)
+{
+ return {
+ _mm512_slli_epi32(a.simdInternal_, n)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator>>(SimdFInt32 a, int n)
+{
+ return {
+ _mm512_srli_epi32(a.simdInternal_, n)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator&(SimdFInt32 a, SimdFInt32 b)
+{
+ return {
+ _mm512_and_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+andNot(SimdFInt32 a, SimdFInt32 b)
+{
+ return {
+ _mm512_andnot_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator|(SimdFInt32 a, SimdFInt32 b)
+{
+ return {
+ _mm512_or_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator^(SimdFInt32 a, SimdFInt32 b)
+{
+ return {
+ _mm512_xor_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator+(SimdFInt32 a, SimdFInt32 b)
+{
+ return {
+ _mm512_add_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator-(SimdFInt32 a, SimdFInt32 b)
+{
+ return {
+ _mm512_sub_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator*(SimdFInt32 a, SimdFInt32 b)
+{
+ return {
+ _mm512_mullo_epi32(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator==(SimdFInt32 a, SimdFInt32 b)
+{
+ return {
+ _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_EQ)
+ };
+}
+
+static inline SimdFIBool gmx_simdcall
+testBits(SimdFInt32 a)
+{
+ return {
+ _mm512_test_epi32_mask( a.simdInternal_, a.simdInternal_ )
+ };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator<(SimdFInt32 a, SimdFInt32 b)
+{
+ return {
+ _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_LT)
+ };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator&&(SimdFIBool a, SimdFIBool b)
+{
+ return {
+ _mm512_kand(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator||(SimdFIBool a, SimdFIBool b)
+{
+ return {
+ _mm512_kor(a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdFIBool a)
+{
+ return _mm512_mask2int(a.simdInternal_) != 0;
+}
+
+static inline SimdFInt32 gmx_simdcall
+selectByMask(SimdFInt32 a, SimdFIBool m)
+{
+ return {
+ _mm512_mask_mov_epi32(_mm512_setzero_epi32(), m.simdInternal_, a.simdInternal_)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+selectByNotMask(SimdFInt32 a, SimdFIBool m)
+{
+ return {
+ _mm512_mask_mov_epi32(a.simdInternal_, m.simdInternal_, _mm512_setzero_epi32())
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+blend(SimdFInt32 a, SimdFInt32 b, SimdFIBool sel)
+{
+ return {
+ _mm512_mask_blend_epi32(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+cvtR2I(SimdFloat a)
+{
+ return {
+ _mm512_cvtfxpnt_round_adjustps_epi32(a.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+ };
+}
+
+static inline SimdFInt32 gmx_simdcall
+cvttR2I(SimdFloat a)
+{
+ return {
+ _mm512_cvtfxpnt_round_adjustps_epi32(a.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+cvtI2R(SimdFInt32 a)
+{
+ return {
+ _mm512_cvtfxpnt_round_adjustepi32_ps(a.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+ };
+}
+
+static inline SimdFIBool gmx_simdcall
+cvtB2IB(SimdFBool a)
+{
+ return {
+ a.simdInternal_
+ };
+}
+
+static inline SimdFBool gmx_simdcall
+cvtIB2B(SimdFIBool a)
+{
+ return {
+ a.simdInternal_
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+exp2(SimdFloat x)
+{
+ return {
+ _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(x.simdInternal_, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24))
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+exp(SimdFloat x)
+{
+ const __m512 argscale = _mm512_set1_ps(1.44269504088896341f);
+ const __m512 invargscale = _mm512_set1_ps(-0.69314718055994528623f);
+
+ __m512 xscaled = _mm512_mul_ps(x.simdInternal_, argscale);
+ __m512 r = _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(xscaled, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
+
+ // exp2a23_ps provides 23 bits of accuracy, but we ruin some of that with our argument
+ // scaling. To correct this, we find the difference between the scaled argument and
+ // the true one (extended precision arithmetics does not appear to be necessary to
+ // fulfill our accuracy requirements) and then multiply by the exponent of this
+ // correction since exp(a+b)=exp(a)*exp(b).
+ // Note that this only adds two instructions (and maybe some constant loads).
+
+ // find the difference
+ x = _mm512_fmadd_ps(invargscale, xscaled, x.simdInternal_);
+ // x will now be a _very_ small number, so approximate exp(x)=1+x.
+ // We should thus apply the correction as r'=r*(1+x)=r+r*x
+ r = _mm512_fmadd_ps(r, x.simdInternal_, r);
+ return {
+ r
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+log(SimdFloat x)
+{
+ return {
+ _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764f), _mm512_log2ae23_ps(x.simdInternal_))
+ };
+}
+
+} // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
+#define GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
+
+#include "config.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include <immintrin.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_x86_mic_simd_double.h"
+
+namespace gmx
+{
+
+// On MIC it is better to use scatter operations, so we define the load routines
+// that use a SIMD offset variable first.
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const double * base,
+ SimdDInt32 simdoffset,
+ SimdDouble * v0,
+ SimdDouble * v1,
+ SimdDouble * v2,
+ SimdDouble * v3)
+{
+ assert((size_t)base % 32 == 0);
+ assert(align % 4 == 0);
+
+ // All instructions might be latency ~4 on MIC, so we use shifts where we
+ // only need a single instruction (since the shift parameter is an immediate),
+ // but multiplication otherwise.
+ if (align == 4)
+ {
+ simdoffset = simdoffset << 2;
+ }
+ else if (align == 8)
+ {
+ simdoffset = simdoffset << 3;
+ }
+ else
+ {
+ simdoffset = simdoffset * SimdDInt32(align);
+ }
+
+ v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
+ v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
+ v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+2, sizeof(double));
+ v3->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+3, sizeof(double));
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUBySimdIntTranspose(const double * base,
+ SimdDInt32 simdoffset,
+ SimdDouble * v0,
+ SimdDouble * v1)
+{
+ // All instructions might be latency ~4 on MIC, so we use shifts where we
+ // only need a single instruction (since the shift parameter is an immediate),
+ // but multiplication otherwise.
+ if (align == 2)
+ {
+ simdoffset = simdoffset << 1;
+ }
+ else if (align == 4)
+ {
+ simdoffset = simdoffset << 2;
+ }
+ else if (align == 8)
+ {
+ simdoffset = simdoffset << 3;
+ }
+ else
+ {
+ simdoffset = simdoffset * SimdDInt32(align);
+ }
+
+ v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
+ v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const double * base,
+ SimdDInt32 simdoffset,
+ SimdDouble * v0,
+ SimdDouble * v1)
+{
+ assert(std::size_t(base) % 16 == 0);
+ assert(align % 2 == 0);
+ gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
+}
+
+
+
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTranspose(const double * base,
+ const std::int32_t offset[],
+ SimdDouble * v0,
+ SimdDouble * v1,
+ SimdDouble * v2,
+ SimdDouble * v3)
+{
+ gatherLoadBySimdIntTranspose<align>(base, loadDI(offset), v0, v1, v2, v3);
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTranspose(const double * base,
+ const std::int32_t offset[],
+ SimdDouble * v0,
+ SimdDouble * v1)
+{
+ gatherLoadBySimdIntTranspose<align>(base, loadDI(offset), v0, v1);
+}
+
+static const int c_simdBestPairAlignmentDouble = 2;
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTranspose(const double * base,
+ const std::int32_t offset[],
+ SimdDouble * v0,
+ SimdDouble * v1,
+ SimdDouble * v2)
+{
+ SimdDInt32 simdoffset;
+
+ assert(std::size_t(offset) % 32 == 0);
+
+ simdoffset = loadDI(offset);
+
+ // All instructions might be latency ~4 on MIC, so we use shifts where we
+ // only need a single instruction (since the shift parameter is an immediate),
+ // but multiplication otherwise.
+ if (align == 4)
+ {
+ simdoffset = simdoffset << 2;
+ }
+ else if (align == 8)
+ {
+ simdoffset = simdoffset << 3;
+ }
+ else
+ {
+ simdoffset = simdoffset * SimdDInt32(align);
+ }
+
+ v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
+ v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+1, sizeof(double));
+ v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base+2, sizeof(double));
+}
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterStoreU(double * base,
+ const std::int32_t offset[],
+ SimdDouble v0,
+ SimdDouble v1,
+ SimdDouble v2)
+{
+ SimdDInt32 simdoffset;
+
+ assert(std::size_t(offset) % 32 == 0);
+
+ simdoffset = loadDI(offset);
+
+ // All instructions might be latency ~4 on MIC, so we use shifts where we
+ // only need a single instruction (since the shift parameter is an immediate),
+ // but multiplication otherwise.
+ if (align == 4)
+ {
+ simdoffset = simdoffset << 2;
+ }
+ else if (align == 8)
+ {
+ simdoffset = simdoffset << 3;
+ }
+ else
+ {
+ simdoffset = simdoffset * SimdDInt32(align);
+ }
+
+ _mm512_i32loscatter_pd(base, simdoffset.simdInternal_, v0.simdInternal_, sizeof(double));
+ _mm512_i32loscatter_pd(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(double));
+ _mm512_i32loscatter_pd(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(double));
+}
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterIncrU(double * base,
+ const std::int32_t offset[],
+ SimdDouble v0,
+ SimdDouble v1,
+ SimdDouble v2)
+{
+ GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) rdata0[GMX_SIMD_DOUBLE_WIDTH];
+ GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) rdata1[GMX_SIMD_DOUBLE_WIDTH];
+ GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) rdata2[GMX_SIMD_DOUBLE_WIDTH];
+
+ store(rdata0, v0);
+ store(rdata1, v1);
+ store(rdata2, v2);
+
+ for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ base[ align * offset[i] + 0] += rdata0[i];
+ base[ align * offset[i] + 1] += rdata1[i];
+ base[ align * offset[i] + 2] += rdata2[i];
+ }
+}
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterDecrU(double * base,
+ const std::int32_t offset[],
+ SimdDouble v0,
+ SimdDouble v1,
+ SimdDouble v2)
+{
+ GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) rdata0[GMX_SIMD_DOUBLE_WIDTH];
+ GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) rdata1[GMX_SIMD_DOUBLE_WIDTH];
+ GMX_ALIGNED(double, GMX_SIMD_DOUBLE_WIDTH) rdata2[GMX_SIMD_DOUBLE_WIDTH];
+
+ store(rdata0, v0);
+ store(rdata1, v1);
+ store(rdata2, v2);
+
+ for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ base[ align * offset[i] + 0] -= rdata0[i];
+ base[ align * offset[i] + 1] -= rdata1[i];
+ base[ align * offset[i] + 2] -= rdata2[i];
+ }
+}
+
+static inline void gmx_simdcall
+expandScalarsToTriplets(SimdDouble scalar,
+ SimdDouble * triplets0,
+ SimdDouble * triplets1,
+ SimdDouble * triplets2)
+{
+ triplets0->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0),
+ _mm512_castpd_si512(scalar.simdInternal_)));
+ triplets1->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(11, 10, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 5, 4),
+ _mm512_castpd_si512(scalar.simdInternal_)));
+ triplets2->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(_mm512_set_epi32(15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 11, 10, 11, 10),
+ _mm512_castpd_si512(scalar.simdInternal_)));
+}
+
+
+static inline double gmx_simdcall
+reduceIncr4ReturnSum(double * m,
+ SimdDouble v0,
+ SimdDouble v1,
+ SimdDouble v2,
+ SimdDouble v3)
+{
+ double d;
+ __m512d t0, t1, t2, t3;
+
+ assert(std::size_t(m) % 32 == 0);
+
+ t0 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v0.simdInternal_, v2.simdInternal_), _MM_SWIZ_REG_BADC);
+ t2 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v2.simdInternal_, v0.simdInternal_);
+ t1 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v1.simdInternal_, v3.simdInternal_), _MM_SWIZ_REG_BADC);
+ t3 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v3.simdInternal_, v1.simdInternal_);
+ t0 = _mm512_add_pd(t0, t2);
+ t1 = _mm512_add_pd(t1, t3);
+
+ t2 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0b01010101), t0, t1), _MM_SWIZ_REG_CDAB);
+ t3 = _mm512_mask_blend_pd(_mm512_int2mask(0b01010101), t1, t0);
+ t2 = _mm512_add_pd(t2, t3);
+
+ t2 = _mm512_add_pd(t2, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(t2), _MM_PERM_BADC)));
+
+ t0 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+ t0 = _mm512_add_pd(t0, t2);
+ _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t0);
+
+ t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_BADC));
+ t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_CDAB));
+
+ _mm512_mask_packstorelo_pd(&d, _mm512_mask2int(0x01), t2);
+ return d;
+}
+
+static inline SimdDouble gmx_simdcall
+loadDualHsimd(const double * m0,
+ const double * m1)
+{
+ assert(std::size_t(m0) % 32 == 0);
+ assert(std::size_t(m1) % 32 == 0);
+
+ return _mm512_mask_extload_pd(_mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE), _mm512_int2mask(0xF0),
+ m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+}
+
+static inline SimdDouble gmx_simdcall
+loadDuplicateHsimd(const double * m)
+{
+ assert(std::size_t(m) % 32 == 0);
+
+ return _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+}
+
+static inline SimdDouble gmx_simdcall
+load1DualHsimd(const double * m)
+{
+ return _mm512_mask_extload_pd(_mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE), _mm512_int2mask(0xF0),
+ m+1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+}
+
+
+static inline void gmx_simdcall
+storeDualHsimd(double * m0,
+ double * m1,
+ SimdDouble a)
+{
+ assert(std::size_t(m0) % 32 == 0);
+ assert(std::size_t(m1) % 32 == 0);
+
+ _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), a.simdInternal_);
+ _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), a.simdInternal_);
+}
+
+static inline void gmx_simdcall
+incrDualHsimd(double * m0,
+ double * m1,
+ SimdDouble a)
+{
+ assert(std::size_t(m0) % 32 == 0);
+ assert(std::size_t(m1) % 32 == 0);
+
+ __m512d x;
+
+ // Update lower half
+ x = _mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+ x = _mm512_add_pd(x, a.simdInternal_);
+ _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), x);
+
+ // Update upper half
+ x = _mm512_extload_pd(m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+ x = _mm512_add_pd(x, a.simdInternal_);
+ _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), x);
+}
+
+static inline void gmx_simdcall
+decrHsimd(double * m,
+ SimdDouble a)
+{
+ __m512d t;
+
+ assert(std::size_t(m) % 32 == 0);
+
+ t = _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+ a.simdInternal_ = _mm512_add_pd(a.simdInternal_, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(a.simdInternal_), _MM_PERM_BADC)));
+ t = _mm512_sub_pd(t, a.simdInternal_);
+ _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0x0F), t);
+}
+
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTransposeHsimd(const double * base0,
+ const double * base1,
+ const std::int32_t offset[],
+ SimdDouble * v0,
+ SimdDouble * v1)
+{
+ __m512i idx0, idx1, idx;
+ __m512d tmp1, tmp2;
+
+ assert(std::size_t(offset) % 16 == 0);
+ assert(std::size_t(base0) % 16 == 0);
+ assert(std::size_t(base1) % 16 == 0);
+ assert(std::size_t(align) % 2 == 0);
+
+ idx0 = _mm512_extload_epi32(offset, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
+
+ idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
+ idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));
+
+ idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0x00F0), idx1, _MM_PERM_AAAA);
+
+ tmp1 = _mm512_i32logather_pd(idx, base0, sizeof(double));
+ tmp2 = _mm512_i32logather_pd(idx, base1, sizeof(double));
+
+ v0->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(tmp1), _mm512_int2mask(0xFF00), _mm512_castpd_ps(tmp2), _MM_PERM_BABA));
+ v1->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(tmp2), _mm512_int2mask(0x00FF), _mm512_castpd_ps(tmp1), _MM_PERM_DCDC));
+}
+
+static inline double gmx_simdcall
+reduceIncr4ReturnSumHsimd(double * m,
+ SimdDouble v0,
+ SimdDouble v1)
+{
+ double d;
+ __m512d t0, t1;
+
+ assert(std::size_t(m) % 32 == 0);
+
+ t0 = _mm512_add_pd(v0.simdInternal_, _mm512_swizzle_pd(v0.simdInternal_, _MM_SWIZ_REG_BADC));
+ t0 = _mm512_mask_add_pd(t0, _mm512_int2mask(0xCC), v1.simdInternal_, _mm512_swizzle_pd(v1.simdInternal_, _MM_SWIZ_REG_BADC));
+ t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
+ t0 = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(t0), _mm512_int2mask(0xCCCC),
+ _mm512_castpd_ps(t0), _MM_PERM_DCDC));
+
+ t1 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
+ t1 = _mm512_add_pd(t1, t0);
+ _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t1);
+
+ t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_BADC));
+ t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
+
+ _mm512_mask_packstorelo_pd(&d, _mm512_mask2int(0x03), t0);
+ return d;
+}
+
+} // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
+#define GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
+
+#include "config.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include <immintrin.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_x86_mic_simd_float.h"
+
+namespace gmx
+{
+
+// On MIC it is better to use scatter operations, so we define the load routines
+// that use a SIMD offset variable first.
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const float * base,
+ SimdFInt32 simdoffset,
+ SimdFloat * v0,
+ SimdFloat * v1,
+ SimdFloat * v2,
+ SimdFloat * v3)
+{
+ assert(std::size_t(base) % 16 == 0);
+ assert(align % 4 == 0);
+
+ // All instructions might be latency ~4 on MIC, so we use shifts where we
+ // only need a single instruction (since the shift parameter is an immediate),
+ // but multiplication otherwise.
+ if (align == 4)
+ {
+ simdoffset = simdoffset << 2;
+ }
+ else if (align == 8)
+ {
+ simdoffset = simdoffset << 3;
+ }
+ else
+ {
+ simdoffset = simdoffset * SimdFInt32(align);
+ }
+
+ v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
+ v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
+ v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+2, sizeof(float));
+ v3->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+3, sizeof(float));
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUBySimdIntTranspose(const float * base,
+ SimdFInt32 simdoffset,
+ SimdFloat * v0,
+ SimdFloat * v1)
+{
+ // All instructions might be latency ~4 on MIC, so we use shifts where we
+ // only need a single instruction (since the shift parameter is an immediate),
+ // but multiplication otherwise.
+ // For align == 2 we can merge the constant into the scale parameter,
+ // which can take constants up to 8 in total.
+ if (align == 2)
+ {
+ v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, align * sizeof(float));
+ v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, align * sizeof(float));
+ }
+ else
+ {
+ if (align == 4)
+ {
+ simdoffset = simdoffset << 2;
+ }
+ else if (align == 8)
+ {
+ simdoffset = simdoffset << 3;
+ }
+ else
+ {
+ simdoffset = simdoffset * SimdFInt32(align);
+ }
+ v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
+ v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
+ }
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const float * base,
+ SimdFInt32 simdoffset,
+ SimdFloat * v0,
+ SimdFloat * v1)
+{
+ assert(std::size_t(base) % 8 == 0);
+ assert(align % 2 == 0);
+ gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTranspose(const float * base,
+ const std::int32_t offset[],
+ SimdFloat * v0,
+ SimdFloat * v1,
+ SimdFloat * v2,
+ SimdFloat * v3)
+{
+ gatherLoadBySimdIntTranspose<align>(base, loadFI(offset), v0, v1, v2, v3);
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTranspose(const float * base,
+ const std::int32_t offset[],
+ SimdFloat * v0,
+ SimdFloat * v1)
+{
+ gatherLoadBySimdIntTranspose<align>(base, loadFI(offset), v0, v1);
+}
+
+static const int c_simdBestPairAlignmentFloat = 2;
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTranspose(const float * base,
+ const std::int32_t offset[],
+ SimdFloat * v0,
+ SimdFloat * v1,
+ SimdFloat * v2)
+{
+ SimdFInt32 simdoffset;
+
+ assert(std::size_t(offset) % 64 == 0);
+
+ simdoffset = loadFI(offset);
+
+ // All instructions might be latency ~4 on MIC, so we use shifts where we
+ // only need a single instruction (since the shift parameter is an immediate),
+ // but multiplication otherwise.
+ if (align == 4)
+ {
+ simdoffset = simdoffset << 2;
+ }
+ else if (align == 8)
+ {
+ simdoffset = simdoffset << 3;
+ }
+ else
+ {
+ simdoffset = simdoffset * SimdFInt32(align);
+ }
+
+ v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
+ v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+1, sizeof(float));
+ v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base+2, sizeof(float));
+}
+
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterStoreU(float * base,
+ const std::int32_t offset[],
+ SimdFloat v0,
+ SimdFloat v1,
+ SimdFloat v2)
+{
+ SimdFInt32 simdoffset;
+
+ assert(sdt::size_t(offset) % 64 == 0);
+
+ simdoffset = loadFI(offset);
+
+ // All instructions might be latency ~4 on MIC, so we use shifts where we
+ // only need a single instruction (since the shift parameter is an immediate),
+ // but multiplication otherwise.
+ if (align == 4)
+ {
+ simdoffset = simdoffset << 2;
+ }
+ else if (align == 8)
+ {
+ simdoffset = simdoffset << 3;
+ }
+ else
+ {
+ simdoffset = simdoffset * SimdFInt32(align);
+ }
+
+ _mm512_i32scatter_ps(base, simdoffset.simdInternal_, v0.simdInternal_, sizeof(float));
+ _mm512_i32scatter_ps(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(float));
+ _mm512_i32scatter_ps(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(float));
+}
+
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterIncrU(float * base,
+ const std::int32_t offset[],
+ SimdFloat v0,
+ SimdFloat v1,
+ SimdFloat v2)
+{
+ GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH) rdata0[GMX_SIMD_FLOAT_WIDTH];
+ GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH) rdata1[GMX_SIMD_FLOAT_WIDTH];
+ GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH) rdata2[GMX_SIMD_FLOAT_WIDTH];
+
+ store(rdata0, v0);
+ store(rdata1, v1);
+ store(rdata2, v2);
+
+ for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ base[ align * offset[i] + 0] += rdata0[i];
+ base[ align * offset[i] + 1] += rdata1[i];
+ base[ align * offset[i] + 2] += rdata2[i];
+ }
+}
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterDecrU(float * base,
+ const std::int32_t offset[],
+ SimdFloat v0,
+ SimdFloat v1,
+ SimdFloat v2)
+{
+ GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH) rdata0[GMX_SIMD_FLOAT_WIDTH];
+ GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH) rdata1[GMX_SIMD_FLOAT_WIDTH];
+ GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH) rdata2[GMX_SIMD_FLOAT_WIDTH];
+
+ store(rdata0, v0);
+ store(rdata1, v1);
+ store(rdata2, v2);
+
+ for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ base[ align * offset[i] + 0] -= rdata0[i];
+ base[ align * offset[i] + 1] -= rdata1[i];
+ base[ align * offset[i] + 2] -= rdata2[i];
+ }
+}
+
+static inline void gmx_simdcall
+expandScalarsToTriplets(SimdFloat scalar,
+ SimdFloat * triplets0,
+ SimdFloat * triplets1,
+ SimdFloat * triplets2)
+{
+ triplets0->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0),
+ _mm512_castps_si512(scalar.simdInternal_)));
+ triplets1->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5),
+ _mm512_castps_si512(scalar.simdInternal_)));
+ triplets2->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(_mm512_set_epi32(15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10),
+ _mm512_castps_si512(scalar.simdInternal_)));
+}
+
+
+static inline float gmx_simdcall
+reduceIncr4ReturnSum(float * m,
+ SimdFloat v0,
+ SimdFloat v1,
+ SimdFloat v2,
+ SimdFloat v3)
+{
+ float f;
+ __m512 t0, t1, t2, t3;
+
+ assert(std::size_t(m) % 16 == 0);
+
+ t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
+ t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v2.simdInternal_, _mm512_swizzle_ps(v2.simdInternal_, _MM_SWIZ_REG_BADC));
+ t1 = _mm512_add_ps(v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
+ t1 = _mm512_mask_add_ps(t1, _mm512_int2mask(0xCCCC), v3.simdInternal_, _mm512_swizzle_ps(v3.simdInternal_, _MM_SWIZ_REG_BADC));
+ t2 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
+ t2 = _mm512_mask_add_ps(t2, _mm512_int2mask(0xAAAA), t1, _mm512_swizzle_ps(t1, _MM_SWIZ_REG_CDAB));
+
+ t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_BADC));
+ t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_CDAB));
+
+ t0 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
+ t0 = _mm512_add_ps(t0, t2);
+ _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t0);
+
+ t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_BADC));
+ t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_CDAB));
+
+ _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t2);
+ return f;
+}
+
+static inline SimdFloat gmx_simdcall
+loadDualHsimd(const float * m0,
+ const float * m1)
+{
+ assert(std::size_t(m0) % 32 == 0);
+ assert(std::size_t(m1) % 32 == 0);
+
+ return _mm512_castpd_ps(_mm512_mask_extload_pd(_mm512_extload_pd(reinterpret_cast<const double *>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE),
+ _mm512_int2mask(0xF0), reinterpret_cast<const double *>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+}
+
+static inline SimdFloat gmx_simdcall
+loadDuplicateHsimd(const float * m)
+{
+ assert(std::size_t(m) % 32 == 0);
+
+ return _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+}
+
+static inline SimdFloat gmx_simdcall
+load1DualHsimd(const float * m)
+{
+ return _mm512_mask_extload_ps(_mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE), _mm512_int2mask(0xFF00),
+ m+1, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
+}
+
+
+static inline void gmx_simdcall
+storeDualHsimd(float * m0,
+ float * m1,
+ SimdFloat a)
+{
+ __m512 t0;
+
+ assert(std::size_t(m0) % 32 == 0);
+ assert(std::size_t(m1) % 32 == 0);
+
+ _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), a.simdInternal_);
+ _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), a.simdInternal_);
+}
+
+static inline void gmx_simdcall
+incrDualHsimd(float * m0,
+ float * m1,
+ SimdFloat a)
+{
+ assert(std::size_t(m0) % 32 == 0);
+ assert(std::size_t(m1) % 32 == 0);
+
+ __m512 x;
+
+ // Update lower half
+ x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+ x = _mm512_add_ps(x, a.simdInternal_);
+ _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), x);
+
+ // Update upper half
+ x = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+ x = _mm512_add_ps(x, a.simdInternal_);
+ _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), x);
+}
+
+static inline void gmx_simdcall
+decrHsimd(float * m,
+ SimdFloat a)
+{
+ __m512 t;
+
+ assert(std::size_t(m) % 32 == 0);
+
+ t = _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double *>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
+ a = _mm512_add_ps(a.simdInternal_, _mm512_permute4f128_ps(a.simdInternal_, _MM_PERM_BADC));
+ t = _mm512_sub_ps(t, a.simdInternal_);
+ _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0x00FF), t);
+}
+
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTransposeHsimd(const float * base0,
+ const float * base1,
+ const std::int32_t offset[],
+ SimdFloat * v0,
+ SimdFloat * v1)
+{
+ __m512i idx0, idx1, idx;
+ __m512 tmp1, tmp2;
+
+ assert(std::size_t(offset) % 32 == 0);
+ assert(std::size_t(base0) % 8 == 0);
+ assert(std::size_t(base1) % 8 == 0);
+ assert(std::size_t(align) % 2 == 0);
+
+ idx0 = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), offset);
+
+ idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
+ idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));
+
+ idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0xFF00), idx1, _MM_PERM_BABA);
+
+ tmp1 = _mm512_i32gather_ps(idx, base0, sizeof(float));
+ tmp2 = _mm512_i32gather_ps(idx, base1, sizeof(float));
+
+ v0->simdInternal_ = _mm512_mask_permute4f128_ps(tmp1, _mm512_int2mask(0xFF00), tmp2, _MM_PERM_BABA);
+ v1->simdInternal_ = _mm512_mask_permute4f128_ps(tmp2, _mm512_int2mask(0x00FF), tmp1, _MM_PERM_DCDC);
+}
+
+static inline float gmx_simdcall
+reduceIncr4ReturnSumHsimd(float * m,
+ SimdFloat v0,
+ SimdFloat v1)
+{
+ float f;
+ __m512 t0, t1;
+
+ assert(std::size_t(m) % 32 == 0);
+
+ t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
+ t0 = _mm512_mask_add_ps(t0, _mm512_int2mask(0xCCCC), v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
+ t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
+ t0 = _mm512_add_ps(t0, _mm512_castpd_ps(_mm512_swizzle_pd(_mm512_castps_pd(t0), _MM_SWIZ_REG_BADC)));
+ t0 = _mm512_mask_permute4f128_ps(t0, _mm512_int2mask(0xAAAA), t0, _MM_PERM_BADC);
+ t1 = _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
+ t1 = _mm512_add_ps(t1, t0);
+ _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t1);
+
+ t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_BADC));
+ t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
+
+ _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t0);
+ return f;
+}
+
+} // namespace gmx
+
+#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
* \{
*/
-// Many SIMD architectures other than reference are temporarily disabled in this commit
-#if GMX_SIMD_X86_AVX2_256
-# include "impl_x86_avx2_256/impl_x86_avx2_256.h"
-#elif GMX_SIMD_X86_AVX_256
-# include "impl_x86_avx_256/impl_x86_avx_256.h"
-#elif GMX_SIMD_X86_AVX_128_FMA
-# include "impl_x86_avx_128_fma/impl_x86_avx_128_fma.h"
+#if GMX_SIMD_X86_SSE2
+# include "impl_x86_sse2/impl_x86_sse2.h"
#elif GMX_SIMD_X86_SSE4_1
# include "impl_x86_sse4_1/impl_x86_sse4_1.h"
-#elif GMX_SIMD_X86_SSE2
-# include "impl_x86_sse2/impl_x86_sse2.h"
-#elif GMX_SIMD_ARM_NEON_ASIMD
-# include "impl_arm_neon_asimd/impl_arm_neon_asimd.h"
+#elif GMX_SIMD_X86_AVX_128_FMA
+# include "impl_x86_avx_128_fma/impl_x86_avx_128_fma.h"
+#elif GMX_SIMD_X86_AVX_256
+# include "impl_x86_avx_256/impl_x86_avx_256.h"
+#elif GMX_SIMD_X86_AVX2_256
+# include "impl_x86_avx2_256/impl_x86_avx2_256.h"
+#elif GMX_SIMD_X86_MIC
+# include "impl_x86_mic/impl_x86_mic.h"
#elif GMX_SIMD_ARM_NEON
# include "impl_arm_neon/impl_arm_neon.h"
+#elif GMX_SIMD_ARM_NEON_ASIMD
+# include "impl_arm_neon_asimd/impl_arm_neon_asimd.h"
#elif GMX_SIMD_IBM_QPX
# include "impl_ibm_qpx/impl_ibm_qpx.h"
#elif GMX_SIMD_IBM_VMX
}
}
+TEST_F(SimdFloatingpointUtilTest, incrDualHsimd)
+{
+ real reference[GMX_SIMD_REAL_WIDTH];
+ SimdReal v0;
+
+ // Create reference values
+ for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ reference[i] = val0_[i] + val2_[i];
+ }
+
+ // Point p to the upper half of val0_
+ real * p = val0_ + GMX_SIMD_REAL_WIDTH / 2;
+
+ v0 = load(val2_);
+ incrDualHsimd(val0_, p, v0);
+
+ for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ EXPECT_EQ(reference[i], val0_[i]);
+ }
+}
+
+TEST_F(SimdFloatingpointUtilTest, incrDualHsimdOverlapping)
+{
+ real reference[GMX_SIMD_REAL_WIDTH/2];
+ SimdReal v0;
+
+ // Create reference values
+ for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
+ {
+ reference[i] = val0_[i] + val2_[i] + val2_[GMX_SIMD_REAL_WIDTH/2+i];
+ }
+
+ v0 = load(val2_);
+ incrDualHsimd(val0_, val0_, v0);
+
+ for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
+ {
+ EXPECT_EQ(reference[i], val0_[i]);
+ }
+}
+
TEST_F(SimdFloatingpointUtilTest, decrHsimd)
{
SimdReal v0;