GMX_SIMD
"SIMD instruction set for CPU kernels and compiler optimization"
"AUTO"
- AUTO None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 AVX2_128 AVX_512 AVX_512_KNL MIC ARM_NEON ARM_NEON_ASIMD ARM_SVE IBM_VMX IBM_VSX Sparc64_HPC_ACE Reference)
+ AUTO None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 AVX2_128 AVX_512 AVX_512_KNL ARM_NEON ARM_NEON_ASIMD ARM_SVE IBM_VMX IBM_VSX Sparc64_HPC_ACE Reference)
-if(GMX_TARGET_MIC)
- set(GMX_FFT_LIBRARY_DEFAULT "mkl")
-else()
- set(GMX_FFT_LIBRARY_DEFAULT "fftw3")
-endif()
+set(GMX_FFT_LIBRARY_DEFAULT "fftw3")
gmx_option_multichoice(
GMX_FFT_LIBRARY
+++ /dev/null
-int main()
-{
-#ifdef __MIC__
- return 0;
-#else
-#error This compiler is not targetting MIC
-#endif
-}
# This file is part of the GROMACS molecular simulation package.
#
# Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
-# Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
# HPC-ACE is always present. In the future we
# should add detection for HPC-ACE2 here.
set(${_suggested_simd} "Sparc64_HPC_ACE")
- elseif(GMX_TARGET_MIC)
- set(${_suggested_simd} "MIC")
else()
gmx_suggest_simd(${_suggested_simd})
endif()
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2013,2014,2016,2018,2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2013,2014,2016,2018,2019,2020, by the GROMACS development team.
+# Copyright (c) 2021, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
try_compile(GMX_TARGET_X86 ${CMAKE_BINARY_DIR}
"${CMAKE_SOURCE_DIR}/cmake/TestX86.cpp")
endif()
- if (NOT DEFINED GMX_TARGET_MIC)
- try_compile(GMX_TARGET_MIC ${CMAKE_BINARY_DIR}
- "${CMAKE_SOURCE_DIR}/cmake/TestMIC.cpp")
- endif()
- if (GMX_TARGET_MIC)
- message(STATUS "The Intel MIC KNC target is deprecated")
- endif()
if (NOT DEFINED GMX_TARGET_FUJITSU_SPARC64)
try_compile(GMX_TARGET_FUJITSU_SPARC64 ${CMAKE_BINARY_DIR}
"${CMAKE_SOURCE_DIR}/cmake/TestFujitsuSparc64.cpp")
# This file is part of the GROMACS molecular simulation package.
#
# Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
-# Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX2 SIMD instructions using CXX flags: ${SIMD_AVX2_CXX_FLAGS}")
endif()
-elseif(GMX_SIMD_ACTIVE STREQUAL "MIC")
- # No flags needed. Not testing.
- set(GMX_SIMD_X86_MIC 1)
- set(SIMD_STATUS_MESSAGE "Enabling MIC (Xeon Phi) SIMD instructions without special flags. This SIMD support is deprecated.")
-
elseif(GMX_SIMD_ACTIVE STREQUAL "AVX_512")
gmx_find_simd_avx_512_flags(SIMD_AVX_512_C_SUPPORTED SIMD_AVX_512_CXX_SUPPORTED
Finally, for some architectures with large or very large SIMD width (e.g. AVX
with 8 elements in single precision, or AVX-512 with 16), the nonbonded
kernels can become inefficient. Since all such architectures presently known
-(AVX, AVX2, MIC, AVX512) also provide extensive support for accessing
+(AVX, AVX2, AVX512) also provide extensive support for accessing
parts of the register, we optionally define a handful of routines to
perform load, store, and reduce operations based on half-SIMD-width data,
which can improve performance. It is only useful for wide implementations,
documentation, installation, and teaching new users.
:issue:`3808`
+
+Removed support for x86 MIC platform
+""""""""""""""""""""""""""""""""""""
+
+This platform is nearly dead and is no longer supported. The KNL
+platform is unaffected by this change.
+
+:issue:`3891`
/* AVX2 128-bit SIMD instruction set level was selected */
#cmakedefine01 GMX_SIMD_X86_AVX2_128
-/* MIC (Xeon Phi) SIMD instruction set level was selected */
-#cmakedefine01 GMX_SIMD_X86_MIC
-
/* AVX-512F foundation level instruction SIMD */
#cmakedefine01 GMX_SIMD_X86_AVX_512
#else
/* older versions of gcc don't support atomic intrinsics */
-#ifndef __MIC__
#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("sfence;" : : : "memory")
-#else
-/* MIC is in-order and does not need nor support sfense */
-#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("" ::: "memory")
-#endif
#define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2010,2014,2015,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2010,2014,2015,2018,2019,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/* Copy pointer */
rbuf = b->rbuf + b->nreal;
-#if (defined __ICC && __ICC >= 1500 || defined __ICL && __ICL >= 1500) && defined __MIC__
-# pragma novector /* Work-around for incorrect vectorization */
-#endif
for (i = 0; (i < nr); i++)
{
rbuf[i] = r[i];
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012-2018, The GROMACS development team.
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
{
nbat->bUseTreeReduce = (strtol(ptr, nullptr, 10) != 0);
}
-#if defined __MIC__
- else if (nth > 8) /*on the CPU we currently don't benefit even at 32*/
- {
- nbat->bUseTreeReduce = 1;
- }
-#endif
else
{
nbat->bUseTreeReduce = false;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/*! \brief The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
* Currently the 2xNN SIMD kernels only make sense with:
* 8-way SIMD: 4x4 setup, works with AVX-256 in single precision
- * 16-way SIMD: 4x8 setup, works with Intel MIC in single precision
+ * 16-way SIMD: 4x8 setup, not currently in use, but worked with Intel MIC
*/
# if GMX_SIMD_REAL_WIDTH == 2 || GMX_SIMD_REAL_WIDTH == 4 || GMX_SIMD_REAL_WIDTH == 8
# define GMX_NBNXN_SIMD_4XN
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_H
-#define GMX_SIMD_IMPL_X86_MIC_H
-
-#include "impl_x86_mic_definitions.h"
-#include "impl_x86_mic_general.h"
-#include "impl_x86_mic_simd4_double.h"
-#include "impl_x86_mic_simd4_float.h"
-#include "impl_x86_mic_simd_double.h"
-#include "impl_x86_mic_simd_float.h"
-#include "impl_x86_mic_util_double.h"
-#include "impl_x86_mic_util_float.h"
-
-#endif // GMX_SIMD_IMPL_X86_MIC_H
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
-#define GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
-
-#define GMX_SIMD 1
-#define GMX_SIMD_HAVE_FLOAT 1
-#define GMX_SIMD_HAVE_DOUBLE 1
-#define GMX_SIMD_HAVE_LOADU 1
-#define GMX_SIMD_HAVE_STOREU 1
-#define GMX_SIMD_HAVE_LOGICAL 1
-#define GMX_SIMD_HAVE_FMA 1
-#define GMX_SIMD_HAVE_FINT32_EXTRACT 1
-#define GMX_SIMD_HAVE_FINT32_LOGICAL 1
-#define GMX_SIMD_HAVE_FINT32_ARITHMETICS 1
-#define GMX_SIMD_HAVE_DINT32_EXTRACT 1
-#define GMX_SIMD_HAVE_DINT32_LOGICAL 1
-#define GMX_SIMD_HAVE_DINT32_ARITHMETICS 1
-#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT 0
-#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT 0
-#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT 0
-#define GMX_SIMD_HAVE_NATIVE_LOG_FLOAT 1
-#define GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT 1
-#define GMX_SIMD_HAVE_NATIVE_EXP_FLOAT 1
-#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE 0
-#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE 0
-#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE 0
-#define GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE 0
-#define GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE 0
-#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE 0
-#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
-#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 1
-
-#define GMX_SIMD4_HAVE_FLOAT 1
-#define GMX_SIMD4_HAVE_DOUBLE 1
-
-// Implementation details
-#define GMX_SIMD_FLOAT_WIDTH 16
-#define GMX_SIMD_DOUBLE_WIDTH 8
-#define GMX_SIMD_FINT32_WIDTH 16
-#define GMX_SIMD_DINT32_WIDTH 8
-#define GMX_SIMD4_WIDTH 4
-#define GMX_SIMD_ALIGNMENT 64 // Bytes (16*single or 8*double)
-#define GMX_SIMD_RSQRT_BITS 23
-#define GMX_SIMD_RCP_BITS 23
-
-#endif // GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2019, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_GENERAL_H
-#define GMX_SIMD_IMPL_X86_MIC_GENERAL_H
-
-#include <immintrin.h>
-
-namespace gmx
-{
-
-static inline void simdPrefetch(const void* m)
-{
- _mm_prefetch((const char*)m, _MM_HINT_T0);
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_OTHER_H
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2017,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
-#define GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
-
-#include "config.h"
-
-#include <cassert>
-
-#include <immintrin.h>
-
-#include "gromacs/utility/basedefinitions.h"
-
-#include "impl_x86_mic_simd_double.h"
-
-namespace gmx
-{
-
-class Simd4Double
-{
-public:
- Simd4Double() {}
-
- Simd4Double(double d) : simdInternal_(_mm512_set1_pd(d)) {}
-
- // Internal utility constructor to simplify return statements
- Simd4Double(__m512d simd) : simdInternal_(simd) {}
-
- __m512d simdInternal_;
-};
-
-class Simd4DBool
-{
-public:
- Simd4DBool() {}
-
- // Internal utility constructor to simplify return statements
- Simd4DBool(__mmask16 simd) : simdInternal_(simd) {}
-
- __mmask16 simdInternal_;
-};
-
-static inline Simd4Double gmx_simdcall load4(const double* m)
-{
- assert(size_t(m) % 32 == 0);
- return { _mm512_mask_extload_pd(
- _mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE) };
-}
-
-static inline void gmx_simdcall store4(double* m, Simd4Double a)
-{
- assert(size_t(m) % 32 == 0);
- _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), a.simdInternal_);
-}
-
-static inline Simd4Double gmx_simdcall load4U(const double* m)
-{
- return { _mm512_mask_loadunpackhi_pd(
- _mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m),
- _mm512_int2mask(0xF),
- m + 8) };
-}
-
-static inline void gmx_simdcall store4U(double* m, Simd4Double a)
-{
- _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), a.simdInternal_);
- _mm512_mask_packstorehi_pd(m + 8, _mm512_int2mask(0xF), a.simdInternal_);
-}
-
-static inline Simd4Double gmx_simdcall simd4SetZeroD()
-{
- return { _mm512_setzero_pd() };
-}
-
-static inline Simd4Double gmx_simdcall operator&(Simd4Double a, Simd4Double b)
-{
- return { _mm512_castsi512_pd(_mm512_mask_and_epi32(_mm512_undefined_epi32(),
- _mm512_int2mask(0x00FF),
- _mm512_castpd_si512(a.simdInternal_),
- _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Double gmx_simdcall andNot(Simd4Double a, Simd4Double b)
-{
- return { _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(),
- _mm512_int2mask(0x00FF),
- _mm512_castpd_si512(a.simdInternal_),
- _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Double gmx_simdcall operator|(Simd4Double a, Simd4Double b)
-{
- return { _mm512_castsi512_pd(_mm512_mask_or_epi32(_mm512_undefined_epi32(),
- _mm512_int2mask(0x00FF),
- _mm512_castpd_si512(a.simdInternal_),
- _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Double gmx_simdcall operator^(Simd4Double a, Simd4Double b)
-{
- return { _mm512_castsi512_pd(_mm512_mask_xor_epi32(_mm512_undefined_epi32(),
- _mm512_int2mask(0x00FF),
- _mm512_castpd_si512(a.simdInternal_),
- _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Double gmx_simdcall operator+(Simd4Double a, Simd4Double b)
-{
- return { _mm512_mask_add_pd(
- _mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall operator-(Simd4Double a, Simd4Double b)
-{
- return { _mm512_mask_sub_pd(
- _mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall operator-(Simd4Double x)
-{
- return { _mm512_mask_addn_pd(
- _mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _mm512_setzero_pd()) };
-}
-
-static inline Simd4Double gmx_simdcall operator*(Simd4Double a, Simd4Double b)
-{
- return { _mm512_mask_mul_pd(
- _mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall fma(Simd4Double a, Simd4Double b, Simd4Double c)
-{
- return { _mm512_mask_fmadd_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall fms(Simd4Double a, Simd4Double b, Simd4Double c)
-{
- return { _mm512_mask_fmsub_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall fnma(Simd4Double a, Simd4Double b, Simd4Double c)
-{
- return { _mm512_mask_fnmadd_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall fnms(Simd4Double a, Simd4Double b, Simd4Double c)
-{
- return { _mm512_mask_fnmsub_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall rsqrt(Simd4Double x)
-{
- return { _mm512_mask_cvtpslo_pd(
- _mm512_undefined_pd(),
- _mm512_int2mask(0xF),
- _mm512_mask_rsqrt23_ps(
- _mm512_undefined_ps(),
- _mm512_int2mask(0xF),
- _mm512_mask_cvtpd_pslo(_mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_))) };
-}
-
-static inline Simd4Double gmx_simdcall abs(Simd4Double x)
-{
- return { _mm512_castsi512_pd(
- _mm512_mask_andnot_epi32(_mm512_undefined_epi32(),
- _mm512_int2mask(0x00FF),
- _mm512_castpd_si512(_mm512_set1_pd(GMX_DOUBLE_NEGZERO)),
- _mm512_castpd_si512(x.simdInternal_)))
-
- };
-}
-
-static inline Simd4Double gmx_simdcall max(Simd4Double a, Simd4Double b)
-{
- return { _mm512_mask_gmax_pd(
- _mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall min(Simd4Double a, Simd4Double b)
-{
- return { _mm512_mask_gmin_pd(
- _mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall round(Simd4Double x)
-{
- return { _mm512_mask_roundfxpnt_adjust_pd(
- _mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline Simd4Double gmx_simdcall trunc(Simd4Double x)
-{
- return { _mm512_mask_roundfxpnt_adjust_pd(
- _mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
-}
-
-static inline double gmx_simdcall dotProduct(Simd4Double a, Simd4Double b)
-{
- return _mm512_mask_reduce_add_pd(
- _mm512_int2mask(7),
- _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(7), a.simdInternal_, b.simdInternal_));
-}
-
-static inline void gmx_simdcall transpose(Simd4Double* v0, Simd4Double* v1, Simd4Double* v2, Simd4Double* v3)
-{
- __m512i t0 = _mm512_mask_permute4f128_epi32(_mm512_castpd_si512(v0->simdInternal_),
- 0xFF00,
- _mm512_castpd_si512(v1->simdInternal_),
- _MM_PERM_BABA);
- __m512i t1 = _mm512_mask_permute4f128_epi32(_mm512_castpd_si512(v2->simdInternal_),
- 0xFF00,
- _mm512_castpd_si512(v3->simdInternal_),
- _MM_PERM_BABA);
-
- t0 = _mm512_permutevar_epi32(
- _mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), t0);
- t1 = _mm512_permutevar_epi32(
- _mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), t1);
-
- v0->simdInternal_ = _mm512_mask_swizzle_pd(
- _mm512_castsi512_pd(t0), _mm512_int2mask(0xCC), _mm512_castsi512_pd(t1), _MM_SWIZ_REG_BADC);
- v1->simdInternal_ = _mm512_mask_swizzle_pd(
- _mm512_castsi512_pd(t1), _mm512_int2mask(0x33), _mm512_castsi512_pd(t0), _MM_SWIZ_REG_BADC);
-
- v2->simdInternal_ =
- _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(v0->simdInternal_), _MM_PERM_DCDC));
- v3->simdInternal_ =
- _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(v1->simdInternal_), _MM_PERM_DCDC));
-}
-
-// Picky, picky, picky:
-// icc-16 complains about "Illegal value of immediate argument to intrinsic"
-// unless we use
-// 1) Ordered-quiet for ==
-// 2) Unordered-quiet for !=
-// 3) Ordered-signaling for < and <=
-
-static inline Simd4DBool gmx_simdcall operator==(Simd4Double a, Simd4Double b)
-{
- return { _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ) };
-}
-
-static inline Simd4DBool gmx_simdcall operator!=(Simd4Double a, Simd4Double b)
-{
- return { _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ) };
-}
-
-static inline Simd4DBool gmx_simdcall operator<(Simd4Double a, Simd4Double b)
-{
- return { _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LT_OS) };
-}
-
-static inline Simd4DBool gmx_simdcall operator<=(Simd4Double a, Simd4Double b)
-{
- return { _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LE_OS) };
-}
-
-static inline Simd4DBool gmx_simdcall operator&&(Simd4DBool a, Simd4DBool b)
-{
- return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4DBool gmx_simdcall operator||(Simd4DBool a, Simd4DBool b)
-{
- return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline bool gmx_simdcall anyTrue(Simd4DBool a)
-{
- return (_mm512_mask2int(a.simdInternal_) & 0xF) != 0;
-}
-
-static inline Simd4Double gmx_simdcall selectByMask(Simd4Double a, Simd4DBool m)
-{
- return { _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall selectByNotMask(Simd4Double a, Simd4DBool m)
-{
- return { _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(m.simdInternal_), a.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall blend(Simd4Double a, Simd4Double b, Simd4DBool sel)
-{
- return { _mm512_mask_blend_pd(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline double gmx_simdcall reduce(Simd4Double a)
-{
- return _mm512_mask_reduce_add_pd(_mm512_int2mask(0xF), a.simdInternal_);
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
-#define GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
-
-#include "config.h"
-
-#include <cassert>
-
-#include <immintrin.h>
-
-#include "gromacs/utility/basedefinitions.h"
-
-#include "impl_x86_mic_simd_float.h"
-
-namespace gmx
-{
-
-class Simd4Float
-{
-public:
- Simd4Float() {}
-
- Simd4Float(float f) : simdInternal_(_mm512_set1_ps(f)) {}
-
- // Internal utility constructor to simplify return statements
- Simd4Float(__m512 simd) : simdInternal_(simd) {}
-
- __m512 simdInternal_;
-};
-
-class Simd4FBool
-{
-public:
- Simd4FBool() {}
-
- // Internal utility constructor to simplify return statements
- Simd4FBool(__mmask16 simd) : simdInternal_(simd) {}
-
- __mmask16 simdInternal_;
-};
-
-static inline Simd4Float gmx_simdcall load4(const float* m)
-{
- assert(size_t(m) % 16 == 0);
- return { _mm512_mask_extload_ps(
- _mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE) };
-}
-
-static inline void gmx_simdcall store4(float* m, Simd4Float a)
-{
- assert(size_t(m) % 16 == 0);
- _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), a.simdInternal_);
-}
-
-static inline Simd4Float gmx_simdcall load4U(const float* m)
-{
- return { _mm512_mask_loadunpackhi_ps(
- _mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m),
- _mm512_int2mask(0xF),
- m + 16) };
-}
-
-static inline void gmx_simdcall store4U(float* m, Simd4Float a)
-{
- _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), a.simdInternal_);
- _mm512_mask_packstorehi_ps(m + 16, _mm512_int2mask(0xF), a.simdInternal_);
-}
-
-static inline Simd4Float gmx_simdcall simd4SetZeroF()
-{
- return { _mm512_setzero_ps() };
-}
-
-static inline Simd4Float gmx_simdcall operator&(Simd4Float a, Simd4Float b)
-{
- return { _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(),
- _mm512_int2mask(0xF),
- _mm512_castps_si512(a.simdInternal_),
- _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Float gmx_simdcall andNot(Simd4Float a, Simd4Float b)
-{
- return { _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(),
- _mm512_int2mask(0xF),
- _mm512_castps_si512(a.simdInternal_),
- _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Float gmx_simdcall operator|(Simd4Float a, Simd4Float b)
-{
- return { _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(),
- _mm512_int2mask(0xF),
- _mm512_castps_si512(a.simdInternal_),
- _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Float gmx_simdcall operator^(Simd4Float a, Simd4Float b)
-{
- return { _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(),
- _mm512_int2mask(0xF),
- _mm512_castps_si512(a.simdInternal_),
- _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Float gmx_simdcall operator+(Simd4Float a, Simd4Float b)
-{
- return { _mm512_mask_add_ps(
- _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall operator-(Simd4Float a, Simd4Float b)
-{
- return { _mm512_mask_sub_ps(
- _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall operator-(Simd4Float x)
-{
- return { _mm512_mask_addn_ps(
- _mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_, _mm512_setzero_ps()) };
-}
-
-static inline Simd4Float gmx_simdcall operator*(Simd4Float a, Simd4Float b)
-{
- return { _mm512_mask_mul_ps(
- _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall fma(Simd4Float a, Simd4Float b, Simd4Float c)
-{
- return { _mm512_mask_fmadd_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall fms(Simd4Float a, Simd4Float b, Simd4Float c)
-{
- return { _mm512_mask_fmsub_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall fnma(Simd4Float a, Simd4Float b, Simd4Float c)
-{
- return { _mm512_mask_fnmadd_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall fnms(Simd4Float a, Simd4Float b, Simd4Float c)
-{
- return { _mm512_mask_fnmsub_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall rsqrt(Simd4Float x)
-{
- return { _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall abs(Simd4Float x)
-{
- return { _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(),
- _mm512_int2mask(0xF),
- _mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)),
- _mm512_castps_si512(x.simdInternal_))) };
-}
-
-static inline Simd4Float gmx_simdcall max(Simd4Float a, Simd4Float b)
-{
- return { _mm512_mask_gmax_ps(
- _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall min(Simd4Float a, Simd4Float b)
-{
- return { _mm512_mask_gmin_ps(
- _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall round(Simd4Float x)
-{
- return { _mm512_mask_round_ps(
- _mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline Simd4Float gmx_simdcall trunc(Simd4Float x)
-{
- return { _mm512_mask_round_ps(
- _mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
-}
-
-static inline float gmx_simdcall dotProduct(Simd4Float a, Simd4Float b)
-{
- __m512 x = _mm512_mask_mul_ps(
- _mm512_setzero_ps(), _mm512_int2mask(0x7), a.simdInternal_, b.simdInternal_);
- x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_BADC));
- x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
- float f;
- _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), x);
- return f;
-}
-
-static inline void gmx_simdcall transpose(Simd4Float* v0, Simd4Float* v1, Simd4Float* v2, Simd4Float* v3)
-{
- v0->simdInternal_ = _mm512_mask_permute4f128_ps(
- v0->simdInternal_, _mm512_int2mask(0x00F0), v1->simdInternal_, _MM_PERM_AAAA);
- v2->simdInternal_ = _mm512_mask_permute4f128_ps(
- v2->simdInternal_, _mm512_int2mask(0x00F0), v3->simdInternal_, _MM_PERM_AAAA);
- v0->simdInternal_ = _mm512_mask_permute4f128_ps(
- v0->simdInternal_, _mm512_int2mask(0xFF00), v2->simdInternal_, _MM_PERM_BABA);
- v0->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(
- _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0),
- _mm512_castps_si512(v0->simdInternal_)));
- v1->simdInternal_ = _mm512_mask_permute4f128_ps(
- _mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_BBBB);
- v2->simdInternal_ = _mm512_mask_permute4f128_ps(
- _mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_CCCC);
- v3->simdInternal_ = _mm512_mask_permute4f128_ps(
- _mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_DDDD);
-}
-
-// Picky, picky, picky:
-// icc-16 complains about "Illegal value of immediate argument to intrinsic"
-// unless we use
-// 1) Ordered-quiet for ==
-// 2) Unordered-quiet for !=
-// 3) Ordered-signaling for < and <=
-
-static inline Simd4FBool gmx_simdcall operator==(Simd4Float a, Simd4Float b)
-{
- return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ) };
-}
-
-static inline Simd4FBool gmx_simdcall operator!=(Simd4Float a, Simd4Float b)
-{
- return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ) };
-}
-
-static inline Simd4FBool gmx_simdcall operator<(Simd4Float a, Simd4Float b)
-{
- return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LT_OS) };
-}
-
-static inline Simd4FBool gmx_simdcall operator<=(Simd4Float a, Simd4Float b)
-{
- return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LE_OS) };
-}
-
-static inline Simd4FBool gmx_simdcall operator&&(Simd4FBool a, Simd4FBool b)
-{
- return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4FBool gmx_simdcall operator||(Simd4FBool a, Simd4FBool b)
-{
- return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline bool gmx_simdcall anyTrue(Simd4FBool a)
-{
- return (_mm512_mask2int(a.simdInternal_) & 0xF) != 0;
-}
-
-static inline Simd4Float gmx_simdcall selectByMask(Simd4Float a, Simd4FBool m)
-{
- return { _mm512_mask_mov_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall selectByNotMask(Simd4Float a, Simd4FBool m)
-{
- return { _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(m.simdInternal_), a.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
-{
- return { _mm512_mask_blend_ps(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline float gmx_simdcall reduce(Simd4Float a)
-{
- __m512 x = a.simdInternal_;
- x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_BADC));
- x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
- float f;
- _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), x);
- return f;
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2016,2017,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
-#define GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
-
-#include "config.h"
-
-#include <cassert>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "gromacs/math/utilities.h"
-#include "gromacs/utility/basedefinitions.h"
-
-#include "impl_x86_mic_simd_float.h"
-
-namespace gmx
-{
-
-class SimdDouble
-{
-public:
- SimdDouble() {}
-
- SimdDouble(double d) : simdInternal_(_mm512_set1_pd(d)) {}
-
- // Internal utility constructor to simplify return statements
- SimdDouble(__m512d simd) : simdInternal_(simd) {}
-
- __m512d simdInternal_;
-};
-
-class SimdDInt32
-{
-public:
- SimdDInt32() {}
-
- SimdDInt32(std::int32_t i) : simdInternal_(_mm512_set1_epi32(i)) {}
-
- // Internal utility constructor to simplify return statements
- SimdDInt32(__m512i simd) : simdInternal_(simd) {}
-
- __m512i simdInternal_;
-};
-
-class SimdDBool
-{
-public:
- SimdDBool() {}
-
- // Internal utility constructor to simplify return statements
- SimdDBool(__mmask8 simd) : simdInternal_(simd) {}
-
- __mmask8 simdInternal_;
-};
-
-class SimdDIBool
-{
-public:
- SimdDIBool() {}
-
- // Internal utility constructor to simplify return statements
- SimdDIBool(__mmask16 simd) : simdInternal_(simd) {}
-
- __mmask16 simdInternal_;
-};
-
-static inline SimdDouble gmx_simdcall simdLoad(const double* m, SimdDoubleTag = {})
-{
- assert(std::size_t(m) % 64 == 0);
- return { _mm512_load_pd(m) };
-}
-
-static inline void gmx_simdcall store(double* m, SimdDouble a)
-{
- assert(std::size_t(m) % 64 == 0);
- _mm512_store_pd(m, a.simdInternal_);
-}
-
-static inline SimdDouble gmx_simdcall simdLoadU(const double* m, SimdDoubleTag = {})
-{
- return { _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m + 8) };
-}
-
-static inline void gmx_simdcall storeU(double* m, SimdDouble a)
-{
- _mm512_packstorelo_pd(m, a.simdInternal_);
- _mm512_packstorehi_pd(m + 8, a.simdInternal_);
-}
-
-static inline SimdDouble gmx_simdcall setZeroD()
-{
- return { _mm512_setzero_pd() };
-}
-
-static inline SimdDInt32 gmx_simdcall simdLoad(const std::int32_t* m, SimdDInt32Tag)
-{
- assert(std::size_t(m) % 32 == 0);
- return { _mm512_extload_epi64(m, _MM_UPCONV_EPI64_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE) };
-}
-
-static inline void gmx_simdcall store(std::int32_t* m, SimdDInt32 a)
-{
- assert(std::size_t(m) % 32 == 0);
- _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
-}
-
-static inline SimdDInt32 gmx_simdcall simdLoadU(const std::int32_t* m, SimdDInt32Tag)
-{
- return { _mm512_mask_loadunpackhi_epi32(
- _mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), m),
- _mm512_int2mask(0x00FF),
- m + 16) };
-}
-
-static inline void gmx_simdcall storeU(std::int32_t* m, SimdDInt32 a)
-{
- _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
- _mm512_mask_packstorehi_epi32(m + 16, _mm512_int2mask(0x00FF), a.simdInternal_);
-}
-
-static inline SimdDInt32 gmx_simdcall setZeroDI()
-{
- return { _mm512_setzero_epi32() };
-}
-
-template<int index>
-static inline std::int32_t gmx_simdcall extract(SimdDInt32 a)
-{
- int r;
- _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1 << index), a.simdInternal_);
- return r;
-}
-
-static inline SimdDouble gmx_simdcall operator&(SimdDouble a, SimdDouble b)
-{
- return { _mm512_castsi512_pd(_mm512_and_epi32(_mm512_castpd_si512(a.simdInternal_),
- _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall andNot(SimdDouble a, SimdDouble b)
-{
- return { _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(a.simdInternal_),
- _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall operator|(SimdDouble a, SimdDouble b)
-{
- return { _mm512_castsi512_pd(_mm512_or_epi32(_mm512_castpd_si512(a.simdInternal_),
- _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall operator^(SimdDouble a, SimdDouble b)
-{
- return { _mm512_castsi512_pd(_mm512_xor_epi32(_mm512_castpd_si512(a.simdInternal_),
- _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall operator+(SimdDouble a, SimdDouble b)
-{
- return { _mm512_add_pd(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall operator-(SimdDouble a, SimdDouble b)
-{
- return { _mm512_sub_pd(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall operator-(SimdDouble x)
-{
- return { _mm512_addn_pd(x.simdInternal_, _mm512_setzero_pd()) };
-}
-
-static inline SimdDouble gmx_simdcall operator*(SimdDouble a, SimdDouble b)
-{
- return { _mm512_mul_pd(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall fma(SimdDouble a, SimdDouble b, SimdDouble c)
-{
- return { _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall fms(SimdDouble a, SimdDouble b, SimdDouble c)
-{
- return { _mm512_fmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall fnma(SimdDouble a, SimdDouble b, SimdDouble c)
-{
- return { _mm512_fnmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall fnms(SimdDouble a, SimdDouble b, SimdDouble c)
-{
- return { _mm512_fnmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall rsqrt(SimdDouble x)
-{
- return { _mm512_cvtpslo_pd(_mm512_rsqrt23_ps(_mm512_cvtpd_pslo(x.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall rcp(SimdDouble x)
-{
- return { _mm512_cvtpslo_pd(_mm512_rcp23_ps(_mm512_cvtpd_pslo(x.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
-{
- return { _mm512_mask_add_pd(a.simdInternal_, m.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
-{
- return { _mm512_mask_mul_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
-{
- return { _mm512_mask_mov_pd(_mm512_setzero_pd(),
- m.simdInternal_,
- _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)) };
-}
-
-static inline SimdDouble gmx_simdcall maskzRsqrt(SimdDouble x, SimdDBool m)
-{
- return { _mm512_cvtpslo_pd(_mm512_mask_rsqrt23_ps(
- _mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall maskzRcp(SimdDouble x, SimdDBool m)
-{
- return { _mm512_cvtpslo_pd(_mm512_mask_rcp23_ps(
- _mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall abs(SimdDouble x)
-{
- return { _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(_mm512_set1_pd(GMX_DOUBLE_NEGZERO)),
- _mm512_castpd_si512(x.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall max(SimdDouble a, SimdDouble b)
-{
- return { _mm512_gmax_pd(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall min(SimdDouble a, SimdDouble b)
-{
- return { _mm512_gmin_pd(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall round(SimdDouble x)
-{
- return { _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline SimdDouble gmx_simdcall trunc(SimdDouble x)
-{
- return { _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
-}
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdDouble frexp(SimdDouble value, SimdDInt32* exponent)
-{
- __m512d rExponent;
- __m512i iExponent;
- __m512d result;
-
- if (opt == MathOptimization::Safe)
- {
- // For the safe branch, we use the masked operations to only assign results if the
- // input value was nonzero, and otherwise set exponent to 0, and the fraction to the input (+-0).
- __mmask8 valueIsNonZero =
- _mm512_cmp_pd_mask(_mm512_setzero_pd(), value.simdInternal_, _CMP_NEQ_OQ);
- rExponent = _mm512_mask_getexp_pd(_mm512_setzero_pd(), valueIsNonZero, value.simdInternal_);
-
- // Create an integer -1 value, and use masking in the conversion as the result for
- // zero-value input. When we later add 1 to all fields, the fields that were formerly -1
- // (corresponding to zero exponent) will be assigned -1 + 1 = 0.
- iExponent = _mm512_mask_cvtfxpnt_roundpd_epi32lo(
- _mm512_set_epi32(-1), valueIsNonZero, rExponent, _MM_FROUND_TO_NEAREST_INT);
- iExponent = _mm512__add_epi32(iExponent, _mm512_set1_epi32(1));
-
- // Set result to value (+-0) when it is zero.
- result = _mm512_mask_getmant_pd(
- value.simdInternal_, valueIsNonZero, value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
- }
- else
- {
- rExponent = _mm512_getexp_pd(value.simdInternal_);
- iExponent = _mm512_cvtfxpnt_roundpd_epi32lo(rExponent, _MM_FROUND_TO_NEAREST_INT);
- iExponent = _mm512_add_epi32(iExponent, _mm512_set1_epi32(1));
- result = _mm512_getmant_pd(value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
- }
-
- exponent->simdInternal_ = iExponent;
-
- return { result };
-}
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdDouble ldexp(SimdDouble value, SimdDInt32 exponent)
-{
- const __m512i exponentBias = _mm512_set1_epi32(1023);
- __m512i iExponent = _mm512_add_epi32(exponent.simdInternal_, exponentBias);
-
- if (opt == MathOptimization::Safe)
- {
- // Make sure biased argument is not negative
- iExponent = _mm512_max_epi32(iExponent, _mm512_setzero_epi32());
- }
-
- iExponent = _mm512_permutevar_epi32(
- _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), iExponent);
- iExponent = _mm512_mask_slli_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0xAAAA), iExponent, 20);
- return _mm512_mul_pd(_mm512_castsi512_pd(iExponent), value.simdInternal_);
-}
-
-static inline double gmx_simdcall reduce(SimdDouble a)
-{
- return _mm512_reduce_add_pd(a.simdInternal_);
-}
-
-// Picky, picky, picky:
-// icc-16 complains about "Illegal value of immediate argument to intrinsic"
-// unless we use
-// 1) Ordered-quiet for ==
-// 2) Unordered-quiet for !=
-// 3) Ordered-signaling for < and <=
-
-static inline SimdDBool gmx_simdcall operator==(SimdDouble a, SimdDouble b)
-{
- return { _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ) };
-}
-
-static inline SimdDBool gmx_simdcall operator!=(SimdDouble a, SimdDouble b)
-{
- return { _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ) };
-}
-
-static inline SimdDBool gmx_simdcall operator<(SimdDouble a, SimdDouble b)
-{
- return { _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LT_OS) };
-}
-
-static inline SimdDBool gmx_simdcall operator<=(SimdDouble a, SimdDouble b)
-{
- return { _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LE_OS) };
-}
-
-static inline SimdDBool gmx_simdcall testBits(SimdDouble a)
-{
- // This is a bit problematic since Knight's corner does not have any 64-bit integer comparisons,
- // and we cannot use floating-point since values with just a single bit set can evaluate to 0.0.
- // Instead, we do it as
- // 1) Do a logical or of the high/low 32 bits
- // 2) Do a permute so we have the low 32 bits of each value in the low 8 32-bit elements
- // 3) Do an integer comparison, and cast so we just keep the low 8 bits of the mask.
- //
- // By default we will use integers for the masks in the nonbonded kernels, so this shouldn't
- // have any significant performance drawbacks.
-
- __m512i ia = _mm512_castpd_si512(a.simdInternal_);
-
- ia = _mm512_or_epi32(ia, _mm512_swizzle_epi32(ia, _MM_SWIZ_REG_CDAB));
- ia = _mm512_permutevar_epi32(
- _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), ia);
-
- return { static_cast<__mmask8>(_mm512_cmp_epi32_mask(ia, _mm512_setzero_si512(), _MM_CMPINT_NE)) };
-}
-
-static inline SimdDBool gmx_simdcall operator&&(SimdDBool a, SimdDBool b)
-{
- return { static_cast<__mmask8>(_mm512_kand(a.simdInternal_, b.simdInternal_)) };
-}
-
-static inline SimdDBool gmx_simdcall operator||(SimdDBool a, SimdDBool b)
-{
- return { static_cast<__mmask8>(_mm512_kor(a.simdInternal_, b.simdInternal_)) };
-}
-
-static inline bool gmx_simdcall anyTrue(SimdDBool a)
-{
- return _mm512_mask2int(a.simdInternal_) != 0;
-}
-
-static inline SimdDouble gmx_simdcall selectByMask(SimdDouble a, SimdDBool m)
-{
- return { _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall selectByNotMask(SimdDouble a, SimdDBool m)
-{
- return { _mm512_mask_mov_pd(a.simdInternal_, m.simdInternal_, _mm512_setzero_pd()) };
-}
-
-static inline SimdDouble gmx_simdcall blend(SimdDouble a, SimdDouble b, SimdDBool sel)
-{
- return { _mm512_mask_blend_pd(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator&(SimdDInt32 a, SimdDInt32 b)
-{
- return { _mm512_and_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall andNot(SimdDInt32 a, SimdDInt32 b)
-{
- return { _mm512_andnot_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator|(SimdDInt32 a, SimdDInt32 b)
-{
- return { _mm512_or_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator^(SimdDInt32 a, SimdDInt32 b)
-{
- return { _mm512_xor_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator+(SimdDInt32 a, SimdDInt32 b)
-{
- return { _mm512_add_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator-(SimdDInt32 a, SimdDInt32 b)
-{
- return { _mm512_sub_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator*(SimdDInt32 a, SimdDInt32 b)
-{
- return { _mm512_mullo_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDIBool gmx_simdcall operator==(SimdDInt32 a, SimdDInt32 b)
-{
- return { _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_EQ) };
-}
-
-static inline SimdDIBool gmx_simdcall testBits(SimdDInt32 a)
-{
- return { _mm512_cmp_epi32_mask(a.simdInternal_, _mm512_setzero_si512(), _MM_CMPINT_NE) };
-}
-
-static inline SimdDIBool gmx_simdcall operator<(SimdDInt32 a, SimdDInt32 b)
-{
- return { _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_LT) };
-}
-
-static inline SimdDIBool gmx_simdcall operator&&(SimdDIBool a, SimdDIBool b)
-{
- return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDIBool gmx_simdcall operator||(SimdDIBool a, SimdDIBool b)
-{
- return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline bool gmx_simdcall anyTrue(SimdDIBool a)
-{
- return (_mm512_mask2int(a.simdInternal_) & 0xFF) != 0;
-}
-
-static inline SimdDInt32 gmx_simdcall selectByMask(SimdDInt32 a, SimdDIBool m)
-{
- return { _mm512_mask_mov_epi32(_mm512_setzero_epi32(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall selectByNotMask(SimdDInt32 a, SimdDIBool m)
-{
- return { _mm512_mask_mov_epi32(a.simdInternal_, m.simdInternal_, _mm512_setzero_epi32()) };
-}
-
-static inline SimdDInt32 gmx_simdcall blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
-{
- return { _mm512_mask_blend_epi32(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall cvtR2I(SimdDouble a)
-{
- return { _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_NEAREST_INT) };
-}
-
-static inline SimdDInt32 gmx_simdcall cvttR2I(SimdDouble a)
-{
- return { _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_ZERO) };
-}
-
-static inline SimdDouble gmx_simdcall cvtI2R(SimdDInt32 a)
-{
- return { _mm512_cvtepi32lo_pd(a.simdInternal_) };
-}
-
-static inline SimdDIBool gmx_simdcall cvtB2IB(SimdDBool a)
-{
- return { a.simdInternal_ };
-}
-
-static inline SimdDBool gmx_simdcall cvtIB2B(SimdDIBool a)
-{
- return { static_cast<__mmask8>(a.simdInternal_) };
-}
-
-static inline void gmx_simdcall cvtF2DD(SimdFloat f, SimdDouble* d0, SimdDouble* d1)
-{
- __m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f.simdInternal_), _MM_PERM_DCDC);
-
- *d0 = _mm512_cvtpslo_pd(f.simdInternal_);
- *d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
-}
-
-static inline SimdFloat gmx_simdcall cvtDD2F(SimdDouble d0, SimdDouble d1)
-{
- __m512 f0 = _mm512_cvtpd_pslo(d0.simdInternal_);
- __m512 f1 = _mm512_cvtpd_pslo(d1.simdInternal_);
- return { _mm512_mask_permute4f128_ps(f0, _mm512_int2mask(0xFF00), f1, _MM_PERM_BABA) };
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2016,2017,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
-#define GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
-
-#include "config.h"
-
-#include <cassert>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "gromacs/math/utilities.h"
-
-namespace gmx
-{
-
-class SimdFloat
-{
-public:
- SimdFloat() {}
-
- SimdFloat(float f) : simdInternal_(_mm512_set1_ps(f)) {}
-
- // Internal utility constructor to simplify return statements
- SimdFloat(__m512 simd) : simdInternal_(simd) {}
-
- __m512 simdInternal_;
-};
-
-class SimdFInt32
-{
-public:
- SimdFInt32() {}
-
- SimdFInt32(std::int32_t i) : simdInternal_(_mm512_set1_epi32(i)) {}
-
- // Internal utility constructor to simplify return statements
- SimdFInt32(__m512i simd) : simdInternal_(simd) {}
-
- __m512i simdInternal_;
-};
-
-class SimdFBool
-{
-public:
- SimdFBool() {}
-
- SimdFBool(bool b) : simdInternal_(_mm512_int2mask(b ? 0xFFFF : 0)) {}
-
- // Internal utility constructor to simplify return statements
- SimdFBool(__mmask16 simd) : simdInternal_(simd) {}
-
- __mmask16 simdInternal_;
-};
-
-class SimdFIBool
-{
-public:
- SimdFIBool() {}
-
- SimdFIBool(bool b) : simdInternal_(_mm512_int2mask(b ? 0xFFFF : 0)) {}
-
- // Internal utility constructor to simplify return statements
- SimdFIBool(__mmask16 simd) : simdInternal_(simd) {}
-
- __mmask16 simdInternal_;
-};
-
-static inline SimdFloat gmx_simdcall simdLoad(const float* m, SimdFloatTag = {})
-{
- assert(std::size_t(m) % 64 == 0);
- return { _mm512_load_ps(m) };
-}
-
-static inline void gmx_simdcall store(float* m, SimdFloat a)
-{
- assert(std::size_t(m) % 64 == 0);
- _mm512_store_ps(m, a.simdInternal_);
-}
-
-static inline SimdFloat gmx_simdcall simdLoadU(const float* m, SimdFloatTag = {})
-{
- return { _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m + 16) };
-}
-
-static inline void gmx_simdcall storeU(float* m, SimdFloat a)
-{
- _mm512_packstorelo_ps(m, a.simdInternal_);
- _mm512_packstorehi_ps(m + 16, a.simdInternal_);
-}
-
-static inline SimdFloat gmx_simdcall setZeroF()
-{
- return { _mm512_setzero_ps() };
-}
-
-static inline SimdFInt32 gmx_simdcall simdLoad(const std::int32_t* m, SimdFInt32Tag)
-{
- assert(std::size_t(m) % 64 == 0);
- return { _mm512_load_epi32(m) };
-}
-
-static inline void gmx_simdcall store(std::int32_t* m, SimdFInt32 a)
-{
- assert(std::size_t(m) % 64 == 0);
- _mm512_store_epi32(m, a.simdInternal_);
-}
-
-static inline SimdFInt32 gmx_simdcall simdLoadU(const std::int32_t* m, SimdFInt32Tag)
-{
- return { _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m + 16) };
-}
-
-static inline void gmx_simdcall storeU(std::int32_t* m, SimdFInt32 a)
-{
- _mm512_packstorelo_epi32(m, a.simdInternal_);
- _mm512_packstorehi_epi32(m + 16, a.simdInternal_);
-}
-
-static inline SimdFInt32 gmx_simdcall setZeroFI()
-{
- return { _mm512_setzero_si512() };
-}
-
-
-template<int index>
-static inline std::int32_t gmx_simdcall extract(SimdFInt32 a)
-{
- int r;
- _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1 << index), a.simdInternal_);
- return r;
-}
-
-static inline SimdFloat gmx_simdcall operator&(SimdFloat a, SimdFloat b)
-{
- return { _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a.simdInternal_),
- _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline SimdFloat gmx_simdcall andNot(SimdFloat a, SimdFloat b)
-{
- return { _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a.simdInternal_),
- _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline SimdFloat gmx_simdcall operator|(SimdFloat a, SimdFloat b)
-{
- return { _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a.simdInternal_),
- _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline SimdFloat gmx_simdcall operator^(SimdFloat a, SimdFloat b)
-{
- return { _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a.simdInternal_),
- _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline SimdFloat gmx_simdcall operator+(SimdFloat a, SimdFloat b)
-{
- return { _mm512_add_ps(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall operator-(SimdFloat a, SimdFloat b)
-{
- return { _mm512_sub_ps(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall operator-(SimdFloat x)
-{
- return { _mm512_addn_ps(x.simdInternal_, _mm512_setzero_ps()) };
-}
-
-static inline SimdFloat gmx_simdcall operator*(SimdFloat a, SimdFloat b)
-{
- return { _mm512_mul_ps(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall fma(SimdFloat a, SimdFloat b, SimdFloat c)
-{
- return { _mm512_fmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall fms(SimdFloat a, SimdFloat b, SimdFloat c)
-{
- return { _mm512_fmsub_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall fnma(SimdFloat a, SimdFloat b, SimdFloat c)
-{
- return { _mm512_fnmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall fnms(SimdFloat a, SimdFloat b, SimdFloat c)
-{
- return { _mm512_fnmsub_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall rsqrt(SimdFloat x)
-{
- return { _mm512_rsqrt23_ps(x.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall rcp(SimdFloat x)
-{
- return { _mm512_rcp23_ps(x.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall maskAdd(SimdFloat a, SimdFloat b, SimdFBool m)
-{
- return { _mm512_mask_add_ps(a.simdInternal_, m.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall maskzMul(SimdFloat a, SimdFloat b, SimdFBool m)
-{
- return { _mm512_mask_mul_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall maskzFma(SimdFloat a, SimdFloat b, SimdFloat c, SimdFBool m)
-{
- return { _mm512_mask_mov_ps(_mm512_setzero_ps(),
- m.simdInternal_,
- _mm512_fmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_)) };
-}
-
-static inline SimdFloat gmx_simdcall maskzRsqrt(SimdFloat x, SimdFBool m)
-{
- return { _mm512_mask_rsqrt23_ps(_mm512_setzero_ps(), m.simdInternal_, x.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall maskzRcp(SimdFloat x, SimdFBool m)
-{
- return { _mm512_mask_rcp23_ps(_mm512_setzero_ps(), m.simdInternal_, x.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall abs(SimdFloat x)
-{
- return { _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)),
- _mm512_castps_si512(x.simdInternal_))) };
-}
-
-static inline SimdFloat gmx_simdcall max(SimdFloat a, SimdFloat b)
-{
- return { _mm512_gmax_ps(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall min(SimdFloat a, SimdFloat b)
-{
- return { _mm512_gmin_ps(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall round(SimdFloat x)
-{
- return { _mm512_round_ps(x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline SimdFloat gmx_simdcall trunc(SimdFloat x)
-{
- return { _mm512_round_ps(x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
-}
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdFloat gmx_simdcall frexp(SimdFloat value, SimdFInt32* exponent)
-{
- __m512 rExponent;
- __m512i iExponent;
- __m512 result;
-
- if (opt == MathOptimization::Safe)
- {
- // For the safe branch, we use the masked operations to only assign results if the
- // input value was nonzero, and otherwise set exponent to 0, and the fraction to the input (+-0).
- __mmask16 valueIsNonZero =
- _mm512_cmp_ps_mask(_mm512_setzero_ps(), value.simdInternal_, _CMP_NEQ_OQ);
- rExponent = _mm512_mask_getexp_ps(_mm512_setzero_ps(), valueIsNonZero, value.simdInternal_);
- iExponent = _mm512_cvtfxpnt_round_adjustps_epi32(
- rExponent, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
- iExponent = _mm512_mask_add_epi32(iExponent, valueIsNonZero, iExponent, _mm512_set1_epi32(1));
-
- // Set result to input value when the latter is +-0
- result = _mm512_mask_getmant_ps(
- value.simdInternal_, valueIsNonZero, value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
- }
- else
- {
- rExponent = _mm512_getexp_ps(value.simdInternal_);
- iExponent = _mm512_cvtfxpnt_round_adjustps_epi32(
- rExponent, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
- iExponent = _mm512_add_epi32(iExponent, _mm512_set1_epi32(1));
- result = _mm512_getmant_ps(value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
- }
-
- exponent->simdInternal_ = iExponent;
-
- return { result };
-}
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdFloat gmx_simdcall ldexp(SimdFloat value, SimdFInt32 exponent)
-{
- const __m512i exponentBias = _mm512_set1_epi32(127);
- __m512i iExponent = _mm512_add_epi32(exponent.simdInternal_, exponentBias);
-
- if (opt == MathOptimization::Safe)
- {
- // Make sure biased argument is not negative
- iExponent = _mm512_max_epi32(iExponent, _mm512_setzero_epi32());
- }
-
- iExponent = _mm512_slli_epi32(iExponent, 23);
-
- return { _mm512_mul_ps(value.simdInternal_, _mm512_castsi512_ps(iExponent)) };
-}
-
-static inline float gmx_simdcall reduce(SimdFloat a)
-{
- return _mm512_reduce_add_ps(a.simdInternal_);
-}
-
-// Picky, picky, picky:
-// icc-16 complains about "Illegal value of immediate argument to intrinsic"
-// unless we use
-// 1) Ordered-quiet for ==
-// 2) Unordered-quiet for !=
-// 3) Ordered-signaling for < and <=
-
-static inline SimdFBool gmx_simdcall operator==(SimdFloat a, SimdFloat b)
-{
- return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ) };
-}
-
-static inline SimdFBool gmx_simdcall operator!=(SimdFloat a, SimdFloat b)
-{
- return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ) };
-}
-
-static inline SimdFBool gmx_simdcall operator<(SimdFloat a, SimdFloat b)
-{
- return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_LT_OS) };
-}
-
-static inline SimdFBool gmx_simdcall operator<=(SimdFloat a, SimdFloat b)
-{
- return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_LE_OS) };
-}
-
-static inline SimdFBool gmx_simdcall testBits(SimdFloat a)
-{
- return { _mm512_test_epi32_mask(_mm512_castps_si512(a.simdInternal_),
- _mm512_castps_si512(a.simdInternal_)) };
-}
-
-static inline SimdFBool gmx_simdcall operator&&(SimdFBool a, SimdFBool b)
-{
- return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFBool gmx_simdcall operator||(SimdFBool a, SimdFBool b)
-{
- return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline bool gmx_simdcall anyTrue(SimdFBool a)
-{
- return _mm512_mask2int(a.simdInternal_) != 0;
-}
-
-static inline SimdFloat gmx_simdcall selectByMask(SimdFloat a, SimdFBool m)
-{
- return { _mm512_mask_mov_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall selectByNotMask(SimdFloat a, SimdFBool m)
-{
- return { _mm512_mask_mov_ps(a.simdInternal_, m.simdInternal_, _mm512_setzero_ps()) };
-}
-
-static inline SimdFloat gmx_simdcall blend(SimdFloat a, SimdFloat b, SimdFBool sel)
-{
- return { _mm512_mask_blend_ps(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator&(SimdFInt32 a, SimdFInt32 b)
-{
- return { _mm512_and_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall andNot(SimdFInt32 a, SimdFInt32 b)
-{
- return { _mm512_andnot_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator|(SimdFInt32 a, SimdFInt32 b)
-{
- return { _mm512_or_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator^(SimdFInt32 a, SimdFInt32 b)
-{
- return { _mm512_xor_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator+(SimdFInt32 a, SimdFInt32 b)
-{
- return { _mm512_add_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator-(SimdFInt32 a, SimdFInt32 b)
-{
- return { _mm512_sub_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator*(SimdFInt32 a, SimdFInt32 b)
-{
- return { _mm512_mullo_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFIBool gmx_simdcall operator==(SimdFInt32 a, SimdFInt32 b)
-{
- return { _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_EQ) };
-}
-
-static inline SimdFIBool gmx_simdcall testBits(SimdFInt32 a)
-{
- return { _mm512_test_epi32_mask(a.simdInternal_, a.simdInternal_) };
-}
-
-static inline SimdFIBool gmx_simdcall operator<(SimdFInt32 a, SimdFInt32 b)
-{
- return { _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_LT) };
-}
-
-static inline SimdFIBool gmx_simdcall operator&&(SimdFIBool a, SimdFIBool b)
-{
- return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFIBool gmx_simdcall operator||(SimdFIBool a, SimdFIBool b)
-{
- return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline bool gmx_simdcall anyTrue(SimdFIBool a)
-{
- return _mm512_mask2int(a.simdInternal_) != 0;
-}
-
-static inline SimdFInt32 gmx_simdcall selectByMask(SimdFInt32 a, SimdFIBool m)
-{
- return { _mm512_mask_mov_epi32(_mm512_setzero_epi32(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall selectByNotMask(SimdFInt32 a, SimdFIBool m)
-{
- return { _mm512_mask_mov_epi32(a.simdInternal_, m.simdInternal_, _mm512_setzero_epi32()) };
-}
-
-static inline SimdFInt32 gmx_simdcall blend(SimdFInt32 a, SimdFInt32 b, SimdFIBool sel)
-{
- return { _mm512_mask_blend_epi32(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall cvtR2I(SimdFloat a)
-{
- return { _mm512_cvtfxpnt_round_adjustps_epi32(
- a.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline SimdFInt32 gmx_simdcall cvttR2I(SimdFloat a)
-{
- return { _mm512_cvtfxpnt_round_adjustps_epi32(a.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
-}
-
-static inline SimdFloat gmx_simdcall cvtI2R(SimdFInt32 a)
-{
- return { _mm512_cvtfxpnt_round_adjustepi32_ps(
- a.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline SimdFIBool gmx_simdcall cvtB2IB(SimdFBool a)
-{
- return { a.simdInternal_ };
-}
-
-static inline SimdFBool gmx_simdcall cvtIB2B(SimdFIBool a)
-{
- return { a.simdInternal_ };
-}
-
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdFloat gmx_simdcall exp2(SimdFloat x)
-{
- return { _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(
- x.simdInternal_, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24)) };
-}
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdFloat gmx_simdcall exp(SimdFloat x)
-{
- const __m512 argscale = _mm512_set1_ps(1.44269504088896341F);
- const __m512 invargscale = _mm512_set1_ps(-0.69314718055994528623F);
-
- if (opt == MathOptimization::Safe)
- {
- // Set the limit to gurantee flush to zero
- const SimdFloat smallArgLimit(-88.f);
- // Since we multiply the argument by 1.44, for the safe version we need to make
- // sure this doesn't result in overflow
- x = max(x, smallArgLimit);
- }
-
- __m512 xscaled = _mm512_mul_ps(x.simdInternal_, argscale);
- __m512 r = _mm512_exp223_ps(
- _mm512_cvtfxpnt_round_adjustps_epi32(xscaled, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
-
- // exp2a23_ps provides 23 bits of accuracy, but we ruin some of that with our argument
- // scaling. To correct this, we find the difference between the scaled argument and
- // the true one (extended precision arithmetics does not appear to be necessary to
- // fulfill our accuracy requirements) and then multiply by the exponent of this
- // correction since exp(a+b)=exp(a)*exp(b).
- // Note that this only adds two instructions (and maybe some constant loads).
-
- // find the difference
- x = _mm512_fmadd_ps(invargscale, xscaled, x.simdInternal_);
- // x will now be a _very_ small number, so approximate exp(x)=1+x.
- // We should thus apply the correction as r'=r*(1+x)=r+r*x
- r = _mm512_fmadd_ps(r, x.simdInternal_, r);
- return { r };
-}
-
-static inline SimdFloat gmx_simdcall log(SimdFloat x)
-{
- return { _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764F),
- _mm512_log2ae23_ps(x.simdInternal_)) };
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
-#define GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
-
-#include "config.h"
-
-#include <cassert>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "gromacs/utility/basedefinitions.h"
-
-#include "impl_x86_mic_simd_double.h"
-
-namespace gmx
-{
-
-namespace
-{
-/* This is an internal helper function used by decr3Hsimd(...).
- */
-inline void gmx_simdcall decrHsimd(double* m, SimdDouble a)
-{
- __m512d t;
-
- assert(std::size_t(m) % 32 == 0);
-
- t = _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
- a.simdInternal_ = _mm512_add_pd(
- a.simdInternal_,
- _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(a.simdInternal_), _MM_PERM_BADC)));
- t = _mm512_sub_pd(t, a.simdInternal_);
- _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0x0F), t);
-}
-} // namespace
-
-// On MIC it is better to use scatter operations, so we define the load routines
-// that use a SIMD offset variable first.
-
-template<int align>
-static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
- SimdDInt32 simdoffset,
- SimdDouble* v0,
- SimdDouble* v1,
- SimdDouble* v2,
- SimdDouble* v3)
-{
- assert((size_t)base % 32 == 0);
- assert(align % 4 == 0);
-
- // All instructions might be latency ~4 on MIC, so we use shifts where we
- // only need a single instruction (since the shift parameter is an immediate),
- // but multiplication otherwise.
- if (align == 4)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
- }
- else if (align == 8)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
- }
- else
- {
- simdoffset = simdoffset * SimdDInt32(align);
- }
-
- v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
- v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 1, sizeof(double));
- v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 2, sizeof(double));
- v3->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 3, sizeof(double));
-}
-
-template<int align>
-static inline void gmx_simdcall gatherLoadUBySimdIntTranspose(const double* base,
- SimdDInt32 simdoffset,
- SimdDouble* v0,
- SimdDouble* v1)
-{
- // All instructions might be latency ~4 on MIC, so we use shifts where we
- // only need a single instruction (since the shift parameter is an immediate),
- // but multiplication otherwise.
- if (align == 2)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 1);
- }
- else if (align == 4)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
- }
- else if (align == 8)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
- }
- else
- {
- simdoffset = simdoffset * SimdDInt32(align);
- }
-
- v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
- v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 1, sizeof(double));
-}
-
-template<int align>
-static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
- SimdDInt32 simdoffset,
- SimdDouble* v0,
- SimdDouble* v1)
-{
- assert(std::size_t(base) % 16 == 0);
- assert(align % 2 == 0);
- gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
-}
-
-
-template<int align>
-static inline void gmx_simdcall gatherLoadTranspose(const double* base,
- const std::int32_t offset[],
- SimdDouble* v0,
- SimdDouble* v1,
- SimdDouble* v2,
- SimdDouble* v3)
-{
- gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1, v2, v3);
-}
-
-template<int align>
-static inline void gmx_simdcall
- gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
-{
- gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1);
-}
-
-static const int c_simdBestPairAlignmentDouble = 2;
-
-template<int align>
-static inline void gmx_simdcall gatherLoadUTranspose(const double* base,
- const std::int32_t offset[],
- SimdDouble* v0,
- SimdDouble* v1,
- SimdDouble* v2)
-{
- SimdDInt32 simdoffset;
-
- assert(std::size_t(offset) % 32 == 0);
-
- simdoffset = simdLoad(offset, SimdDInt32Tag());
-
- // All instructions might be latency ~4 on MIC, so we use shifts where we
- // only need a single instruction (since the shift parameter is an immediate),
- // but multiplication otherwise.
- if (align == 4)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
- }
- else if (align == 8)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
- }
- else
- {
- simdoffset = simdoffset * SimdDInt32(align);
- }
-
- v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
- v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 1, sizeof(double));
- v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 2, sizeof(double));
-}
-
-template<int align>
-static inline void gmx_simdcall transposeScatterStoreU(double* base,
- const std::int32_t offset[],
- SimdDouble v0,
- SimdDouble v1,
- SimdDouble v2)
-{
- SimdDInt32 simdoffset;
-
- assert(std::size_t(offset) % 32 == 0);
-
- simdoffset = simdLoad(offset, SimdDInt32Tag());
-
- // All instructions might be latency ~4 on MIC, so we use shifts where we
- // only need a single instruction (since the shift parameter is an immediate),
- // but multiplication otherwise.
- if (align == 4)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
- }
- else if (align == 8)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
- }
- else
- {
- simdoffset = simdoffset * SimdDInt32(align);
- }
-
- _mm512_i32loscatter_pd(base, simdoffset.simdInternal_, v0.simdInternal_, sizeof(double));
- _mm512_i32loscatter_pd(base + 1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(double));
- _mm512_i32loscatter_pd(base + 2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(double));
-}
-
-template<int align>
-static inline void gmx_simdcall
- transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
-{
- alignas(GMX_SIMD_ALIGNMENT) double rdata0[GMX_SIMD_DOUBLE_WIDTH];
- alignas(GMX_SIMD_ALIGNMENT) double rdata1[GMX_SIMD_DOUBLE_WIDTH];
- alignas(GMX_SIMD_ALIGNMENT) double rdata2[GMX_SIMD_DOUBLE_WIDTH];
-
- store(rdata0, v0);
- store(rdata1, v1);
- store(rdata2, v2);
-
- for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
- {
- base[align * offset[i] + 0] += rdata0[i];
- base[align * offset[i] + 1] += rdata1[i];
- base[align * offset[i] + 2] += rdata2[i];
- }
-}
-
-template<int align>
-static inline void gmx_simdcall
- transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
-{
- alignas(GMX_SIMD_ALIGNMENT) double rdata0[GMX_SIMD_DOUBLE_WIDTH];
- alignas(GMX_SIMD_ALIGNMENT) double rdata1[GMX_SIMD_DOUBLE_WIDTH];
- alignas(GMX_SIMD_ALIGNMENT) double rdata2[GMX_SIMD_DOUBLE_WIDTH];
-
- store(rdata0, v0);
- store(rdata1, v1);
- store(rdata2, v2);
-
- for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
- {
- base[align * offset[i] + 0] -= rdata0[i];
- base[align * offset[i] + 1] -= rdata1[i];
- base[align * offset[i] + 2] -= rdata2[i];
- }
-}
-
-static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble scalar,
- SimdDouble* triplets0,
- SimdDouble* triplets1,
- SimdDouble* triplets2)
-{
- triplets0->simdInternal_ = _mm512_castsi512_pd(
- _mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0),
- _mm512_castpd_si512(scalar.simdInternal_)));
- triplets1->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(
- _mm512_set_epi32(11, 10, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 5, 4),
- _mm512_castpd_si512(scalar.simdInternal_)));
- triplets2->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(
- _mm512_set_epi32(15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 11, 10, 11, 10),
- _mm512_castpd_si512(scalar.simdInternal_)));
-}
-
-
-static inline double gmx_simdcall
- reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
-{
- double d;
- __m512d t0, t1, t2, t3;
-
- assert(std::size_t(m) % 32 == 0);
-
- t0 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v0.simdInternal_, v2.simdInternal_),
- _MM_SWIZ_REG_BADC);
- t2 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v2.simdInternal_, v0.simdInternal_);
- t1 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v1.simdInternal_, v3.simdInternal_),
- _MM_SWIZ_REG_BADC);
- t3 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v3.simdInternal_, v1.simdInternal_);
- t0 = _mm512_add_pd(t0, t2);
- t1 = _mm512_add_pd(t1, t3);
-
- t2 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0b01010101), t0, t1), _MM_SWIZ_REG_CDAB);
- t3 = _mm512_mask_blend_pd(_mm512_int2mask(0b01010101), t1, t0);
- t2 = _mm512_add_pd(t2, t3);
-
- t2 = _mm512_add_pd(t2, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(t2), _MM_PERM_BADC)));
-
- t0 = _mm512_mask_extload_pd(
- _mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
- t0 = _mm512_add_pd(t0, t2);
- _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t0);
-
- t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_BADC));
- t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_CDAB));
-
- _mm512_mask_packstorelo_pd(&d, _mm512_mask2int(0x01), t2);
- return d;
-}
-
-static inline SimdDouble gmx_simdcall loadDualHsimd(const double* m0, const double* m1)
-{
- assert(std::size_t(m0) % 32 == 0);
- assert(std::size_t(m1) % 32 == 0);
-
- return _mm512_mask_extload_pd(_mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE),
- _mm512_int2mask(0xF0),
- m1,
- _MM_UPCONV_PD_NONE,
- _MM_BROADCAST_4X8,
- _MM_HINT_NONE);
-}
-
-static inline SimdDouble gmx_simdcall loadDuplicateHsimd(const double* m)
-{
- assert(std::size_t(m) % 32 == 0);
-
- return _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
-}
-
-static inline SimdDouble gmx_simdcall loadU1DualHsimd(const double* m)
-{
- return _mm512_mask_extload_pd(_mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE),
- _mm512_int2mask(0xF0),
- m + 1,
- _MM_UPCONV_PD_NONE,
- _MM_BROADCAST_1X8,
- _MM_HINT_NONE);
-}
-
-
-static inline void gmx_simdcall storeDualHsimd(double* m0, double* m1, SimdDouble a)
-{
- assert(std::size_t(m0) % 32 == 0);
- assert(std::size_t(m1) % 32 == 0);
-
- _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), a.simdInternal_);
- _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), a.simdInternal_);
-}
-
-static inline void gmx_simdcall incrDualHsimd(double* m0, double* m1, SimdDouble a)
-{
- assert(std::size_t(m0) % 32 == 0);
- assert(std::size_t(m1) % 32 == 0);
-
- __m512d x;
-
- // Update lower half
- x = _mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
- x = _mm512_add_pd(x, a.simdInternal_);
- _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), x);
-
- // Update upper half
- x = _mm512_extload_pd(m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
- x = _mm512_add_pd(x, a.simdInternal_);
- _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), x);
-}
-
-static inline void gmx_simdcall decr3Hsimd(double* m, SimdDouble a0, SimdDouble a1, SimdDouble a2)
-{
- assert(std::size_t(m) % 32 == 0);
- decrHsimd(m, a0);
- decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH / 2, a1);
- decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH, a2);
-}
-
-
-template<int align>
-static inline void gmx_simdcall gatherLoadTransposeHsimd(const double* base0,
- const double* base1,
- const std::int32_t offset[],
- SimdDouble* v0,
- SimdDouble* v1)
-{
- __m512i idx0, idx1, idx;
- __m512d tmp1, tmp2;
-
- assert(std::size_t(offset) % 16 == 0);
- assert(std::size_t(base0) % 16 == 0);
- assert(std::size_t(base1) % 16 == 0);
- assert(std::size_t(align) % 2 == 0);
-
- idx0 = _mm512_extload_epi32(offset, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
-
- idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
- idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));
-
- idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0x00F0), idx1, _MM_PERM_AAAA);
-
- tmp1 = _mm512_i32logather_pd(idx, base0, sizeof(double));
- tmp2 = _mm512_i32logather_pd(idx, base1, sizeof(double));
-
- v0->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(
- _mm512_castpd_ps(tmp1), _mm512_int2mask(0xFF00), _mm512_castpd_ps(tmp2), _MM_PERM_BABA));
- v1->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(
- _mm512_castpd_ps(tmp2), _mm512_int2mask(0x00FF), _mm512_castpd_ps(tmp1), _MM_PERM_DCDC));
-}
-
-static inline double gmx_simdcall reduceIncr4ReturnSumHsimd(double* m, SimdDouble v0, SimdDouble v1)
-{
- double d;
- __m512d t0, t1;
-
- assert(std::size_t(m) % 32 == 0);
-
- t0 = _mm512_add_pd(v0.simdInternal_, _mm512_swizzle_pd(v0.simdInternal_, _MM_SWIZ_REG_BADC));
- t0 = _mm512_mask_add_pd(t0,
- _mm512_int2mask(0xCC),
- v1.simdInternal_,
- _mm512_swizzle_pd(v1.simdInternal_, _MM_SWIZ_REG_BADC));
- t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
- t0 = _mm512_castps_pd(_mm512_mask_permute4f128_ps(
- _mm512_castpd_ps(t0), _mm512_int2mask(0xCCCC), _mm512_castpd_ps(t0), _MM_PERM_DCDC));
-
- t1 = _mm512_mask_extload_pd(
- _mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
- t1 = _mm512_add_pd(t1, t0);
- _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t1);
-
- t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_BADC));
- t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
-
- _mm512_mask_packstorelo_pd(&d, _mm512_mask2int(0x03), t0);
- return d;
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
-#define GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
-
-#include "config.h"
-
-#include <cassert>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "gromacs/utility/basedefinitions.h"
-
-#include "impl_x86_mic_simd_float.h"
-
-namespace gmx
-{
-
-namespace
-{
-/* This is an internal helper function used by decr3Hsimd(...).
- */
-inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
-{
- __m512 t;
-
- assert(std::size_t(m) % 32 == 0);
-
- t = _mm512_castpd_ps(_mm512_extload_pd(
- reinterpret_cast<const double*>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
- a = _mm512_add_ps(a.simdInternal_, _mm512_permute4f128_ps(a.simdInternal_, _MM_PERM_BADC));
- t = _mm512_sub_ps(t, a.simdInternal_);
- _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0x00FF), t);
-}
-} // namespace
-
-// On MIC it is better to use scatter operations, so we define the load routines
-// that use a SIMD offset variable first.
-
-template<int align>
-static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
- SimdFInt32 simdoffset,
- SimdFloat* v0,
- SimdFloat* v1,
- SimdFloat* v2,
- SimdFloat* v3)
-{
- assert(std::size_t(base) % 16 == 0);
- assert(align % 4 == 0);
-
- // All instructions might be latency ~4 on MIC, so we use shifts where we
- // only need a single instruction (since the shift parameter is an immediate),
- // but multiplication otherwise.
- if (align == 4)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
- }
- else if (align == 8)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
- }
- else
- {
- simdoffset = simdoffset * SimdFInt32(align);
- }
-
- v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
- v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
- v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 2, sizeof(float));
- v3->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 3, sizeof(float));
-}
-
-template<int align>
-static inline void gmx_simdcall
- gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 simdoffset, SimdFloat* v0, SimdFloat* v1)
-{
- // All instructions might be latency ~4 on MIC, so we use shifts where we
- // only need a single instruction (since the shift parameter is an immediate),
- // but multiplication otherwise.
- // For align == 2 we can merge the constant into the scale parameter,
- // which can take constants up to 8 in total.
- if (align == 2)
- {
- v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, align * sizeof(float));
- v1->simdInternal_ =
- _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, align * sizeof(float));
- }
- else
- {
- if (align == 4)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
- }
- else if (align == 8)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
- }
- else
- {
- simdoffset = simdoffset * SimdFInt32(align);
- }
- v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
- v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
- }
-}
-
-template<int align>
-static inline void gmx_simdcall
- gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 simdoffset, SimdFloat* v0, SimdFloat* v1)
-{
- assert(std::size_t(base) % 8 == 0);
- assert(align % 2 == 0);
- gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
-}
-
-template<int align>
-static inline void gmx_simdcall gatherLoadTranspose(const float* base,
- const std::int32_t offset[],
- SimdFloat* v0,
- SimdFloat* v1,
- SimdFloat* v2,
- SimdFloat* v3)
-{
- gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1, v2, v3);
-}
-
-template<int align>
-static inline void gmx_simdcall
- gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
-{
- gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1);
-}
-
-static const int c_simdBestPairAlignmentFloat = 2;
-
-template<int align>
-static inline void gmx_simdcall gatherLoadUTranspose(const float* base,
- const std::int32_t offset[],
- SimdFloat* v0,
- SimdFloat* v1,
- SimdFloat* v2)
-{
- SimdFInt32 simdoffset;
-
- assert(std::size_t(offset) % 64 == 0);
-
- simdoffset = simdLoad(offset, SimdFInt32Tag());
-
- // All instructions might be latency ~4 on MIC, so we use shifts where we
- // only need a single instruction (since the shift parameter is an immediate),
- // but multiplication otherwise.
- if (align == 4)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
- }
- else if (align == 8)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
- }
- else
- {
- simdoffset = simdoffset * SimdFInt32(align);
- }
-
- v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
- v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
- v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 2, sizeof(float));
-}
-
-
-template<int align>
-static inline void gmx_simdcall
- transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
-{
- SimdFInt32 simdoffset;
-
- assert(std::size_t(offset) % 64 == 0);
-
- simdoffset = simdLoad(offset, SimdFInt32Tag());
-
- // All instructions might be latency ~4 on MIC, so we use shifts where we
- // only need a single instruction (since the shift parameter is an immediate),
- // but multiplication otherwise.
- if (align == 4)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
- }
- else if (align == 8)
- {
- simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
- }
- else
- {
- simdoffset = simdoffset * SimdFInt32(align);
- }
-
- _mm512_i32scatter_ps(base, simdoffset.simdInternal_, v0.simdInternal_, sizeof(float));
- _mm512_i32scatter_ps(base + 1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(float));
- _mm512_i32scatter_ps(base + 2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(float));
-}
-
-
-template<int align>
-static inline void gmx_simdcall
- transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
-{
- alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
- alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
- alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];
-
- store(rdata0, v0);
- store(rdata1, v1);
- store(rdata2, v2);
-
- for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
- {
- base[align * offset[i] + 0] += rdata0[i];
- base[align * offset[i] + 1] += rdata1[i];
- base[align * offset[i] + 2] += rdata2[i];
- }
-}
-
-template<int align>
-static inline void gmx_simdcall
- transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
-{
- alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
- alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
- alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];
-
- store(rdata0, v0);
- store(rdata1, v1);
- store(rdata2, v2);
-
- for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
- {
- base[align * offset[i] + 0] -= rdata0[i];
- base[align * offset[i] + 1] -= rdata1[i];
- base[align * offset[i] + 2] -= rdata2[i];
- }
-}
-
-static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat scalar,
- SimdFloat* triplets0,
- SimdFloat* triplets1,
- SimdFloat* triplets2)
-{
- triplets0->simdInternal_ = _mm512_castsi512_ps(
- _mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0),
- _mm512_castps_si512(scalar.simdInternal_)));
- triplets1->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(
- _mm512_set_epi32(10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5),
- _mm512_castps_si512(scalar.simdInternal_)));
- triplets2->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(
- _mm512_set_epi32(15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10),
- _mm512_castps_si512(scalar.simdInternal_)));
-}
-
-
-static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
-{
- float f;
- __m512 t0, t1, t2, t3;
-
- assert(std::size_t(m) % 16 == 0);
-
- t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
- t0 = _mm512_mask_add_ps(t0,
- _mm512_int2mask(0xCCCC),
- v2.simdInternal_,
- _mm512_swizzle_ps(v2.simdInternal_, _MM_SWIZ_REG_BADC));
- t1 = _mm512_add_ps(v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
- t1 = _mm512_mask_add_ps(t1,
- _mm512_int2mask(0xCCCC),
- v3.simdInternal_,
- _mm512_swizzle_ps(v3.simdInternal_, _MM_SWIZ_REG_BADC));
- t2 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
- t2 = _mm512_mask_add_ps(t2, _mm512_int2mask(0xAAAA), t1, _mm512_swizzle_ps(t1, _MM_SWIZ_REG_CDAB));
-
- t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_BADC));
- t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_CDAB));
-
- t0 = _mm512_mask_extload_ps(
- _mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
- t0 = _mm512_add_ps(t0, t2);
- _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t0);
-
- t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_BADC));
- t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_CDAB));
-
- _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t2);
- return f;
-}
-
-static inline SimdFloat gmx_simdcall loadDualHsimd(const float* m0, const float* m1)
-{
- assert(std::size_t(m0) % 32 == 0);
- assert(std::size_t(m1) % 32 == 0);
-
- return _mm512_castpd_ps(_mm512_mask_extload_pd(
- _mm512_extload_pd(reinterpret_cast<const double*>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE),
- _mm512_int2mask(0xF0),
- reinterpret_cast<const double*>(m1),
- _MM_UPCONV_PD_NONE,
- _MM_BROADCAST_4X8,
- _MM_HINT_NONE));
-}
-
-static inline SimdFloat gmx_simdcall loadDuplicateHsimd(const float* m)
-{
- assert(std::size_t(m) % 32 == 0);
-
- return _mm512_castpd_ps(_mm512_extload_pd(
- reinterpret_cast<const double*>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
-}
-
-static inline SimdFloat gmx_simdcall loadU1DualHsimd(const float* m)
-{
- return _mm512_mask_extload_ps(_mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE),
- _mm512_int2mask(0xFF00),
- m + 1,
- _MM_UPCONV_PS_NONE,
- _MM_BROADCAST_1X16,
- _MM_HINT_NONE);
-}
-
-
-static inline void gmx_simdcall storeDualHsimd(float* m0, float* m1, SimdFloat a)
-{
- __m512 t0;
-
- assert(std::size_t(m0) % 32 == 0);
- assert(std::size_t(m1) % 32 == 0);
-
- _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), a.simdInternal_);
- _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), a.simdInternal_);
-}
-
-static inline void gmx_simdcall incrDualHsimd(float* m0, float* m1, SimdFloat a)
-{
- assert(std::size_t(m0) % 32 == 0);
- assert(std::size_t(m1) % 32 == 0);
-
- __m512 x;
-
- // Update lower half
- x = _mm512_castpd_ps(_mm512_extload_pd(
- reinterpret_cast<const double*>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
- x = _mm512_add_ps(x, a.simdInternal_);
- _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), x);
-
- // Update upper half
- x = _mm512_castpd_ps(_mm512_extload_pd(
- reinterpret_cast<const double*>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
- x = _mm512_add_ps(x, a.simdInternal_);
- _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), x);
-}
-
-static inline void gmx_simdcall decr3Hsimd(float* m, SimdFloat a0, SimdFloat a1, SimdFloat a2)
-{
- assert(std::size_t(m) % 32 == 0);
- decrHsimd(m, a0);
- decrHsimd(m + GMX_SIMD_FLOAT_WIDTH / 2, a1);
- decrHsimd(m + GMX_SIMD_FLOAT_WIDTH, a2);
-}
-
-
-template<int align>
-static inline void gmx_simdcall gatherLoadTransposeHsimd(const float* base0,
- const float* base1,
- const std::int32_t offset[],
- SimdFloat* v0,
- SimdFloat* v1)
-{
- __m512i idx0, idx1, idx;
- __m512 tmp1, tmp2;
-
- assert(std::size_t(offset) % 32 == 0);
- assert(std::size_t(base0) % 8 == 0);
- assert(std::size_t(base1) % 8 == 0);
- assert(std::size_t(align) % 2 == 0);
-
- idx0 = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), offset);
-
- idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
- idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));
-
- idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0xFF00), idx1, _MM_PERM_BABA);
-
- tmp1 = _mm512_i32gather_ps(idx, base0, sizeof(float));
- tmp2 = _mm512_i32gather_ps(idx, base1, sizeof(float));
-
- v0->simdInternal_ = _mm512_mask_permute4f128_ps(tmp1, _mm512_int2mask(0xFF00), tmp2, _MM_PERM_BABA);
- v1->simdInternal_ = _mm512_mask_permute4f128_ps(tmp2, _mm512_int2mask(0x00FF), tmp1, _MM_PERM_DCDC);
-}
-
-static inline float gmx_simdcall reduceIncr4ReturnSumHsimd(float* m, SimdFloat v0, SimdFloat v1)
-{
- float f;
- __m512 t0, t1;
-
- assert(std::size_t(m) % 32 == 0);
-
- t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
- t0 = _mm512_mask_add_ps(t0,
- _mm512_int2mask(0xCCCC),
- v1.simdInternal_,
- _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
- t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
- t0 = _mm512_add_ps(t0, _mm512_castpd_ps(_mm512_swizzle_pd(_mm512_castps_pd(t0), _MM_SWIZ_REG_BADC)));
- t0 = _mm512_mask_permute4f128_ps(t0, _mm512_int2mask(0xAAAA), t0, _MM_PERM_BADC);
- t1 = _mm512_mask_extload_ps(
- _mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
- t1 = _mm512_add_ps(t1, t0);
- _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t1);
-
- t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_BADC));
- t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
-
- _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t0);
- return f;
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
# include "impl_x86_avx2_256/impl_x86_avx2_256.h"
#elif GMX_SIMD_X86_AVX2_128
# include "impl_x86_avx2_128/impl_x86_avx2_128.h"
-#elif GMX_SIMD_X86_MIC
-# include "impl_x86_mic/impl_x86_mic.h"
#elif GMX_SIMD_X86_AVX_512
# include "impl_x86_avx_512/impl_x86_avx_512.h"
#elif GMX_SIMD_X86_AVX_512_KNL
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2015,2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
{ SimdType::X86_Avx2_128, "AVX2_128" },
{ SimdType::X86_Avx512, "AVX_512" },
{ SimdType::X86_Avx512Knl, "AVX_512_KNL" },
- { SimdType::X86_Mic, "X86_MIC" },
{ SimdType::Arm_Neon, "ARM_NEON" },
{ SimdType::Arm_NeonAsimd, "ARM_NEON_ASIMD" },
{ SimdType::Arm_Sve, "ARM_SVE" },
return SimdType::X86_Avx512Knl;
#elif GMX_SIMD_X86_AVX_512
return SimdType::X86_Avx512;
-#elif GMX_SIMD_X86_MIC
- return SimdType::X86_Mic;
#elif GMX_SIMD_X86_AVX2_256
return SimdType::X86_Avx2;
#elif GMX_SIMD_X86_AVX2_128
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team.
+ * Copyright (c) 2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
X86_Avx2_128, //!< 128-bit AVX2, better than 256-bit for AMD Ryzen
X86_Avx512, //!< AVX_512
X86_Avx512Knl, //!< AVX_512_KNL
- X86_Mic, //!< Knight's corner
Arm_Neon, //!< 32-bit ARM NEON
Arm_NeonAsimd, //!< 64-bit ARM AArch64 Advanced SIMD
Arm_Sve, //!< ARM Scalable Vector Extensions
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
YieldProcessor();
#elif HAVE_XMMINTRIN_H
_mm_pause();
-#elif defined __MIC__
- _mm_delay_32(32);
#else
// No wait for unknown architecture
#endif