Remove x86 MIC support

author Mark Abraham <mark.j.abraham@gmail.com>

Mon, 1 Feb 2021 12:16:44 +0000 (12:16 +0000)

committer Joe Jordan <ejjordan12@gmail.com>

Mon, 1 Feb 2021 12:16:44 +0000 (12:16 +0000)
author Mark Abraham <mark.j.abraham@gmail.com>
Mon, 1 Feb 2021 12:16:44 +0000 (12:16 +0000)
committer Joe Jordan <ejjordan12@gmail.com>
Mon, 1 Feb 2021 12:16:44 +0000 (12:16 +0000)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index d0d8d0f8dd3e78cb7984f589735b1ee05ad663eb..456048a64f40d9f73d3eb01013633dc06666f8ac 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -204,13 +204,9 @@ gmx_option_multichoice(
      GMX_SIMD
      "SIMD instruction set for CPU kernels and compiler optimization"
      "AUTO"
-    AUTO None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 AVX2_128 AVX_512 AVX_512_KNL MIC ARM_NEON ARM_NEON_ASIMD ARM_SVE IBM_VMX IBM_VSX Sparc64_HPC_ACE Reference)
+    AUTO None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 AVX2_128 AVX_512 AVX_512_KNL ARM_NEON ARM_NEON_ASIMD ARM_SVE IBM_VMX IBM_VSX Sparc64_HPC_ACE Reference)
  
-if(GMX_TARGET_MIC)
-    set(GMX_FFT_LIBRARY_DEFAULT "mkl")
-else()
-    set(GMX_FFT_LIBRARY_DEFAULT "fftw3")
-endif()
+set(GMX_FFT_LIBRARY_DEFAULT "fftw3")
  
  gmx_option_multichoice(
      GMX_FFT_LIBRARY
diff --git a/cmake/TestMIC.cpp b/cmake/TestMIC.cpp

deleted file mode 100644 (file)

index c03a61b..0000000
--- a/cmake/TestMIC.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-int main()
-{
-#ifdef __MIC__
-    return 0;
-#else
-#error This compiler is not targetting MIC
-#endif
-}
diff --git a/cmake/gmxDetectSimd.cmake b/cmake/gmxDetectSimd.cmake

index 781e9a2e5619d76303a26b4848893af5eaf83057..f3e557ceeceb3f3033ddfdafabae74d446bd3ad0 100644 (file)
--- a/cmake/gmxDetectSimd.cmake
+++ b/cmake/gmxDetectSimd.cmake
@@ -2,7 +2,7 @@
  # This file is part of the GROMACS molecular simulation package.
  #
  # Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
-# Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -155,8 +155,6 @@ function(gmx_detect_simd _suggested_simd)
              # HPC-ACE is always present. In the future we
              # should add detection for HPC-ACE2 here.
              set(${_suggested_simd} "Sparc64_HPC_ACE")
-        elseif(GMX_TARGET_MIC)
-            set(${_suggested_simd} "MIC")
          else()
              gmx_suggest_simd(${_suggested_simd})
          endif()
diff --git a/cmake/gmxDetectTargetArchitecture.cmake b/cmake/gmxDetectTargetArchitecture.cmake

index ad5734db7a6f211e1124f0112e3380e4b0da208a..293c7e165a335b6a00cd7c092b44fff8c0358ee6 100644 (file)
--- a/cmake/gmxDetectTargetArchitecture.cmake
+++ b/cmake/gmxDetectTargetArchitecture.cmake
@@ -1,7 +1,8 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2013,2014,2016,2018,2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2013,2014,2016,2018,2019,2020, by the GROMACS development team.
+# Copyright (c) 2021, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -44,13 +45,6 @@ function(gmx_detect_target_architecture)
          try_compile(GMX_TARGET_X86 ${CMAKE_BINARY_DIR}
              "${CMAKE_SOURCE_DIR}/cmake/TestX86.cpp")
      endif()
-    if (NOT DEFINED GMX_TARGET_MIC)
-        try_compile(GMX_TARGET_MIC ${CMAKE_BINARY_DIR}
-            "${CMAKE_SOURCE_DIR}/cmake/TestMIC.cpp")
-    endif()
-    if (GMX_TARGET_MIC)
-        message(STATUS "The Intel MIC KNC target is deprecated")
-    endif()
      if (NOT DEFINED GMX_TARGET_FUJITSU_SPARC64)
          try_compile(GMX_TARGET_FUJITSU_SPARC64 ${CMAKE_BINARY_DIR}
              "${CMAKE_SOURCE_DIR}/cmake/TestFujitsuSparc64.cpp")
diff --git a/cmake/gmxManageSimd.cmake b/cmake/gmxManageSimd.cmake

index af870d4fbaaffea8d0458be57d211b2c80833a47..ba883ea85a30cfa0826902ee4c169c34b30a31a1 100644 (file)
--- a/cmake/gmxManageSimd.cmake
+++ b/cmake/gmxManageSimd.cmake
@@ -2,7 +2,7 @@
  # This file is part of the GROMACS molecular simulation package.
  #
  # Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
-# Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -177,11 +177,6 @@ elseif(GMX_SIMD_ACTIVE MATCHES "AVX2_")
          set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX2 SIMD instructions using CXX flags: ${SIMD_AVX2_CXX_FLAGS}")
      endif()
  
-elseif(GMX_SIMD_ACTIVE STREQUAL "MIC")
-    # No flags needed. Not testing.
-    set(GMX_SIMD_X86_MIC 1)
-    set(SIMD_STATUS_MESSAGE "Enabling MIC (Xeon Phi) SIMD instructions without special flags. This SIMD support is deprecated.")
-
  elseif(GMX_SIMD_ACTIVE STREQUAL "AVX_512")
  
      gmx_find_simd_avx_512_flags(SIMD_AVX_512_C_SUPPORTED SIMD_AVX_512_CXX_SUPPORTED
diff --git a/docs/doxygen/lib/simd.md b/docs/doxygen/lib/simd.md

index 1363f3728f5443f56adf4af1a94c87cf2effa3b6..a1a6d53e121fac817df33482b3541de176218b88 100644 (file)
--- a/docs/doxygen/lib/simd.md
+++ b/docs/doxygen/lib/simd.md
@@ -52,7 +52,7 @@ and for this reason they are part of the SIMD implementation.
  Finally, for some architectures with large or very large SIMD width (e.g. AVX
  with 8 elements in single precision, or AVX-512 with 16), the nonbonded
  kernels can become inefficient. Since all such architectures presently known
-(AVX, AVX2, MIC, AVX512) also provide extensive support for accessing
+(AVX, AVX2, AVX512) also provide extensive support for accessing
  parts of the register, we optionally define a handful of routines to
  perform load, store, and reduce operations based on half-SIMD-width data,
  which can improve performance. It is only useful for wide implementations,
diff --git a/docs/release-notes/2022/major/removed-functionality.rst b/docs/release-notes/2022/major/removed-functionality.rst

index f94042ac41adda0bdb050c89a578b8eedce40d72..70ac8ac1ee19fbb9423dcb419f2d4ca82f34df1b 100644 (file)
--- a/docs/release-notes/2022/major/removed-functionality.rst
+++ b/docs/release-notes/2022/major/removed-functionality.rst
@@ -23,3 +23,11 @@ in GROMACS 2021. Removing it will simplify maintenance, testing,
  documentation, installation, and teaching new users.
  
  :issue:`3808`
+
+Removed support for x86 MIC platform
+""""""""""""""""""""""""""""""""""""
+
+This platform is nearly dead and is no longer supported. The KNL
+platform is unaffected by this change.
+
+:issue:`3891`
diff --git a/src/config.h.cmakein b/src/config.h.cmakein

index 86dd075bd10fb1422f4768aef540aee9a32f3db0..7021f6df57263dbd63ab86b2ce344795df7c153a 100644 (file)
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -94,9 +94,6 @@
  /* AVX2 128-bit SIMD instruction set level was selected */
  #cmakedefine01 GMX_SIMD_X86_AVX2_128
  
-/* MIC (Xeon Phi) SIMD instruction set level was selected */
-#cmakedefine01 GMX_SIMD_X86_MIC
-
  /* AVX-512F foundation level instruction SIMD */
  #cmakedefine01 GMX_SIMD_X86_AVX_512
  
diff --git a/src/external/thread_mpi/include/thread_mpi/atomic/gcc_x86.h b/src/external/thread_mpi/include/thread_mpi/atomic/gcc_x86.h

index 1dc5d995d87dc0b2173e3e0c7d6cc69a29af0b24..a2d72afb81d58944eda879bdf3c7c0f5c71c504d 100644 (file)
--- a/src/external/thread_mpi/include/thread_mpi/atomic/gcc_x86.h
+++ b/src/external/thread_mpi/include/thread_mpi/atomic/gcc_x86.h
@@ -109,12 +109,7 @@ typedef struct tMPI_Spinlock
  #else
  /* older versions of gcc don't support atomic intrinsics */
  
-#ifndef __MIC__
  #define tMPI_Atomic_memory_barrier() __asm__ __volatile__("sfence;" : : : "memory")
-#else
-/* MIC is in-order and does not need nor support sfense */
-#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("" ::: "memory")
-#endif
  
  #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
  static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
diff --git a/src/gromacs/mdlib/rbin.cpp b/src/gromacs/mdlib/rbin.cpp

index 0e8dc749bc89eee14a6b863df60dc0c8cbe0b063..d499c9403471a589ad144768d5070758999605b8 100644 (file)
--- a/src/gromacs/mdlib/rbin.cpp
+++ b/src/gromacs/mdlib/rbin.cpp
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2010,2014,2015,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2010,2014,2015,2018,2019,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -86,9 +86,6 @@ int add_binr(t_bin* b, int nr, const real r[])
      /* Copy pointer */
      rbuf = b->rbuf + b->nreal;
  
-#if (defined __ICC && __ICC >= 1500 || defined __ICL && __ICL >= 1500) && defined __MIC__
-#    pragma novector /* Work-around for incorrect vectorization */
-#endif
      for (i = 0; (i < nr); i++)
      {
          rbuf[i] = r[i];
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp

index 1582c501552437efac4db3b1798cb379f96c9df3..35e675b8473ead1cc7c80d48beacaf8326911c06 100644 (file)
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -2,7 +2,7 @@
   * This file is part of the GROMACS molecular simulation package.
   *
   * Copyright (c) 2012-2018, The GROMACS development team.
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -687,12 +687,6 @@ void nbnxn_atomdata_init(const gmx::MDLogger&    mdlog,
      {
          nbat->bUseTreeReduce = (strtol(ptr, nullptr, 10) != 0);
      }
-#if defined __MIC__
-    else if (nth > 8) /*on the CPU we currently don't benefit even at 32*/
-    {
-        nbat->bUseTreeReduce = 1;
-    }
-#endif
      else
      {
          nbat->bUseTreeReduce = false;
diff --git a/src/gromacs/nbnxm/nbnxm_simd.h b/src/gromacs/nbnxm/nbnxm_simd.h

index dec5605f1f8624f0b1655462342ae41270e98193..f29e5553591e1ad835af6a82c8b71fbc18c7e526 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_simd.h
+++ b/src/gromacs/nbnxm/nbnxm_simd.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2014,2015,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -54,7 +54,7 @@
  /*! \brief The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
   * Currently the 2xNN SIMD kernels only make sense with:
   *  8-way SIMD: 4x4 setup, works with AVX-256 in single precision
- * 16-way SIMD: 4x8 setup, works with Intel MIC in single precision
+ * 16-way SIMD: 4x8 setup, not currently in use, but worked with Intel MIC
   */
  #    if GMX_SIMD_REAL_WIDTH == 2 || GMX_SIMD_REAL_WIDTH == 4 || GMX_SIMD_REAL_WIDTH == 8
  #        define GMX_NBNXN_SIMD_4XN
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic.h

deleted file mode 100644 (file)

index 5cf004b..0000000
--- a/src/gromacs/simd/impl_x86_mic/impl_x86_mic.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_H
-#define GMX_SIMD_IMPL_X86_MIC_H
-
-#include "impl_x86_mic_definitions.h"
-#include "impl_x86_mic_general.h"
-#include "impl_x86_mic_simd4_double.h"
-#include "impl_x86_mic_simd4_float.h"
-#include "impl_x86_mic_simd_double.h"
-#include "impl_x86_mic_simd_float.h"
-#include "impl_x86_mic_util_double.h"
-#include "impl_x86_mic_util_float.h"
-
-#endif // GMX_SIMD_IMPL_X86_MIC_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_definitions.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_definitions.h

deleted file mode 100644 (file)

index 2f85b0c..0000000
--- a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_definitions.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
-#define GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
-
-#define GMX_SIMD 1
-#define GMX_SIMD_HAVE_FLOAT 1
-#define GMX_SIMD_HAVE_DOUBLE 1
-#define GMX_SIMD_HAVE_LOADU 1
-#define GMX_SIMD_HAVE_STOREU 1
-#define GMX_SIMD_HAVE_LOGICAL 1
-#define GMX_SIMD_HAVE_FMA 1
-#define GMX_SIMD_HAVE_FINT32_EXTRACT 1
-#define GMX_SIMD_HAVE_FINT32_LOGICAL 1
-#define GMX_SIMD_HAVE_FINT32_ARITHMETICS 1
-#define GMX_SIMD_HAVE_DINT32_EXTRACT 1
-#define GMX_SIMD_HAVE_DINT32_LOGICAL 1
-#define GMX_SIMD_HAVE_DINT32_ARITHMETICS 1
-#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT 0
-#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT 0
-#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT 0
-#define GMX_SIMD_HAVE_NATIVE_LOG_FLOAT 1
-#define GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT 1
-#define GMX_SIMD_HAVE_NATIVE_EXP_FLOAT 1
-#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE 0
-#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE 0
-#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE 0
-#define GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE 0
-#define GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE 0
-#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE 0
-#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1
-#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
-#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 1
-
-#define GMX_SIMD4_HAVE_FLOAT 1
-#define GMX_SIMD4_HAVE_DOUBLE 1
-
-// Implementation details
-#define GMX_SIMD_FLOAT_WIDTH 16
-#define GMX_SIMD_DOUBLE_WIDTH 8
-#define GMX_SIMD_FINT32_WIDTH 16
-#define GMX_SIMD_DINT32_WIDTH 8
-#define GMX_SIMD4_WIDTH 4
-#define GMX_SIMD_ALIGNMENT 64 // Bytes (16*single or 8*double)
-#define GMX_SIMD_RSQRT_BITS 23
-#define GMX_SIMD_RCP_BITS 23
-
-#endif // GMX_SIMD_IMPL_X86_MIC_DEFINITIONS_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_general.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_general.h

deleted file mode 100644 (file)

index 4c2c49f..0000000
--- a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_general.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2019, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_GENERAL_H
-#define GMX_SIMD_IMPL_X86_MIC_GENERAL_H
-
-#include <immintrin.h>
-
-namespace gmx
-{
-
-static inline void simdPrefetch(const void* m)
-{
-    _mm_prefetch((const char*)m, _MM_HINT_T0);
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_OTHER_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_double.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_double.h

deleted file mode 100644 (file)

index 8179a1c..0000000
--- a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_double.h
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2017,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
-#define GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
-
-#include "config.h"
-
-#include <cassert>
-
-#include <immintrin.h>
-
-#include "gromacs/utility/basedefinitions.h"
-
-#include "impl_x86_mic_simd_double.h"
-
-namespace gmx
-{
-
-class Simd4Double
-{
-public:
-    Simd4Double() {}
-
-    Simd4Double(double d) : simdInternal_(_mm512_set1_pd(d)) {}
-
-    // Internal utility constructor to simplify return statements
-    Simd4Double(__m512d simd) : simdInternal_(simd) {}
-
-    __m512d simdInternal_;
-};
-
-class Simd4DBool
-{
-public:
-    Simd4DBool() {}
-
-    // Internal utility constructor to simplify return statements
-    Simd4DBool(__mmask16 simd) : simdInternal_(simd) {}
-
-    __mmask16 simdInternal_;
-};
-
-static inline Simd4Double gmx_simdcall load4(const double* m)
-{
-    assert(size_t(m) % 32 == 0);
-    return { _mm512_mask_extload_pd(
-            _mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE) };
-}
-
-static inline void gmx_simdcall store4(double* m, Simd4Double a)
-{
-    assert(size_t(m) % 32 == 0);
-    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), a.simdInternal_);
-}
-
-static inline Simd4Double gmx_simdcall load4U(const double* m)
-{
-    return { _mm512_mask_loadunpackhi_pd(
-            _mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m),
-            _mm512_int2mask(0xF),
-            m + 8) };
-}
-
-static inline void gmx_simdcall store4U(double* m, Simd4Double a)
-{
-    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), a.simdInternal_);
-    _mm512_mask_packstorehi_pd(m + 8, _mm512_int2mask(0xF), a.simdInternal_);
-}
-
-static inline Simd4Double gmx_simdcall simd4SetZeroD()
-{
-    return { _mm512_setzero_pd() };
-}
-
-static inline Simd4Double gmx_simdcall operator&(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_castsi512_pd(_mm512_mask_and_epi32(_mm512_undefined_epi32(),
-                                                       _mm512_int2mask(0x00FF),
-                                                       _mm512_castpd_si512(a.simdInternal_),
-                                                       _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Double gmx_simdcall andNot(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(),
-                                                          _mm512_int2mask(0x00FF),
-                                                          _mm512_castpd_si512(a.simdInternal_),
-                                                          _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Double gmx_simdcall operator|(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_castsi512_pd(_mm512_mask_or_epi32(_mm512_undefined_epi32(),
-                                                      _mm512_int2mask(0x00FF),
-                                                      _mm512_castpd_si512(a.simdInternal_),
-                                                      _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Double gmx_simdcall operator^(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_castsi512_pd(_mm512_mask_xor_epi32(_mm512_undefined_epi32(),
-                                                       _mm512_int2mask(0x00FF),
-                                                       _mm512_castpd_si512(a.simdInternal_),
-                                                       _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Double gmx_simdcall operator+(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_mask_add_pd(
-            _mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall operator-(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_mask_sub_pd(
-            _mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall operator-(Simd4Double x)
-{
-    return { _mm512_mask_addn_pd(
-            _mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _mm512_setzero_pd()) };
-}
-
-static inline Simd4Double gmx_simdcall operator*(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_mask_mul_pd(
-            _mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall fma(Simd4Double a, Simd4Double b, Simd4Double c)
-{
-    return { _mm512_mask_fmadd_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall fms(Simd4Double a, Simd4Double b, Simd4Double c)
-{
-    return { _mm512_mask_fmsub_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall fnma(Simd4Double a, Simd4Double b, Simd4Double c)
-{
-    return { _mm512_mask_fnmadd_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall fnms(Simd4Double a, Simd4Double b, Simd4Double c)
-{
-    return { _mm512_mask_fnmsub_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall rsqrt(Simd4Double x)
-{
-    return { _mm512_mask_cvtpslo_pd(
-            _mm512_undefined_pd(),
-            _mm512_int2mask(0xF),
-            _mm512_mask_rsqrt23_ps(
-                    _mm512_undefined_ps(),
-                    _mm512_int2mask(0xF),
-                    _mm512_mask_cvtpd_pslo(_mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_))) };
-}
-
-static inline Simd4Double gmx_simdcall abs(Simd4Double x)
-{
-    return { _mm512_castsi512_pd(
-            _mm512_mask_andnot_epi32(_mm512_undefined_epi32(),
-                                     _mm512_int2mask(0x00FF),
-                                     _mm512_castpd_si512(_mm512_set1_pd(GMX_DOUBLE_NEGZERO)),
-                                     _mm512_castpd_si512(x.simdInternal_)))
-
-    };
-}
-
-static inline Simd4Double gmx_simdcall max(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_mask_gmax_pd(
-            _mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall min(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_mask_gmin_pd(
-            _mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall round(Simd4Double x)
-{
-    return { _mm512_mask_roundfxpnt_adjust_pd(
-            _mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline Simd4Double gmx_simdcall trunc(Simd4Double x)
-{
-    return { _mm512_mask_roundfxpnt_adjust_pd(
-            _mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
-}
-
-static inline double gmx_simdcall dotProduct(Simd4Double a, Simd4Double b)
-{
-    return _mm512_mask_reduce_add_pd(
-            _mm512_int2mask(7),
-            _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(7), a.simdInternal_, b.simdInternal_));
-}
-
-static inline void gmx_simdcall transpose(Simd4Double* v0, Simd4Double* v1, Simd4Double* v2, Simd4Double* v3)
-{
-    __m512i t0 = _mm512_mask_permute4f128_epi32(_mm512_castpd_si512(v0->simdInternal_),
-                                                0xFF00,
-                                                _mm512_castpd_si512(v1->simdInternal_),
-                                                _MM_PERM_BABA);
-    __m512i t1 = _mm512_mask_permute4f128_epi32(_mm512_castpd_si512(v2->simdInternal_),
-                                                0xFF00,
-                                                _mm512_castpd_si512(v3->simdInternal_),
-                                                _MM_PERM_BABA);
-
-    t0 = _mm512_permutevar_epi32(
-            _mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), t0);
-    t1 = _mm512_permutevar_epi32(
-            _mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), t1);
-
-    v0->simdInternal_ = _mm512_mask_swizzle_pd(
-            _mm512_castsi512_pd(t0), _mm512_int2mask(0xCC), _mm512_castsi512_pd(t1), _MM_SWIZ_REG_BADC);
-    v1->simdInternal_ = _mm512_mask_swizzle_pd(
-            _mm512_castsi512_pd(t1), _mm512_int2mask(0x33), _mm512_castsi512_pd(t0), _MM_SWIZ_REG_BADC);
-
-    v2->simdInternal_ =
-            _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(v0->simdInternal_), _MM_PERM_DCDC));
-    v3->simdInternal_ =
-            _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(v1->simdInternal_), _MM_PERM_DCDC));
-}
-
-// Picky, picky, picky:
-// icc-16 complains about "Illegal value of immediate argument to intrinsic"
-// unless we use
-// 1) Ordered-quiet for ==
-// 2) Unordered-quiet for !=
-// 3) Ordered-signaling for < and <=
-
-static inline Simd4DBool gmx_simdcall operator==(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ) };
-}
-
-static inline Simd4DBool gmx_simdcall operator!=(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ) };
-}
-
-static inline Simd4DBool gmx_simdcall operator<(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LT_OS) };
-}
-
-static inline Simd4DBool gmx_simdcall operator<=(Simd4Double a, Simd4Double b)
-{
-    return { _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LE_OS) };
-}
-
-static inline Simd4DBool gmx_simdcall operator&&(Simd4DBool a, Simd4DBool b)
-{
-    return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4DBool gmx_simdcall operator||(Simd4DBool a, Simd4DBool b)
-{
-    return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline bool gmx_simdcall anyTrue(Simd4DBool a)
-{
-    return (_mm512_mask2int(a.simdInternal_) & 0xF) != 0;
-}
-
-static inline Simd4Double gmx_simdcall selectByMask(Simd4Double a, Simd4DBool m)
-{
-    return { _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall selectByNotMask(Simd4Double a, Simd4DBool m)
-{
-    return { _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(m.simdInternal_), a.simdInternal_) };
-}
-
-static inline Simd4Double gmx_simdcall blend(Simd4Double a, Simd4Double b, Simd4DBool sel)
-{
-    return { _mm512_mask_blend_pd(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline double gmx_simdcall reduce(Simd4Double a)
-{
-    return _mm512_mask_reduce_add_pd(_mm512_int2mask(0xF), a.simdInternal_);
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_float.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_float.h

deleted file mode 100644 (file)

index 644ee6d..0000000
--- a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_float.h
+++ /dev/null
@@ -1,320 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
-#define GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
-
-#include "config.h"
-
-#include <cassert>
-
-#include <immintrin.h>
-
-#include "gromacs/utility/basedefinitions.h"
-
-#include "impl_x86_mic_simd_float.h"
-
-namespace gmx
-{
-
-class Simd4Float
-{
-public:
-    Simd4Float() {}
-
-    Simd4Float(float f) : simdInternal_(_mm512_set1_ps(f)) {}
-
-    // Internal utility constructor to simplify return statements
-    Simd4Float(__m512 simd) : simdInternal_(simd) {}
-
-    __m512 simdInternal_;
-};
-
-class Simd4FBool
-{
-public:
-    Simd4FBool() {}
-
-    // Internal utility constructor to simplify return statements
-    Simd4FBool(__mmask16 simd) : simdInternal_(simd) {}
-
-    __mmask16 simdInternal_;
-};
-
-static inline Simd4Float gmx_simdcall load4(const float* m)
-{
-    assert(size_t(m) % 16 == 0);
-    return { _mm512_mask_extload_ps(
-            _mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE) };
-}
-
-static inline void gmx_simdcall store4(float* m, Simd4Float a)
-{
-    assert(size_t(m) % 16 == 0);
-    _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), a.simdInternal_);
-}
-
-static inline Simd4Float gmx_simdcall load4U(const float* m)
-{
-    return { _mm512_mask_loadunpackhi_ps(
-            _mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m),
-            _mm512_int2mask(0xF),
-            m + 16) };
-}
-
-static inline void gmx_simdcall store4U(float* m, Simd4Float a)
-{
-    _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), a.simdInternal_);
-    _mm512_mask_packstorehi_ps(m + 16, _mm512_int2mask(0xF), a.simdInternal_);
-}
-
-static inline Simd4Float gmx_simdcall simd4SetZeroF()
-{
-    return { _mm512_setzero_ps() };
-}
-
-static inline Simd4Float gmx_simdcall operator&(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(),
-                                                       _mm512_int2mask(0xF),
-                                                       _mm512_castps_si512(a.simdInternal_),
-                                                       _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Float gmx_simdcall andNot(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(),
-                                                          _mm512_int2mask(0xF),
-                                                          _mm512_castps_si512(a.simdInternal_),
-                                                          _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Float gmx_simdcall operator|(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(),
-                                                      _mm512_int2mask(0xF),
-                                                      _mm512_castps_si512(a.simdInternal_),
-                                                      _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Float gmx_simdcall operator^(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(),
-                                                       _mm512_int2mask(0xF),
-                                                       _mm512_castps_si512(a.simdInternal_),
-                                                       _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline Simd4Float gmx_simdcall operator+(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_mask_add_ps(
-            _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall operator-(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_mask_sub_ps(
-            _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall operator-(Simd4Float x)
-{
-    return { _mm512_mask_addn_ps(
-            _mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_, _mm512_setzero_ps()) };
-}
-
-static inline Simd4Float gmx_simdcall operator*(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_mask_mul_ps(
-            _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall fma(Simd4Float a, Simd4Float b, Simd4Float c)
-{
-    return { _mm512_mask_fmadd_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall fms(Simd4Float a, Simd4Float b, Simd4Float c)
-{
-    return { _mm512_mask_fmsub_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall fnma(Simd4Float a, Simd4Float b, Simd4Float c)
-{
-    return { _mm512_mask_fnmadd_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall fnms(Simd4Float a, Simd4Float b, Simd4Float c)
-{
-    return { _mm512_mask_fnmsub_ps(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall rsqrt(Simd4Float x)
-{
-    return { _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall abs(Simd4Float x)
-{
-    return { _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(),
-                                                          _mm512_int2mask(0xF),
-                                                          _mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)),
-                                                          _mm512_castps_si512(x.simdInternal_))) };
-}
-
-static inline Simd4Float gmx_simdcall max(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_mask_gmax_ps(
-            _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall min(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_mask_gmin_ps(
-            _mm512_undefined_ps(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall round(Simd4Float x)
-{
-    return { _mm512_mask_round_ps(
-            _mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline Simd4Float gmx_simdcall trunc(Simd4Float x)
-{
-    return { _mm512_mask_round_ps(
-            _mm512_undefined_ps(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
-}
-
-static inline float gmx_simdcall dotProduct(Simd4Float a, Simd4Float b)
-{
-    __m512 x = _mm512_mask_mul_ps(
-            _mm512_setzero_ps(), _mm512_int2mask(0x7), a.simdInternal_, b.simdInternal_);
-    x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_BADC));
-    x = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
-    float f;
-    _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), x);
-    return f;
-}
-
-static inline void gmx_simdcall transpose(Simd4Float* v0, Simd4Float* v1, Simd4Float* v2, Simd4Float* v3)
-{
-    v0->simdInternal_ = _mm512_mask_permute4f128_ps(
-            v0->simdInternal_, _mm512_int2mask(0x00F0), v1->simdInternal_, _MM_PERM_AAAA);
-    v2->simdInternal_ = _mm512_mask_permute4f128_ps(
-            v2->simdInternal_, _mm512_int2mask(0x00F0), v3->simdInternal_, _MM_PERM_AAAA);
-    v0->simdInternal_ = _mm512_mask_permute4f128_ps(
-            v0->simdInternal_, _mm512_int2mask(0xFF00), v2->simdInternal_, _MM_PERM_BABA);
-    v0->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(
-            _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0),
-            _mm512_castps_si512(v0->simdInternal_)));
-    v1->simdInternal_ = _mm512_mask_permute4f128_ps(
-            _mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_BBBB);
-    v2->simdInternal_ = _mm512_mask_permute4f128_ps(
-            _mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_CCCC);
-    v3->simdInternal_ = _mm512_mask_permute4f128_ps(
-            _mm512_setzero_ps(), _mm512_int2mask(0x000F), v0->simdInternal_, _MM_PERM_DDDD);
-}
-
-// Picky, picky, picky:
-// icc-16 complains about "Illegal value of immediate argument to intrinsic"
-// unless we use
-// 1) Ordered-quiet for ==
-// 2) Unordered-quiet for !=
-// 3) Ordered-signaling for < and <=
-
-static inline Simd4FBool gmx_simdcall operator==(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ) };
-}
-
-static inline Simd4FBool gmx_simdcall operator!=(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ) };
-}
-
-static inline Simd4FBool gmx_simdcall operator<(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LT_OS) };
-}
-
-static inline Simd4FBool gmx_simdcall operator<=(Simd4Float a, Simd4Float b)
-{
-    return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LE_OS) };
-}
-
-static inline Simd4FBool gmx_simdcall operator&&(Simd4FBool a, Simd4FBool b)
-{
-    return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline Simd4FBool gmx_simdcall operator||(Simd4FBool a, Simd4FBool b)
-{
-    return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline bool gmx_simdcall anyTrue(Simd4FBool a)
-{
-    return (_mm512_mask2int(a.simdInternal_) & 0xF) != 0;
-}
-
-static inline Simd4Float gmx_simdcall selectByMask(Simd4Float a, Simd4FBool m)
-{
-    return { _mm512_mask_mov_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall selectByNotMask(Simd4Float a, Simd4FBool m)
-{
-    return { _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(m.simdInternal_), a.simdInternal_) };
-}
-
-static inline Simd4Float gmx_simdcall blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
-{
-    return { _mm512_mask_blend_ps(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline float gmx_simdcall reduce(Simd4Float a)
-{
-    __m512 x = a.simdInternal_;
-    x        = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_BADC));
-    x        = _mm512_add_ps(x, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
-    float f;
-    _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), x);
-    return f;
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_double.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_double.h

deleted file mode 100644 (file)

index 09631b2..0000000
--- a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_double.h
+++ /dev/null
@@ -1,560 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2016,2017,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
-#define GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
-
-#include "config.h"
-
-#include <cassert>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "gromacs/math/utilities.h"
-#include "gromacs/utility/basedefinitions.h"
-
-#include "impl_x86_mic_simd_float.h"
-
-namespace gmx
-{
-
-class SimdDouble
-{
-public:
-    SimdDouble() {}
-
-    SimdDouble(double d) : simdInternal_(_mm512_set1_pd(d)) {}
-
-    // Internal utility constructor to simplify return statements
-    SimdDouble(__m512d simd) : simdInternal_(simd) {}
-
-    __m512d simdInternal_;
-};
-
-class SimdDInt32
-{
-public:
-    SimdDInt32() {}
-
-    SimdDInt32(std::int32_t i) : simdInternal_(_mm512_set1_epi32(i)) {}
-
-    // Internal utility constructor to simplify return statements
-    SimdDInt32(__m512i simd) : simdInternal_(simd) {}
-
-    __m512i simdInternal_;
-};
-
-class SimdDBool
-{
-public:
-    SimdDBool() {}
-
-    // Internal utility constructor to simplify return statements
-    SimdDBool(__mmask8 simd) : simdInternal_(simd) {}
-
-    __mmask8 simdInternal_;
-};
-
-class SimdDIBool
-{
-public:
-    SimdDIBool() {}
-
-    // Internal utility constructor to simplify return statements
-    SimdDIBool(__mmask16 simd) : simdInternal_(simd) {}
-
-    __mmask16 simdInternal_;
-};
-
-static inline SimdDouble gmx_simdcall simdLoad(const double* m, SimdDoubleTag = {})
-{
-    assert(std::size_t(m) % 64 == 0);
-    return { _mm512_load_pd(m) };
-}
-
-static inline void gmx_simdcall store(double* m, SimdDouble a)
-{
-    assert(std::size_t(m) % 64 == 0);
-    _mm512_store_pd(m, a.simdInternal_);
-}
-
-static inline SimdDouble gmx_simdcall simdLoadU(const double* m, SimdDoubleTag = {})
-{
-    return { _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m + 8) };
-}
-
-static inline void gmx_simdcall storeU(double* m, SimdDouble a)
-{
-    _mm512_packstorelo_pd(m, a.simdInternal_);
-    _mm512_packstorehi_pd(m + 8, a.simdInternal_);
-}
-
-static inline SimdDouble gmx_simdcall setZeroD()
-{
-    return { _mm512_setzero_pd() };
-}
-
-static inline SimdDInt32 gmx_simdcall simdLoad(const std::int32_t* m, SimdDInt32Tag)
-{
-    assert(std::size_t(m) % 32 == 0);
-    return { _mm512_extload_epi64(m, _MM_UPCONV_EPI64_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE) };
-}
-
-static inline void gmx_simdcall store(std::int32_t* m, SimdDInt32 a)
-{
-    assert(std::size_t(m) % 32 == 0);
-    _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
-}
-
-static inline SimdDInt32 gmx_simdcall simdLoadU(const std::int32_t* m, SimdDInt32Tag)
-{
-    return { _mm512_mask_loadunpackhi_epi32(
-            _mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), m),
-            _mm512_int2mask(0x00FF),
-            m + 16) };
-}
-
-static inline void gmx_simdcall storeU(std::int32_t* m, SimdDInt32 a)
-{
-    _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
-    _mm512_mask_packstorehi_epi32(m + 16, _mm512_int2mask(0x00FF), a.simdInternal_);
-}
-
-static inline SimdDInt32 gmx_simdcall setZeroDI()
-{
-    return { _mm512_setzero_epi32() };
-}
-
-template<int index>
-static inline std::int32_t gmx_simdcall extract(SimdDInt32 a)
-{
-    int r;
-    _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1 << index), a.simdInternal_);
-    return r;
-}
-
-static inline SimdDouble gmx_simdcall operator&(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_castsi512_pd(_mm512_and_epi32(_mm512_castpd_si512(a.simdInternal_),
-                                                  _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall andNot(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(a.simdInternal_),
-                                                     _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall operator|(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_castsi512_pd(_mm512_or_epi32(_mm512_castpd_si512(a.simdInternal_),
-                                                 _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall operator^(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_castsi512_pd(_mm512_xor_epi32(_mm512_castpd_si512(a.simdInternal_),
-                                                  _mm512_castpd_si512(b.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall operator+(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_add_pd(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall operator-(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_sub_pd(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall operator-(SimdDouble x)
-{
-    return { _mm512_addn_pd(x.simdInternal_, _mm512_setzero_pd()) };
-}
-
-static inline SimdDouble gmx_simdcall operator*(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_mul_pd(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall fma(SimdDouble a, SimdDouble b, SimdDouble c)
-{
-    return { _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall fms(SimdDouble a, SimdDouble b, SimdDouble c)
-{
-    return { _mm512_fmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall fnma(SimdDouble a, SimdDouble b, SimdDouble c)
-{
-    return { _mm512_fnmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall fnms(SimdDouble a, SimdDouble b, SimdDouble c)
-{
-    return { _mm512_fnmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall rsqrt(SimdDouble x)
-{
-    return { _mm512_cvtpslo_pd(_mm512_rsqrt23_ps(_mm512_cvtpd_pslo(x.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall rcp(SimdDouble x)
-{
-    return { _mm512_cvtpslo_pd(_mm512_rcp23_ps(_mm512_cvtpd_pslo(x.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
-{
-    return { _mm512_mask_add_pd(a.simdInternal_, m.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
-{
-    return { _mm512_mask_mul_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
-{
-    return { _mm512_mask_mov_pd(_mm512_setzero_pd(),
-                                m.simdInternal_,
-                                _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)) };
-}
-
-static inline SimdDouble gmx_simdcall maskzRsqrt(SimdDouble x, SimdDBool m)
-{
-    return { _mm512_cvtpslo_pd(_mm512_mask_rsqrt23_ps(
-            _mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall maskzRcp(SimdDouble x, SimdDBool m)
-{
-    return { _mm512_cvtpslo_pd(_mm512_mask_rcp23_ps(
-            _mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall abs(SimdDouble x)
-{
-    return { _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(_mm512_set1_pd(GMX_DOUBLE_NEGZERO)),
-                                                     _mm512_castpd_si512(x.simdInternal_))) };
-}
-
-static inline SimdDouble gmx_simdcall max(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_gmax_pd(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall min(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_gmin_pd(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall round(SimdDouble x)
-{
-    return { _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline SimdDouble gmx_simdcall trunc(SimdDouble x)
-{
-    return { _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
-}
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdDouble frexp(SimdDouble value, SimdDInt32* exponent)
-{
-    __m512d rExponent;
-    __m512i iExponent;
-    __m512d result;
-
-    if (opt == MathOptimization::Safe)
-    {
-        // For the safe branch, we use the masked operations to only assign results if the
-        // input value was nonzero, and otherwise set exponent to 0, and the fraction to the input (+-0).
-        __mmask8 valueIsNonZero =
-                _mm512_cmp_pd_mask(_mm512_setzero_pd(), value.simdInternal_, _CMP_NEQ_OQ);
-        rExponent = _mm512_mask_getexp_pd(_mm512_setzero_pd(), valueIsNonZero, value.simdInternal_);
-
-        // Create an integer -1 value, and use masking in the conversion as the result for
-        // zero-value input. When we later add 1 to all fields, the fields that were formerly -1
-        // (corresponding to zero exponent) will be assigned -1 + 1 = 0.
-        iExponent = _mm512_mask_cvtfxpnt_roundpd_epi32lo(
-                _mm512_set_epi32(-1), valueIsNonZero, rExponent, _MM_FROUND_TO_NEAREST_INT);
-        iExponent = _mm512__add_epi32(iExponent, _mm512_set1_epi32(1));
-
-        // Set result to value (+-0) when it is zero.
-        result = _mm512_mask_getmant_pd(
-                value.simdInternal_, valueIsNonZero, value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
-    }
-    else
-    {
-        rExponent = _mm512_getexp_pd(value.simdInternal_);
-        iExponent = _mm512_cvtfxpnt_roundpd_epi32lo(rExponent, _MM_FROUND_TO_NEAREST_INT);
-        iExponent = _mm512_add_epi32(iExponent, _mm512_set1_epi32(1));
-        result    = _mm512_getmant_pd(value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
-    }
-
-    exponent->simdInternal_ = iExponent;
-
-    return { result };
-}
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdDouble ldexp(SimdDouble value, SimdDInt32 exponent)
-{
-    const __m512i exponentBias = _mm512_set1_epi32(1023);
-    __m512i       iExponent    = _mm512_add_epi32(exponent.simdInternal_, exponentBias);
-
-    if (opt == MathOptimization::Safe)
-    {
-        // Make sure biased argument is not negative
-        iExponent = _mm512_max_epi32(iExponent, _mm512_setzero_epi32());
-    }
-
-    iExponent = _mm512_permutevar_epi32(
-            _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), iExponent);
-    iExponent = _mm512_mask_slli_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0xAAAA), iExponent, 20);
-    return _mm512_mul_pd(_mm512_castsi512_pd(iExponent), value.simdInternal_);
-}
-
-static inline double gmx_simdcall reduce(SimdDouble a)
-{
-    return _mm512_reduce_add_pd(a.simdInternal_);
-}
-
-// Picky, picky, picky:
-// icc-16 complains about "Illegal value of immediate argument to intrinsic"
-// unless we use
-// 1) Ordered-quiet for ==
-// 2) Unordered-quiet for !=
-// 3) Ordered-signaling for < and <=
-
-static inline SimdDBool gmx_simdcall operator==(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ) };
-}
-
-static inline SimdDBool gmx_simdcall operator!=(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ) };
-}
-
-static inline SimdDBool gmx_simdcall operator<(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LT_OS) };
-}
-
-static inline SimdDBool gmx_simdcall operator<=(SimdDouble a, SimdDouble b)
-{
-    return { _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LE_OS) };
-}
-
-static inline SimdDBool gmx_simdcall testBits(SimdDouble a)
-{
-    // This is a bit problematic since Knight's corner does not have any 64-bit integer comparisons,
-    // and we cannot use floating-point since values with just a single bit set can evaluate to 0.0.
-    // Instead, we do it as
-    // 1) Do a logical or of the high/low 32 bits
-    // 2) Do a permute so we have the low 32 bits of each value in the low 8 32-bit elements
-    // 3) Do an integer comparison, and cast so we just keep the low 8 bits of the mask.
-    //
-    // By default we will use integers for the masks in the nonbonded kernels, so this shouldn't
-    // have any significant performance drawbacks.
-
-    __m512i ia = _mm512_castpd_si512(a.simdInternal_);
-
-    ia = _mm512_or_epi32(ia, _mm512_swizzle_epi32(ia, _MM_SWIZ_REG_CDAB));
-    ia = _mm512_permutevar_epi32(
-            _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), ia);
-
-    return { static_cast<__mmask8>(_mm512_cmp_epi32_mask(ia, _mm512_setzero_si512(), _MM_CMPINT_NE)) };
-}
-
-static inline SimdDBool gmx_simdcall operator&&(SimdDBool a, SimdDBool b)
-{
-    return { static_cast<__mmask8>(_mm512_kand(a.simdInternal_, b.simdInternal_)) };
-}
-
-static inline SimdDBool gmx_simdcall operator||(SimdDBool a, SimdDBool b)
-{
-    return { static_cast<__mmask8>(_mm512_kor(a.simdInternal_, b.simdInternal_)) };
-}
-
-static inline bool gmx_simdcall anyTrue(SimdDBool a)
-{
-    return _mm512_mask2int(a.simdInternal_) != 0;
-}
-
-static inline SimdDouble gmx_simdcall selectByMask(SimdDouble a, SimdDBool m)
-{
-    return { _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline SimdDouble gmx_simdcall selectByNotMask(SimdDouble a, SimdDBool m)
-{
-    return { _mm512_mask_mov_pd(a.simdInternal_, m.simdInternal_, _mm512_setzero_pd()) };
-}
-
-static inline SimdDouble gmx_simdcall blend(SimdDouble a, SimdDouble b, SimdDBool sel)
-{
-    return { _mm512_mask_blend_pd(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator&(SimdDInt32 a, SimdDInt32 b)
-{
-    return { _mm512_and_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall andNot(SimdDInt32 a, SimdDInt32 b)
-{
-    return { _mm512_andnot_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator|(SimdDInt32 a, SimdDInt32 b)
-{
-    return { _mm512_or_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator^(SimdDInt32 a, SimdDInt32 b)
-{
-    return { _mm512_xor_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator+(SimdDInt32 a, SimdDInt32 b)
-{
-    return { _mm512_add_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator-(SimdDInt32 a, SimdDInt32 b)
-{
-    return { _mm512_sub_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall operator*(SimdDInt32 a, SimdDInt32 b)
-{
-    return { _mm512_mullo_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDIBool gmx_simdcall operator==(SimdDInt32 a, SimdDInt32 b)
-{
-    return { _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_EQ) };
-}
-
-static inline SimdDIBool gmx_simdcall testBits(SimdDInt32 a)
-{
-    return { _mm512_cmp_epi32_mask(a.simdInternal_, _mm512_setzero_si512(), _MM_CMPINT_NE) };
-}
-
-static inline SimdDIBool gmx_simdcall operator<(SimdDInt32 a, SimdDInt32 b)
-{
-    return { _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_LT) };
-}
-
-static inline SimdDIBool gmx_simdcall operator&&(SimdDIBool a, SimdDIBool b)
-{
-    return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDIBool gmx_simdcall operator||(SimdDIBool a, SimdDIBool b)
-{
-    return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline bool gmx_simdcall anyTrue(SimdDIBool a)
-{
-    return (_mm512_mask2int(a.simdInternal_) & 0xFF) != 0;
-}
-
-static inline SimdDInt32 gmx_simdcall selectByMask(SimdDInt32 a, SimdDIBool m)
-{
-    return { _mm512_mask_mov_epi32(_mm512_setzero_epi32(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall selectByNotMask(SimdDInt32 a, SimdDIBool m)
-{
-    return { _mm512_mask_mov_epi32(a.simdInternal_, m.simdInternal_, _mm512_setzero_epi32()) };
-}
-
-static inline SimdDInt32 gmx_simdcall blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
-{
-    return { _mm512_mask_blend_epi32(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdDInt32 gmx_simdcall cvtR2I(SimdDouble a)
-{
-    return { _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_NEAREST_INT) };
-}
-
-static inline SimdDInt32 gmx_simdcall cvttR2I(SimdDouble a)
-{
-    return { _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_ZERO) };
-}
-
-static inline SimdDouble gmx_simdcall cvtI2R(SimdDInt32 a)
-{
-    return { _mm512_cvtepi32lo_pd(a.simdInternal_) };
-}
-
-static inline SimdDIBool gmx_simdcall cvtB2IB(SimdDBool a)
-{
-    return { a.simdInternal_ };
-}
-
-static inline SimdDBool gmx_simdcall cvtIB2B(SimdDIBool a)
-{
-    return { static_cast<__mmask8>(a.simdInternal_) };
-}
-
-static inline void gmx_simdcall cvtF2DD(SimdFloat f, SimdDouble* d0, SimdDouble* d1)
-{
-    __m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f.simdInternal_), _MM_PERM_DCDC);
-
-    *d0 = _mm512_cvtpslo_pd(f.simdInternal_);
-    *d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
-}
-
-static inline SimdFloat gmx_simdcall cvtDD2F(SimdDouble d0, SimdDouble d1)
-{
-    __m512 f0 = _mm512_cvtpd_pslo(d0.simdInternal_);
-    __m512 f1 = _mm512_cvtpd_pslo(d1.simdInternal_);
-    return { _mm512_mask_permute4f128_ps(f0, _mm512_int2mask(0xFF00), f1, _MM_PERM_BABA) };
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_float.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_float.h

deleted file mode 100644 (file)

index b0603e0..0000000
--- a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_float.h
+++ /dev/null
@@ -1,573 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2016,2017,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
-#define GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
-
-#include "config.h"
-
-#include <cassert>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "gromacs/math/utilities.h"
-
-namespace gmx
-{
-
-class SimdFloat
-{
-public:
-    SimdFloat() {}
-
-    SimdFloat(float f) : simdInternal_(_mm512_set1_ps(f)) {}
-
-    // Internal utility constructor to simplify return statements
-    SimdFloat(__m512 simd) : simdInternal_(simd) {}
-
-    __m512 simdInternal_;
-};
-
-class SimdFInt32
-{
-public:
-    SimdFInt32() {}
-
-    SimdFInt32(std::int32_t i) : simdInternal_(_mm512_set1_epi32(i)) {}
-
-    // Internal utility constructor to simplify return statements
-    SimdFInt32(__m512i simd) : simdInternal_(simd) {}
-
-    __m512i simdInternal_;
-};
-
-class SimdFBool
-{
-public:
-    SimdFBool() {}
-
-    SimdFBool(bool b) : simdInternal_(_mm512_int2mask(b ? 0xFFFF : 0)) {}
-
-    // Internal utility constructor to simplify return statements
-    SimdFBool(__mmask16 simd) : simdInternal_(simd) {}
-
-    __mmask16 simdInternal_;
-};
-
-class SimdFIBool
-{
-public:
-    SimdFIBool() {}
-
-    SimdFIBool(bool b) : simdInternal_(_mm512_int2mask(b ? 0xFFFF : 0)) {}
-
-    // Internal utility constructor to simplify return statements
-    SimdFIBool(__mmask16 simd) : simdInternal_(simd) {}
-
-    __mmask16 simdInternal_;
-};
-
-static inline SimdFloat gmx_simdcall simdLoad(const float* m, SimdFloatTag = {})
-{
-    assert(std::size_t(m) % 64 == 0);
-    return { _mm512_load_ps(m) };
-}
-
-static inline void gmx_simdcall store(float* m, SimdFloat a)
-{
-    assert(std::size_t(m) % 64 == 0);
-    _mm512_store_ps(m, a.simdInternal_);
-}
-
-static inline SimdFloat gmx_simdcall simdLoadU(const float* m, SimdFloatTag = {})
-{
-    return { _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m + 16) };
-}
-
-static inline void gmx_simdcall storeU(float* m, SimdFloat a)
-{
-    _mm512_packstorelo_ps(m, a.simdInternal_);
-    _mm512_packstorehi_ps(m + 16, a.simdInternal_);
-}
-
-static inline SimdFloat gmx_simdcall setZeroF()
-{
-    return { _mm512_setzero_ps() };
-}
-
-static inline SimdFInt32 gmx_simdcall simdLoad(const std::int32_t* m, SimdFInt32Tag)
-{
-    assert(std::size_t(m) % 64 == 0);
-    return { _mm512_load_epi32(m) };
-}
-
-static inline void gmx_simdcall store(std::int32_t* m, SimdFInt32 a)
-{
-    assert(std::size_t(m) % 64 == 0);
-    _mm512_store_epi32(m, a.simdInternal_);
-}
-
-static inline SimdFInt32 gmx_simdcall simdLoadU(const std::int32_t* m, SimdFInt32Tag)
-{
-    return { _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m + 16) };
-}
-
-static inline void gmx_simdcall storeU(std::int32_t* m, SimdFInt32 a)
-{
-    _mm512_packstorelo_epi32(m, a.simdInternal_);
-    _mm512_packstorehi_epi32(m + 16, a.simdInternal_);
-}
-
-static inline SimdFInt32 gmx_simdcall setZeroFI()
-{
-    return { _mm512_setzero_si512() };
-}
-
-
-template<int index>
-static inline std::int32_t gmx_simdcall extract(SimdFInt32 a)
-{
-    int r;
-    _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1 << index), a.simdInternal_);
-    return r;
-}
-
-static inline SimdFloat gmx_simdcall operator&(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a.simdInternal_),
-                                                  _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline SimdFloat gmx_simdcall andNot(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a.simdInternal_),
-                                                     _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline SimdFloat gmx_simdcall operator|(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a.simdInternal_),
-                                                 _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline SimdFloat gmx_simdcall operator^(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a.simdInternal_),
-                                                  _mm512_castps_si512(b.simdInternal_))) };
-}
-
-static inline SimdFloat gmx_simdcall operator+(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_add_ps(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall operator-(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_sub_ps(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall operator-(SimdFloat x)
-{
-    return { _mm512_addn_ps(x.simdInternal_, _mm512_setzero_ps()) };
-}
-
-static inline SimdFloat gmx_simdcall operator*(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_mul_ps(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall fma(SimdFloat a, SimdFloat b, SimdFloat c)
-{
-    return { _mm512_fmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall fms(SimdFloat a, SimdFloat b, SimdFloat c)
-{
-    return { _mm512_fmsub_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall fnma(SimdFloat a, SimdFloat b, SimdFloat c)
-{
-    return { _mm512_fnmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall fnms(SimdFloat a, SimdFloat b, SimdFloat c)
-{
-    return { _mm512_fnmsub_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall rsqrt(SimdFloat x)
-{
-    return { _mm512_rsqrt23_ps(x.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall rcp(SimdFloat x)
-{
-    return { _mm512_rcp23_ps(x.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall maskAdd(SimdFloat a, SimdFloat b, SimdFBool m)
-{
-    return { _mm512_mask_add_ps(a.simdInternal_, m.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall maskzMul(SimdFloat a, SimdFloat b, SimdFBool m)
-{
-    return { _mm512_mask_mul_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall maskzFma(SimdFloat a, SimdFloat b, SimdFloat c, SimdFBool m)
-{
-    return { _mm512_mask_mov_ps(_mm512_setzero_ps(),
-                                m.simdInternal_,
-                                _mm512_fmadd_ps(a.simdInternal_, b.simdInternal_, c.simdInternal_)) };
-}
-
-static inline SimdFloat gmx_simdcall maskzRsqrt(SimdFloat x, SimdFBool m)
-{
-    return { _mm512_mask_rsqrt23_ps(_mm512_setzero_ps(), m.simdInternal_, x.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall maskzRcp(SimdFloat x, SimdFBool m)
-{
-    return { _mm512_mask_rcp23_ps(_mm512_setzero_ps(), m.simdInternal_, x.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall abs(SimdFloat x)
-{
-    return { _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO)),
-                                                     _mm512_castps_si512(x.simdInternal_))) };
-}
-
-static inline SimdFloat gmx_simdcall max(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_gmax_ps(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall min(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_gmin_ps(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall round(SimdFloat x)
-{
-    return { _mm512_round_ps(x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline SimdFloat gmx_simdcall trunc(SimdFloat x)
-{
-    return { _mm512_round_ps(x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
-}
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdFloat gmx_simdcall frexp(SimdFloat value, SimdFInt32* exponent)
-{
-    __m512  rExponent;
-    __m512i iExponent;
-    __m512  result;
-
-    if (opt == MathOptimization::Safe)
-    {
-        // For the safe branch, we use the masked operations to only assign results if the
-        // input value was nonzero, and otherwise set exponent to 0, and the fraction to the input (+-0).
-        __mmask16 valueIsNonZero =
-                _mm512_cmp_ps_mask(_mm512_setzero_ps(), value.simdInternal_, _CMP_NEQ_OQ);
-        rExponent = _mm512_mask_getexp_ps(_mm512_setzero_ps(), valueIsNonZero, value.simdInternal_);
-        iExponent = _mm512_cvtfxpnt_round_adjustps_epi32(
-                rExponent, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
-        iExponent = _mm512_mask_add_epi32(iExponent, valueIsNonZero, iExponent, _mm512_set1_epi32(1));
-
-        // Set result to input value when the latter is +-0
-        result = _mm512_mask_getmant_ps(
-                value.simdInternal_, valueIsNonZero, value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
-    }
-    else
-    {
-        rExponent = _mm512_getexp_ps(value.simdInternal_);
-        iExponent = _mm512_cvtfxpnt_round_adjustps_epi32(
-                rExponent, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
-        iExponent = _mm512_add_epi32(iExponent, _mm512_set1_epi32(1));
-        result    = _mm512_getmant_ps(value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
-    }
-
-    exponent->simdInternal_ = iExponent;
-
-    return { result };
-}
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdFloat gmx_simdcall ldexp(SimdFloat value, SimdFInt32 exponent)
-{
-    const __m512i exponentBias = _mm512_set1_epi32(127);
-    __m512i       iExponent    = _mm512_add_epi32(exponent.simdInternal_, exponentBias);
-
-    if (opt == MathOptimization::Safe)
-    {
-        // Make sure biased argument is not negative
-        iExponent = _mm512_max_epi32(iExponent, _mm512_setzero_epi32());
-    }
-
-    iExponent = _mm512_slli_epi32(iExponent, 23);
-
-    return { _mm512_mul_ps(value.simdInternal_, _mm512_castsi512_ps(iExponent)) };
-}
-
-static inline float gmx_simdcall reduce(SimdFloat a)
-{
-    return _mm512_reduce_add_ps(a.simdInternal_);
-}
-
-// Picky, picky, picky:
-// icc-16 complains about "Illegal value of immediate argument to intrinsic"
-// unless we use
-// 1) Ordered-quiet for ==
-// 2) Unordered-quiet for !=
-// 3) Ordered-signaling for < and <=
-
-static inline SimdFBool gmx_simdcall operator==(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ) };
-}
-
-static inline SimdFBool gmx_simdcall operator!=(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ) };
-}
-
-static inline SimdFBool gmx_simdcall operator<(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_LT_OS) };
-}
-
-static inline SimdFBool gmx_simdcall operator<=(SimdFloat a, SimdFloat b)
-{
-    return { _mm512_cmp_ps_mask(a.simdInternal_, b.simdInternal_, _CMP_LE_OS) };
-}
-
-static inline SimdFBool gmx_simdcall testBits(SimdFloat a)
-{
-    return { _mm512_test_epi32_mask(_mm512_castps_si512(a.simdInternal_),
-                                    _mm512_castps_si512(a.simdInternal_)) };
-}
-
-static inline SimdFBool gmx_simdcall operator&&(SimdFBool a, SimdFBool b)
-{
-    return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFBool gmx_simdcall operator||(SimdFBool a, SimdFBool b)
-{
-    return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline bool gmx_simdcall anyTrue(SimdFBool a)
-{
-    return _mm512_mask2int(a.simdInternal_) != 0;
-}
-
-static inline SimdFloat gmx_simdcall selectByMask(SimdFloat a, SimdFBool m)
-{
-    return { _mm512_mask_mov_ps(_mm512_setzero_ps(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline SimdFloat gmx_simdcall selectByNotMask(SimdFloat a, SimdFBool m)
-{
-    return { _mm512_mask_mov_ps(a.simdInternal_, m.simdInternal_, _mm512_setzero_ps()) };
-}
-
-static inline SimdFloat gmx_simdcall blend(SimdFloat a, SimdFloat b, SimdFBool sel)
-{
-    return { _mm512_mask_blend_ps(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator&(SimdFInt32 a, SimdFInt32 b)
-{
-    return { _mm512_and_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall andNot(SimdFInt32 a, SimdFInt32 b)
-{
-    return { _mm512_andnot_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator|(SimdFInt32 a, SimdFInt32 b)
-{
-    return { _mm512_or_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator^(SimdFInt32 a, SimdFInt32 b)
-{
-    return { _mm512_xor_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator+(SimdFInt32 a, SimdFInt32 b)
-{
-    return { _mm512_add_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator-(SimdFInt32 a, SimdFInt32 b)
-{
-    return { _mm512_sub_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall operator*(SimdFInt32 a, SimdFInt32 b)
-{
-    return { _mm512_mullo_epi32(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFIBool gmx_simdcall operator==(SimdFInt32 a, SimdFInt32 b)
-{
-    return { _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_EQ) };
-}
-
-static inline SimdFIBool gmx_simdcall testBits(SimdFInt32 a)
-{
-    return { _mm512_test_epi32_mask(a.simdInternal_, a.simdInternal_) };
-}
-
-static inline SimdFIBool gmx_simdcall operator<(SimdFInt32 a, SimdFInt32 b)
-{
-    return { _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_LT) };
-}
-
-static inline SimdFIBool gmx_simdcall operator&&(SimdFIBool a, SimdFIBool b)
-{
-    return { _mm512_kand(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFIBool gmx_simdcall operator||(SimdFIBool a, SimdFIBool b)
-{
-    return { _mm512_kor(a.simdInternal_, b.simdInternal_) };
-}
-
-static inline bool gmx_simdcall anyTrue(SimdFIBool a)
-{
-    return _mm512_mask2int(a.simdInternal_) != 0;
-}
-
-static inline SimdFInt32 gmx_simdcall selectByMask(SimdFInt32 a, SimdFIBool m)
-{
-    return { _mm512_mask_mov_epi32(_mm512_setzero_epi32(), m.simdInternal_, a.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall selectByNotMask(SimdFInt32 a, SimdFIBool m)
-{
-    return { _mm512_mask_mov_epi32(a.simdInternal_, m.simdInternal_, _mm512_setzero_epi32()) };
-}
-
-static inline SimdFInt32 gmx_simdcall blend(SimdFInt32 a, SimdFInt32 b, SimdFIBool sel)
-{
-    return { _mm512_mask_blend_epi32(sel.simdInternal_, a.simdInternal_, b.simdInternal_) };
-}
-
-static inline SimdFInt32 gmx_simdcall cvtR2I(SimdFloat a)
-{
-    return { _mm512_cvtfxpnt_round_adjustps_epi32(
-            a.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline SimdFInt32 gmx_simdcall cvttR2I(SimdFloat a)
-{
-    return { _mm512_cvtfxpnt_round_adjustps_epi32(a.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE) };
-}
-
-static inline SimdFloat gmx_simdcall cvtI2R(SimdFInt32 a)
-{
-    return { _mm512_cvtfxpnt_round_adjustepi32_ps(
-            a.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE) };
-}
-
-static inline SimdFIBool gmx_simdcall cvtB2IB(SimdFBool a)
-{
-    return { a.simdInternal_ };
-}
-
-static inline SimdFBool gmx_simdcall cvtIB2B(SimdFIBool a)
-{
-    return { a.simdInternal_ };
-}
-
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdFloat gmx_simdcall exp2(SimdFloat x)
-{
-    return { _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(
-            x.simdInternal_, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24)) };
-}
-
-template<MathOptimization opt = MathOptimization::Safe>
-static inline SimdFloat gmx_simdcall exp(SimdFloat x)
-{
-    const __m512 argscale    = _mm512_set1_ps(1.44269504088896341F);
-    const __m512 invargscale = _mm512_set1_ps(-0.69314718055994528623F);
-
-    if (opt == MathOptimization::Safe)
-    {
-        // Set the limit to gurantee flush to zero
-        const SimdFloat smallArgLimit(-88.f);
-        // Since we multiply the argument by 1.44, for the safe version we need to make
-        // sure this doesn't result in overflow
-        x = max(x, smallArgLimit);
-    }
-
-    __m512 xscaled = _mm512_mul_ps(x.simdInternal_, argscale);
-    __m512 r       = _mm512_exp223_ps(
-            _mm512_cvtfxpnt_round_adjustps_epi32(xscaled, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
-
-    // exp2a23_ps provides 23 bits of accuracy, but we ruin some of that with our argument
-    // scaling. To correct this, we find the difference between the scaled argument and
-    // the true one (extended precision arithmetics does not appear to be necessary to
-    // fulfill our accuracy requirements) and then multiply by the exponent of this
-    // correction since exp(a+b)=exp(a)*exp(b).
-    // Note that this only adds two instructions (and maybe some constant loads).
-
-    // find the difference
-    x = _mm512_fmadd_ps(invargscale, xscaled, x.simdInternal_);
-    // x will now be a _very_ small number, so approximate exp(x)=1+x.
-    // We should thus apply the correction as r'=r*(1+x)=r+r*x
-    r = _mm512_fmadd_ps(r, x.simdInternal_, r);
-    return { r };
-}
-
-static inline SimdFloat gmx_simdcall log(SimdFloat x)
-{
-    return { _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764F),
-                           _mm512_log2ae23_ps(x.simdInternal_)) };
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_SIMD_FLOAT_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_double.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_double.h

deleted file mode 100644 (file)

index d08086c..0000000
--- a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_double.h
+++ /dev/null
@@ -1,459 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
-#define GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
-
-#include "config.h"
-
-#include <cassert>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "gromacs/utility/basedefinitions.h"
-
-#include "impl_x86_mic_simd_double.h"
-
-namespace gmx
-{
-
-namespace
-{
-/* This is an internal helper function used by decr3Hsimd(...).
- */
-inline void gmx_simdcall decrHsimd(double* m, SimdDouble a)
-{
-    __m512d t;
-
-    assert(std::size_t(m) % 32 == 0);
-
-    t               = _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
-    a.simdInternal_ = _mm512_add_pd(
-            a.simdInternal_,
-            _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(a.simdInternal_), _MM_PERM_BADC)));
-    t = _mm512_sub_pd(t, a.simdInternal_);
-    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0x0F), t);
-}
-} // namespace
-
-// On MIC it is better to use scatter operations, so we define the load routines
-// that use a SIMD offset variable first.
-
-template<int align>
-static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
-                                                             SimdDInt32    simdoffset,
-                                                             SimdDouble*   v0,
-                                                             SimdDouble*   v1,
-                                                             SimdDouble*   v2,
-                                                             SimdDouble*   v3)
-{
-    assert((size_t)base % 32 == 0);
-    assert(align % 4 == 0);
-
-    // All instructions might be latency ~4 on MIC, so we use shifts where we
-    // only need a single instruction (since the shift parameter is an immediate),
-    // but multiplication otherwise.
-    if (align == 4)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
-    }
-    else if (align == 8)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
-    }
-    else
-    {
-        simdoffset = simdoffset * SimdDInt32(align);
-    }
-
-    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
-    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 1, sizeof(double));
-    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 2, sizeof(double));
-    v3->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 3, sizeof(double));
-}
-
-template<int align>
-static inline void gmx_simdcall gatherLoadUBySimdIntTranspose(const double* base,
-                                                              SimdDInt32    simdoffset,
-                                                              SimdDouble*   v0,
-                                                              SimdDouble*   v1)
-{
-    // All instructions might be latency ~4 on MIC, so we use shifts where we
-    // only need a single instruction (since the shift parameter is an immediate),
-    // but multiplication otherwise.
-    if (align == 2)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 1);
-    }
-    else if (align == 4)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
-    }
-    else if (align == 8)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
-    }
-    else
-    {
-        simdoffset = simdoffset * SimdDInt32(align);
-    }
-
-    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
-    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 1, sizeof(double));
-}
-
-template<int align>
-static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
-                                                             SimdDInt32    simdoffset,
-                                                             SimdDouble*   v0,
-                                                             SimdDouble*   v1)
-{
-    assert(std::size_t(base) % 16 == 0);
-    assert(align % 2 == 0);
-    gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
-}
-
-
-template<int align>
-static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
-                                                    const std::int32_t offset[],
-                                                    SimdDouble*        v0,
-                                                    SimdDouble*        v1,
-                                                    SimdDouble*        v2,
-                                                    SimdDouble*        v3)
-{
-    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1, v2, v3);
-}
-
-template<int align>
-static inline void gmx_simdcall
-                   gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
-{
-    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1);
-}
-
-static const int c_simdBestPairAlignmentDouble = 2;
-
-template<int align>
-static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
-                                                     const std::int32_t offset[],
-                                                     SimdDouble*        v0,
-                                                     SimdDouble*        v1,
-                                                     SimdDouble*        v2)
-{
-    SimdDInt32 simdoffset;
-
-    assert(std::size_t(offset) % 32 == 0);
-
-    simdoffset = simdLoad(offset, SimdDInt32Tag());
-
-    // All instructions might be latency ~4 on MIC, so we use shifts where we
-    // only need a single instruction (since the shift parameter is an immediate),
-    // but multiplication otherwise.
-    if (align == 4)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
-    }
-    else if (align == 8)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
-    }
-    else
-    {
-        simdoffset = simdoffset * SimdDInt32(align);
-    }
-
-    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
-    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 1, sizeof(double));
-    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 2, sizeof(double));
-}
-
-template<int align>
-static inline void gmx_simdcall transposeScatterStoreU(double*            base,
-                                                       const std::int32_t offset[],
-                                                       SimdDouble         v0,
-                                                       SimdDouble         v1,
-                                                       SimdDouble         v2)
-{
-    SimdDInt32 simdoffset;
-
-    assert(std::size_t(offset) % 32 == 0);
-
-    simdoffset = simdLoad(offset, SimdDInt32Tag());
-
-    // All instructions might be latency ~4 on MIC, so we use shifts where we
-    // only need a single instruction (since the shift parameter is an immediate),
-    // but multiplication otherwise.
-    if (align == 4)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
-    }
-    else if (align == 8)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
-    }
-    else
-    {
-        simdoffset = simdoffset * SimdDInt32(align);
-    }
-
-    _mm512_i32loscatter_pd(base, simdoffset.simdInternal_, v0.simdInternal_, sizeof(double));
-    _mm512_i32loscatter_pd(base + 1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(double));
-    _mm512_i32loscatter_pd(base + 2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(double));
-}
-
-template<int align>
-static inline void gmx_simdcall
-                   transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
-{
-    alignas(GMX_SIMD_ALIGNMENT) double rdata0[GMX_SIMD_DOUBLE_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) double rdata1[GMX_SIMD_DOUBLE_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) double rdata2[GMX_SIMD_DOUBLE_WIDTH];
-
-    store(rdata0, v0);
-    store(rdata1, v1);
-    store(rdata2, v2);
-
-    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
-    {
-        base[align * offset[i] + 0] += rdata0[i];
-        base[align * offset[i] + 1] += rdata1[i];
-        base[align * offset[i] + 2] += rdata2[i];
-    }
-}
-
-template<int align>
-static inline void gmx_simdcall
-                   transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
-{
-    alignas(GMX_SIMD_ALIGNMENT) double rdata0[GMX_SIMD_DOUBLE_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) double rdata1[GMX_SIMD_DOUBLE_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) double rdata2[GMX_SIMD_DOUBLE_WIDTH];
-
-    store(rdata0, v0);
-    store(rdata1, v1);
-    store(rdata2, v2);
-
-    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
-    {
-        base[align * offset[i] + 0] -= rdata0[i];
-        base[align * offset[i] + 1] -= rdata1[i];
-        base[align * offset[i] + 2] -= rdata2[i];
-    }
-}
-
-static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
-                                                        SimdDouble* triplets0,
-                                                        SimdDouble* triplets1,
-                                                        SimdDouble* triplets2)
-{
-    triplets0->simdInternal_ = _mm512_castsi512_pd(
-            _mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0),
-                                    _mm512_castpd_si512(scalar.simdInternal_)));
-    triplets1->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(
-            _mm512_set_epi32(11, 10, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 5, 4),
-            _mm512_castpd_si512(scalar.simdInternal_)));
-    triplets2->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(
-            _mm512_set_epi32(15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 11, 10, 11, 10),
-            _mm512_castpd_si512(scalar.simdInternal_)));
-}
-
-
-static inline double gmx_simdcall
-                     reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
-{
-    double  d;
-    __m512d t0, t1, t2, t3;
-
-    assert(std::size_t(m) % 32 == 0);
-
-    t0 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v0.simdInternal_, v2.simdInternal_),
-                           _MM_SWIZ_REG_BADC);
-    t2 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v2.simdInternal_, v0.simdInternal_);
-    t1 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v1.simdInternal_, v3.simdInternal_),
-                           _MM_SWIZ_REG_BADC);
-    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v3.simdInternal_, v1.simdInternal_);
-    t0 = _mm512_add_pd(t0, t2);
-    t1 = _mm512_add_pd(t1, t3);
-
-    t2 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0b01010101), t0, t1), _MM_SWIZ_REG_CDAB);
-    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0b01010101), t1, t0);
-    t2 = _mm512_add_pd(t2, t3);
-
-    t2 = _mm512_add_pd(t2, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(t2), _MM_PERM_BADC)));
-
-    t0 = _mm512_mask_extload_pd(
-            _mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
-    t0 = _mm512_add_pd(t0, t2);
-    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t0);
-
-    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_BADC));
-    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_CDAB));
-
-    _mm512_mask_packstorelo_pd(&d, _mm512_mask2int(0x01), t2);
-    return d;
-}
-
-static inline SimdDouble gmx_simdcall loadDualHsimd(const double* m0, const double* m1)
-{
-    assert(std::size_t(m0) % 32 == 0);
-    assert(std::size_t(m1) % 32 == 0);
-
-    return _mm512_mask_extload_pd(_mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE),
-                                  _mm512_int2mask(0xF0),
-                                  m1,
-                                  _MM_UPCONV_PD_NONE,
-                                  _MM_BROADCAST_4X8,
-                                  _MM_HINT_NONE);
-}
-
-static inline SimdDouble gmx_simdcall loadDuplicateHsimd(const double* m)
-{
-    assert(std::size_t(m) % 32 == 0);
-
-    return _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
-}
-
-static inline SimdDouble gmx_simdcall loadU1DualHsimd(const double* m)
-{
-    return _mm512_mask_extload_pd(_mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE),
-                                  _mm512_int2mask(0xF0),
-                                  m + 1,
-                                  _MM_UPCONV_PD_NONE,
-                                  _MM_BROADCAST_1X8,
-                                  _MM_HINT_NONE);
-}
-
-
-static inline void gmx_simdcall storeDualHsimd(double* m0, double* m1, SimdDouble a)
-{
-    assert(std::size_t(m0) % 32 == 0);
-    assert(std::size_t(m1) % 32 == 0);
-
-    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), a.simdInternal_);
-    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), a.simdInternal_);
-}
-
-static inline void gmx_simdcall incrDualHsimd(double* m0, double* m1, SimdDouble a)
-{
-    assert(std::size_t(m0) % 32 == 0);
-    assert(std::size_t(m1) % 32 == 0);
-
-    __m512d x;
-
-    // Update lower half
-    x = _mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
-    x = _mm512_add_pd(x, a.simdInternal_);
-    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), x);
-
-    // Update upper half
-    x = _mm512_extload_pd(m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
-    x = _mm512_add_pd(x, a.simdInternal_);
-    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), x);
-}
-
-static inline void gmx_simdcall decr3Hsimd(double* m, SimdDouble a0, SimdDouble a1, SimdDouble a2)
-{
-    assert(std::size_t(m) % 32 == 0);
-    decrHsimd(m, a0);
-    decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH / 2, a1);
-    decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH, a2);
-}
-
-
-template<int align>
-static inline void gmx_simdcall gatherLoadTransposeHsimd(const double*      base0,
-                                                         const double*      base1,
-                                                         const std::int32_t offset[],
-                                                         SimdDouble*        v0,
-                                                         SimdDouble*        v1)
-{
-    __m512i idx0, idx1, idx;
-    __m512d tmp1, tmp2;
-
-    assert(std::size_t(offset) % 16 == 0);
-    assert(std::size_t(base0) % 16 == 0);
-    assert(std::size_t(base1) % 16 == 0);
-    assert(std::size_t(align) % 2 == 0);
-
-    idx0 = _mm512_extload_epi32(offset, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
-
-    idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
-    idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));
-
-    idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0x00F0), idx1, _MM_PERM_AAAA);
-
-    tmp1 = _mm512_i32logather_pd(idx, base0, sizeof(double));
-    tmp2 = _mm512_i32logather_pd(idx, base1, sizeof(double));
-
-    v0->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(
-            _mm512_castpd_ps(tmp1), _mm512_int2mask(0xFF00), _mm512_castpd_ps(tmp2), _MM_PERM_BABA));
-    v1->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(
-            _mm512_castpd_ps(tmp2), _mm512_int2mask(0x00FF), _mm512_castpd_ps(tmp1), _MM_PERM_DCDC));
-}
-
-static inline double gmx_simdcall reduceIncr4ReturnSumHsimd(double* m, SimdDouble v0, SimdDouble v1)
-{
-    double  d;
-    __m512d t0, t1;
-
-    assert(std::size_t(m) % 32 == 0);
-
-    t0 = _mm512_add_pd(v0.simdInternal_, _mm512_swizzle_pd(v0.simdInternal_, _MM_SWIZ_REG_BADC));
-    t0 = _mm512_mask_add_pd(t0,
-                            _mm512_int2mask(0xCC),
-                            v1.simdInternal_,
-                            _mm512_swizzle_pd(v1.simdInternal_, _MM_SWIZ_REG_BADC));
-    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
-    t0 = _mm512_castps_pd(_mm512_mask_permute4f128_ps(
-            _mm512_castpd_ps(t0), _mm512_int2mask(0xCCCC), _mm512_castpd_ps(t0), _MM_PERM_DCDC));
-
-    t1 = _mm512_mask_extload_pd(
-            _mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
-    t1 = _mm512_add_pd(t1, t0);
-    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t1);
-
-    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_BADC));
-    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
-
-    _mm512_mask_packstorelo_pd(&d, _mm512_mask2int(0x03), t0);
-    return d;
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
diff --git a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_float.h b/src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_float.h

deleted file mode 100644 (file)

index bf5db5b..0000000
--- a/src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_float.h
+++ /dev/null
@@ -1,461 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
-#define GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
-
-#include "config.h"
-
-#include <cassert>
-#include <cstdint>
-
-#include <immintrin.h>
-
-#include "gromacs/utility/basedefinitions.h"
-
-#include "impl_x86_mic_simd_float.h"
-
-namespace gmx
-{
-
-namespace
-{
-/* This is an internal helper function used by decr3Hsimd(...).
- */
-inline void gmx_simdcall decrHsimd(float* m, SimdFloat a)
-{
-    __m512 t;
-
-    assert(std::size_t(m) % 32 == 0);
-
-    t = _mm512_castpd_ps(_mm512_extload_pd(
-            reinterpret_cast<const double*>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
-    a = _mm512_add_ps(a.simdInternal_, _mm512_permute4f128_ps(a.simdInternal_, _MM_PERM_BADC));
-    t = _mm512_sub_ps(t, a.simdInternal_);
-    _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0x00FF), t);
-}
-} // namespace
-
-// On MIC it is better to use scatter operations, so we define the load routines
-// that use a SIMD offset variable first.
-
-template<int align>
-static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
-                                                             SimdFInt32   simdoffset,
-                                                             SimdFloat*   v0,
-                                                             SimdFloat*   v1,
-                                                             SimdFloat*   v2,
-                                                             SimdFloat*   v3)
-{
-    assert(std::size_t(base) % 16 == 0);
-    assert(align % 4 == 0);
-
-    // All instructions might be latency ~4 on MIC, so we use shifts where we
-    // only need a single instruction (since the shift parameter is an immediate),
-    // but multiplication otherwise.
-    if (align == 4)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
-    }
-    else if (align == 8)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
-    }
-    else
-    {
-        simdoffset = simdoffset * SimdFInt32(align);
-    }
-
-    v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
-    v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
-    v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 2, sizeof(float));
-    v3->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 3, sizeof(float));
-}
-
-template<int align>
-static inline void gmx_simdcall
-                   gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 simdoffset, SimdFloat* v0, SimdFloat* v1)
-{
-    // All instructions might be latency ~4 on MIC, so we use shifts where we
-    // only need a single instruction (since the shift parameter is an immediate),
-    // but multiplication otherwise.
-    // For align == 2 we can merge the constant into the scale parameter,
-    // which can take constants up to 8 in total.
-    if (align == 2)
-    {
-        v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, align * sizeof(float));
-        v1->simdInternal_ =
-                _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, align * sizeof(float));
-    }
-    else
-    {
-        if (align == 4)
-        {
-            simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
-        }
-        else if (align == 8)
-        {
-            simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
-        }
-        else
-        {
-            simdoffset = simdoffset * SimdFInt32(align);
-        }
-        v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
-        v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
-    }
-}
-
-template<int align>
-static inline void gmx_simdcall
-                   gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 simdoffset, SimdFloat* v0, SimdFloat* v1)
-{
-    assert(std::size_t(base) % 8 == 0);
-    assert(align % 2 == 0);
-    gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
-}
-
-template<int align>
-static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
-                                                    const std::int32_t offset[],
-                                                    SimdFloat*         v0,
-                                                    SimdFloat*         v1,
-                                                    SimdFloat*         v2,
-                                                    SimdFloat*         v3)
-{
-    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1, v2, v3);
-}
-
-template<int align>
-static inline void gmx_simdcall
-                   gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
-{
-    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdFInt32Tag()), v0, v1);
-}
-
-static const int c_simdBestPairAlignmentFloat = 2;
-
-template<int align>
-static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
-                                                     const std::int32_t offset[],
-                                                     SimdFloat*         v0,
-                                                     SimdFloat*         v1,
-                                                     SimdFloat*         v2)
-{
-    SimdFInt32 simdoffset;
-
-    assert(std::size_t(offset) % 64 == 0);
-
-    simdoffset = simdLoad(offset, SimdFInt32Tag());
-
-    // All instructions might be latency ~4 on MIC, so we use shifts where we
-    // only need a single instruction (since the shift parameter is an immediate),
-    // but multiplication otherwise.
-    if (align == 4)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
-    }
-    else if (align == 8)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
-    }
-    else
-    {
-        simdoffset = simdoffset * SimdFInt32(align);
-    }
-
-    v0->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base, sizeof(float));
-    v1->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 1, sizeof(float));
-    v2->simdInternal_ = _mm512_i32gather_ps(simdoffset.simdInternal_, base + 2, sizeof(float));
-}
-
-
-template<int align>
-static inline void gmx_simdcall
-                   transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
-{
-    SimdFInt32 simdoffset;
-
-    assert(std::size_t(offset) % 64 == 0);
-
-    simdoffset = simdLoad(offset, SimdFInt32Tag());
-
-    // All instructions might be latency ~4 on MIC, so we use shifts where we
-    // only need a single instruction (since the shift parameter is an immediate),
-    // but multiplication otherwise.
-    if (align == 4)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
-    }
-    else if (align == 8)
-    {
-        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
-    }
-    else
-    {
-        simdoffset = simdoffset * SimdFInt32(align);
-    }
-
-    _mm512_i32scatter_ps(base, simdoffset.simdInternal_, v0.simdInternal_, sizeof(float));
-    _mm512_i32scatter_ps(base + 1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(float));
-    _mm512_i32scatter_ps(base + 2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(float));
-}
-
-
-template<int align>
-static inline void gmx_simdcall
-                   transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
-{
-    alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];
-
-    store(rdata0, v0);
-    store(rdata1, v1);
-    store(rdata2, v2);
-
-    for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
-    {
-        base[align * offset[i] + 0] += rdata0[i];
-        base[align * offset[i] + 1] += rdata1[i];
-        base[align * offset[i] + 2] += rdata2[i];
-    }
-}
-
-template<int align>
-static inline void gmx_simdcall
-                   transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
-{
-    alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
-    alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];
-
-    store(rdata0, v0);
-    store(rdata1, v1);
-    store(rdata2, v2);
-
-    for (int i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
-    {
-        base[align * offset[i] + 0] -= rdata0[i];
-        base[align * offset[i] + 1] -= rdata1[i];
-        base[align * offset[i] + 2] -= rdata2[i];
-    }
-}
-
-static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
-                                                        SimdFloat* triplets0,
-                                                        SimdFloat* triplets1,
-                                                        SimdFloat* triplets2)
-{
-    triplets0->simdInternal_ = _mm512_castsi512_ps(
-            _mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0),
-                                    _mm512_castps_si512(scalar.simdInternal_)));
-    triplets1->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(
-            _mm512_set_epi32(10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5),
-            _mm512_castps_si512(scalar.simdInternal_)));
-    triplets2->simdInternal_ = _mm512_castsi512_ps(_mm512_permutevar_epi32(
-            _mm512_set_epi32(15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10),
-            _mm512_castps_si512(scalar.simdInternal_)));
-}
-
-
-static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
-{
-    float  f;
-    __m512 t0, t1, t2, t3;
-
-    assert(std::size_t(m) % 16 == 0);
-
-    t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
-    t0 = _mm512_mask_add_ps(t0,
-                            _mm512_int2mask(0xCCCC),
-                            v2.simdInternal_,
-                            _mm512_swizzle_ps(v2.simdInternal_, _MM_SWIZ_REG_BADC));
-    t1 = _mm512_add_ps(v1.simdInternal_, _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
-    t1 = _mm512_mask_add_ps(t1,
-                            _mm512_int2mask(0xCCCC),
-                            v3.simdInternal_,
-                            _mm512_swizzle_ps(v3.simdInternal_, _MM_SWIZ_REG_BADC));
-    t2 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
-    t2 = _mm512_mask_add_ps(t2, _mm512_int2mask(0xAAAA), t1, _mm512_swizzle_ps(t1, _MM_SWIZ_REG_CDAB));
-
-    t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_BADC));
-    t2 = _mm512_add_ps(t2, _mm512_permute4f128_ps(t2, _MM_PERM_CDAB));
-
-    t0 = _mm512_mask_extload_ps(
-            _mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
-    t0 = _mm512_add_ps(t0, t2);
-    _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t0);
-
-    t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_BADC));
-    t2 = _mm512_add_ps(t2, _mm512_swizzle_ps(t2, _MM_SWIZ_REG_CDAB));
-
-    _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t2);
-    return f;
-}
-
-static inline SimdFloat gmx_simdcall loadDualHsimd(const float* m0, const float* m1)
-{
-    assert(std::size_t(m0) % 32 == 0);
-    assert(std::size_t(m1) % 32 == 0);
-
-    return _mm512_castpd_ps(_mm512_mask_extload_pd(
-            _mm512_extload_pd(reinterpret_cast<const double*>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE),
-            _mm512_int2mask(0xF0),
-            reinterpret_cast<const double*>(m1),
-            _MM_UPCONV_PD_NONE,
-            _MM_BROADCAST_4X8,
-            _MM_HINT_NONE));
-}
-
-static inline SimdFloat gmx_simdcall loadDuplicateHsimd(const float* m)
-{
-    assert(std::size_t(m) % 32 == 0);
-
-    return _mm512_castpd_ps(_mm512_extload_pd(
-            reinterpret_cast<const double*>(m), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
-}
-
-static inline SimdFloat gmx_simdcall loadU1DualHsimd(const float* m)
-{
-    return _mm512_mask_extload_ps(_mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE),
-                                  _mm512_int2mask(0xFF00),
-                                  m + 1,
-                                  _MM_UPCONV_PS_NONE,
-                                  _MM_BROADCAST_1X16,
-                                  _MM_HINT_NONE);
-}
-
-
-static inline void gmx_simdcall storeDualHsimd(float* m0, float* m1, SimdFloat a)
-{
-    __m512 t0;
-
-    assert(std::size_t(m0) % 32 == 0);
-    assert(std::size_t(m1) % 32 == 0);
-
-    _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), a.simdInternal_);
-    _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), a.simdInternal_);
-}
-
-static inline void gmx_simdcall incrDualHsimd(float* m0, float* m1, SimdFloat a)
-{
-    assert(std::size_t(m0) % 32 == 0);
-    assert(std::size_t(m1) % 32 == 0);
-
-    __m512 x;
-
-    // Update lower half
-    x = _mm512_castpd_ps(_mm512_extload_pd(
-            reinterpret_cast<const double*>(m0), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
-    x = _mm512_add_ps(x, a.simdInternal_);
-    _mm512_mask_packstorelo_ps(m0, _mm512_int2mask(0x00FF), x);
-
-    // Update upper half
-    x = _mm512_castpd_ps(_mm512_extload_pd(
-            reinterpret_cast<const double*>(m1), _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE));
-    x = _mm512_add_ps(x, a.simdInternal_);
-    _mm512_mask_packstorelo_ps(m1, _mm512_int2mask(0xFF00), x);
-}
-
-static inline void gmx_simdcall decr3Hsimd(float* m, SimdFloat a0, SimdFloat a1, SimdFloat a2)
-{
-    assert(std::size_t(m) % 32 == 0);
-    decrHsimd(m, a0);
-    decrHsimd(m + GMX_SIMD_FLOAT_WIDTH / 2, a1);
-    decrHsimd(m + GMX_SIMD_FLOAT_WIDTH, a2);
-}
-
-
-template<int align>
-static inline void gmx_simdcall gatherLoadTransposeHsimd(const float*       base0,
-                                                         const float*       base1,
-                                                         const std::int32_t offset[],
-                                                         SimdFloat*         v0,
-                                                         SimdFloat*         v1)
-{
-    __m512i idx0, idx1, idx;
-    __m512  tmp1, tmp2;
-
-    assert(std::size_t(offset) % 32 == 0);
-    assert(std::size_t(base0) % 8 == 0);
-    assert(std::size_t(base1) % 8 == 0);
-    assert(std::size_t(align) % 2 == 0);
-
-    idx0 = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), offset);
-
-    idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
-    idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));
-
-    idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0xFF00), idx1, _MM_PERM_BABA);
-
-    tmp1 = _mm512_i32gather_ps(idx, base0, sizeof(float));
-    tmp2 = _mm512_i32gather_ps(idx, base1, sizeof(float));
-
-    v0->simdInternal_ = _mm512_mask_permute4f128_ps(tmp1, _mm512_int2mask(0xFF00), tmp2, _MM_PERM_BABA);
-    v1->simdInternal_ = _mm512_mask_permute4f128_ps(tmp2, _mm512_int2mask(0x00FF), tmp1, _MM_PERM_DCDC);
-}
-
-static inline float gmx_simdcall reduceIncr4ReturnSumHsimd(float* m, SimdFloat v0, SimdFloat v1)
-{
-    float  f;
-    __m512 t0, t1;
-
-    assert(std::size_t(m) % 32 == 0);
-
-    t0 = _mm512_add_ps(v0.simdInternal_, _mm512_swizzle_ps(v0.simdInternal_, _MM_SWIZ_REG_BADC));
-    t0 = _mm512_mask_add_ps(t0,
-                            _mm512_int2mask(0xCCCC),
-                            v1.simdInternal_,
-                            _mm512_swizzle_ps(v1.simdInternal_, _MM_SWIZ_REG_BADC));
-    t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
-    t0 = _mm512_add_ps(t0, _mm512_castpd_ps(_mm512_swizzle_pd(_mm512_castps_pd(t0), _MM_SWIZ_REG_BADC)));
-    t0 = _mm512_mask_permute4f128_ps(t0, _mm512_int2mask(0xAAAA), t0, _MM_PERM_BADC);
-    t1 = _mm512_mask_extload_ps(
-            _mm512_undefined_ps(), _mm512_int2mask(0xF), m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);
-    t1 = _mm512_add_ps(t1, t0);
-    _mm512_mask_packstorelo_ps(m, _mm512_int2mask(0xF), t1);
-
-    t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_BADC));
-    t0 = _mm512_add_ps(t0, _mm512_swizzle_ps(t0, _MM_SWIZ_REG_CDAB));
-
-    _mm512_mask_packstorelo_ps(&f, _mm512_mask2int(0x1), t0);
-    return f;
-}
-
-} // namespace gmx
-
-#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h

index 3e562079f4821af8a2d243fb05f7041f37de8262..3b78e3fbf3dbc5da4b8ed41357ad64035a7a4575 100644 (file)
--- a/src/gromacs/simd/simd.h
+++ b/src/gromacs/simd/simd.h
@@ -142,8 +142,6 @@ struct SimdDInt32Tag
  #    include "impl_x86_avx2_256/impl_x86_avx2_256.h"
  #elif GMX_SIMD_X86_AVX2_128
  #    include "impl_x86_avx2_128/impl_x86_avx2_128.h"
-#elif GMX_SIMD_X86_MIC
-#    include "impl_x86_mic/impl_x86_mic.h"
  #elif GMX_SIMD_X86_AVX_512
  #    include "impl_x86_avx_512/impl_x86_avx_512.h"
  #elif GMX_SIMD_X86_AVX_512_KNL
diff --git a/src/gromacs/simd/support.cpp b/src/gromacs/simd/support.cpp

index ba14076f060d778391afb0af6438271114fcd110..bff9f83892158715a2874a64028fcdcc8ad1678b 100644 (file)
--- a/src/gromacs/simd/support.cpp
+++ b/src/gromacs/simd/support.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2015,2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -82,7 +82,6 @@ const std::string& simdString(SimdType s)
          { SimdType::X86_Avx2_128, "AVX2_128" },
          { SimdType::X86_Avx512, "AVX_512" },
          { SimdType::X86_Avx512Knl, "AVX_512_KNL" },
-        { SimdType::X86_Mic, "X86_MIC" },
          { SimdType::Arm_Neon, "ARM_NEON" },
          { SimdType::Arm_NeonAsimd, "ARM_NEON_ASIMD" },
          { SimdType::Arm_Sve, "ARM_SVE" },
@@ -204,8 +203,6 @@ SimdType simdCompiled()
      return SimdType::X86_Avx512Knl;
  #elif GMX_SIMD_X86_AVX_512
      return SimdType::X86_Avx512;
-#elif GMX_SIMD_X86_MIC
-    return SimdType::X86_Mic;
  #elif GMX_SIMD_X86_AVX2_256
      return SimdType::X86_Avx2;
  #elif GMX_SIMD_X86_AVX2_128
diff --git a/src/gromacs/simd/support.h b/src/gromacs/simd/support.h

index 2e108126749e06f8c8efc3f78a7cef4f7980f748..2d4747bc264d731f9dbb6647a2174e45a45f8f2b 100644 (file)
--- a/src/gromacs/simd/support.h
+++ b/src/gromacs/simd/support.h
@@ -1,7 +1,8 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team.
+ * Copyright (c) 2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -67,7 +68,6 @@ enum class SimdType
      X86_Avx2_128,  //!< 128-bit AVX2, better than 256-bit for AMD Ryzen
      X86_Avx512,    //!< AVX_512
      X86_Avx512Knl, //!< AVX_512_KNL
-    X86_Mic,       //!< Knight's corner
      Arm_Neon,      //!< 32-bit ARM NEON
      Arm_NeonAsimd, //!< 64-bit ARM AArch64 Advanced SIMD
      Arm_Sve,       //!< ARM Scalable Vector Extensions
diff --git a/src/gromacs/utility/gmxomp.h b/src/gromacs/utility/gmxomp.h

index b4a6f146190d562874f939defebdf385ace24faa..5cf18de8ac60bc19c1ffd9e171b6e3cca8f605bd 100644 (file)
--- a/src/gromacs/utility/gmxomp.h
+++ b/src/gromacs/utility/gmxomp.h
@@ -2,7 +2,7 @@
   * This file is part of the GROMACS molecular simulation package.
   *
   * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -124,8 +124,6 @@ static inline void gmx_pause()
      YieldProcessor();
  #elif HAVE_XMMINTRIN_H
      _mm_pause();
-#elif defined __MIC__
-    _mm_delay_32(32);
  #else
      // No wait for unknown architecture
  #endif
author	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 1 Feb 2021 12:16:44 +0000 (12:16 +0000)
committer	Joe Jordan <ejjordan12@gmail.com>
	Mon, 1 Feb 2021 12:16:44 +0000 (12:16 +0000)
CMakeLists.txt		patch \| blob \| history
cmake/TestMIC.cpp	[deleted file]	patch \| blob \| history
cmake/gmxDetectSimd.cmake		patch \| blob \| history
cmake/gmxDetectTargetArchitecture.cmake		patch \| blob \| history
cmake/gmxManageSimd.cmake		patch \| blob \| history
docs/doxygen/lib/simd.md		patch \| blob \| history
docs/release-notes/2022/major/removed-functionality.rst		patch \| blob \| history
src/config.h.cmakein		patch \| blob \| history
src/external/thread_mpi/include/thread_mpi/atomic/gcc_x86.h		patch \| blob \| history
src/gromacs/mdlib/rbin.cpp		patch \| blob \| history
src/gromacs/nbnxm/atomdata.cpp		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_simd.h		patch \| blob \| history
src/gromacs/simd/impl_x86_mic/impl_x86_mic.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/impl_x86_mic/impl_x86_mic_definitions.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/impl_x86_mic/impl_x86_mic_general.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_double.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_float.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_double.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_float.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_double.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/impl_x86_mic/impl_x86_mic_util_float.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/simd.h		patch \| blob \| history
src/gromacs/simd/support.cpp		patch \| blob \| history
src/gromacs/simd/support.h		patch \| blob \| history
src/gromacs/utility/gmxomp.h		patch \| blob \| history